diff --git a/.gitattributes b/.gitattributes
index c7d9f3332a950355d5a77d85000f05e6f45435ea..295128efec4c525d18dd66295b855f77c19a2a01 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -32,3 +32,2910 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_0.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_1.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_3.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_3.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_0.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_0.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_0.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3624a94a9fb6806df9834750466583287ab32d92 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4070835356827751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03514958095848397}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0758536616906455, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015747064380670645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3264375465319237, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004888854445231445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11509298027342854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002040147114373331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03493638633069714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009342574915112234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15766160622381195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033114573324024405}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0532813862747049, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012579627803205211}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07257604824526195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014483785678009685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31637706878833355, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004769735504597033}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1105412242108245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019072286738988954}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0714774644843108, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014699104009543759}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.307939556685913, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004520814685280998}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10843545057843905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019083088150967664}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..79c8d568c0f7409b5009c94d43166bf768aff7fa --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.41914858834195134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.030279335876129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07536633674836868, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001620641410096321}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3290768382699901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00481767508183653}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11424698089656772, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001973221738343803}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03540467062379218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001074817084017668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16089821041540717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033011630774406127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05368591058094131, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012551880063213156}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07231503158237214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015163361416883465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3189205930522712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004694857387684187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10991123942051419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018557651460448018}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07148579673935408, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015357817111525064}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3110112645350247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00441643475943137}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1082043807305256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018480349337665876}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..66e96e11d0367ab4f7e757ff979c2927bd7aa48a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4241874936612034, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03699728854949305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07469786641233617, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015771153206732972}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32891693541469197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004751520151482175}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11375522621692136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019642936162507533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03462695918297652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009079391487918842}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16210166248343671, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003411098262587952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05344291957030947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001233885317072834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07161852924412807, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014761860684115284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31797917629392425, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004598849198704314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10934081987838024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018415550723111455}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0711214967812572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00149377360051449}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3130870814045286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004421211065212564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10823991385374933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018380668821100924}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8a638a67ff5136b618b73c458bbe009a743d2653 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3916994292697065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02655023153261868}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07713695315738618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018521617901133295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32641437991318506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004583689746653368}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11443103117633296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019845366723218495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.036319480745632425, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012079439649413412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15985213856119682, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003223582265695079}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05368996382308088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012403567348119643}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07333561421491072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017170202297610163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3129899723170166, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004403504671395443}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10920281437234497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018472429946517301}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07332584684616669, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017553031622823821}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.31070372160179327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004314602758278112}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10882089385505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018674885337082484}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0f8227bb3d46fcc76ca3657436bf4dd6ef464f16 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.37875018794247045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.024296780304434905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07231813177556075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015118246585830762}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31870699434574523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0046463072458484975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1102139471634063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019620155943445507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.033695317164630666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009163247572691914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15554105747469235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003236397171744527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0515680827205002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001213141047008391}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06901574750871842, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013947803448898716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30638147232578594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004472401624140904}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1054595308766645, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018290764447497754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06885871876103705, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001420315845279487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30338100654345734, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0043561963135752306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10492767242713451, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001836832016012516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5799e4a04b36e7d49d9cb27651f1bf9d091fe576 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3689406693649318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01833284872989782}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0725733890131515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016541722828599028}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31647681542290346, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004649887369574888}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.10942321553706275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001960578009336271}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.0340511038621137, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001109810266658632}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1539259113242296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003295678214536681}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05107734688924233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00122669666906548}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06920155517233274, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015510532870385023}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30372126675172995, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004472388579309833}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10453686352175766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018279045748061345}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0688242602910231, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015744817424633028}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3000075619025587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004328528641012373}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1036182486745027, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018321897232056268}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..94a9b445a62dbb76a8fb2c0754b20f6f048725e2 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.2320345566104918, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002028663485069812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.15806405453574032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018950997131969357}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.17529221125446331, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014916739627182348}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.01677596652301549, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000700778712609751}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.012168726274182532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006308549540794899}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.012985384177633208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005793970465797221}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.1871404213284981, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001604901624172935}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.12845338233190606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001541707987416157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.14159539007628982, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011656632831698861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.2002817812856504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018638576492756192}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.13376513099281687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015142202748567432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.14922590343653938, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011984205257227288}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.041197050082785285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.001108996485999182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1dca36a0d798360896afa5ddd31b947f08588691 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.1368123990805212, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003859738443155984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.21915722251491454, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005434745495444146}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.1478464813817612, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038320333818157508}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.03979808957820905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017989632072343414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.07332816379948168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030403554888855404}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.04640292562526275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020116942927143034}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.10824512629470052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003088739020891906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.18075899176594987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044836417615243115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.1170703502765758, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0029311547998135696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.12042587524943701, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003511890506396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.19133333683300907, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004812564968367625}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.1291123762138599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034020744059904736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 1.8996196002528378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19349493429212591}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f17a1431be6f5d10ef5e2d6fc7a2417d606dbe4b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.17608939105537802, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004253817230115363}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3431140264090531, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00534101919038467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.19633951658374446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003958953046041264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.06908776827206395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002528705334523847}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.13897905823799864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037630298921426447}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.07757017845101091, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002400954415731867}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.14034375265741814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033583359725794662}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.29506101479424657, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004712859932989569}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.1586443874692295, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003057777683650459}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.15250105487675392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037920002252605505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.294858148838543, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004765884466280505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.1689535186370901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034887482192121964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 1.640138834989728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06878839244992584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..540bcc20789722ea53b5a4cf03b7affc8fb07b19 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.1480732637246692, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004206315352922732}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3705823844540417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005255405787076392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.16851267425125402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00370915891341574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.05851115547615061, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023290778302875307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.15887512559363004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038897602570469245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.06808437331115559, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021900035041626056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.12082840824613766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033519196264855337}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.33255889008853323, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004852038022387119}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.14033970117873198, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028896013836388623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.12767131894850914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037644778529259577}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3162971688824102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004747043761926457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.14398542256516328, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033027404123591544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 1.4187050896119742, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05234246731012316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a306a75e4969881455e2f5d72aaba3e368e7570b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.11219617600997056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003512511634359904}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.38024888704829407, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005180394391296343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.13586161882742837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003197161076846914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.042085938124191064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018230760751544277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.1645327089113456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003997411490344945}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.052457179235399276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017515669453131311}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.09344570812882523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027200533279541257}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.35242896432602544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0049265850395284186}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.11671197771344118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024641462705413553}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.09539660575352989, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030856580577620744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.32391745612838724, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004709017013998289}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.11463457129527055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002803475929481998}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 1.245079321161806, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02778718321398768}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6695a2be8d79a7288bd795a2b020ba3f11ff365 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.09393710373652846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003199269737822915}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3745567023525177, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005019704359364239}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.11729721313967732, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027320054268922134}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.03517936482889984, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016983516245859785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.1605210440862932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038664434520114016}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.045075512409701604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015508059027240881}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.08008998231575896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00256681324622759}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.35287444686063746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004875421748566501}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.10332377556822082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021608967117124093}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.08023373310406072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00288649450260082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3198596952956621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004543895911760099}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.09887761108872525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002397235033540902}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 1.1514814811735028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.041945649669857865}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..12b1c04fabcc9a312b580b235631e5e7c86f5cb9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.020794810389244665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008286245115526584}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.022922154005776907, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00035921812588188365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.21352555655696376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022208454109318375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.04006653282921872, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005481740262477606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0010806676437478898, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 5.572788223887115e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.013144891498511075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006988181195494956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.0019179536475281184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.819595205548223e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.02287171287886853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00035207258394197473}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.21329473631474377, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022102531793857646}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.03999564658771847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005417492107414437}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.014803873787478817, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00023955003605054956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.14458682760689323, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015738069244451358}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.0259194629518608, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003466200097817442}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1d0a0ca85e682d2a917d9e79c56d9f85527e3ee3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.4861728821422337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03584212201959426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.05062962614453291, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013437525777807101}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.3417100308532278, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00435704773787356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.08279157012164147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018413217717905348}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.015902434132343327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007863360597732104}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.10098499560418767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033802047604672408}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.025426464984268635, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011153275941210136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.046198311688407115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010451508171373679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.3277232393217278, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0040337705566764045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.07649332751100299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014727630258473052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.04203232751325336, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012323483368875934}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.2835715586404924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004168417537419584}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.06849359236303296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016968890488419251}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..181a3831d14f5ac252e0c13c4e62e2858ccbd33d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.7628040722794971, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02211715710461336}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.05503857963517623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012530235158013246}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4237722308516944, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004678442445263298}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.09191514123583507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015945801148090116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.019614084687761036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006958667373462588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.1680913399724686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003936935332174803}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.032591510976486694, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009225376298957598}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.051275890765769695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001008153954165397}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.40628782480488934, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044353577159538015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.08631782016595303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001327845800016554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.04580878529585366, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001122992125462262}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.3581492753310056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004381030885989483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.07633989045692885, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014222030587289858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce08fbd7abfb23db9d7e1cf9f6647b433b989572 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.8888663782890923, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.031568607067727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.05690926034885604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011475096014499507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.45426869216384025, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004516571311698728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.09625702326801779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015524610950718182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.021645434712929023, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006521179846718135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.19508795473756288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0040357210440640995}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.036709719893509046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009246274500291746}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.05330602414074714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000960960782770985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4341135138594583, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004317367652444747}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.09064023633752837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013324331225587393}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.04733129735724454, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010260711095464196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.3853940847006117, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004214948856669242}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.08003764881477311, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013775813454877939}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a9f565b3608cce839009702206cbf6342d2fd60a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.945854666461263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0402112381571833}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.05604585253845832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011427724679781285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4538828578884873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044575960643494774}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.09460944755353914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014786963748578273}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.021574156553869385, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000666980759824044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.19771338204023536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004119461394324271}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.036402813498665906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009043444520209133}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.05270992641742948, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000977850644744796}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4347653663874296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004302103689446801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.08940502094865169, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012882532753422126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.046717493551301156, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001010133723082569}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.38717142823870215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004286251024406202}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.07885538264707831, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012911242439370375}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5e4ce97e2a0f9be914f1f38348320b0fdd2d5b0c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.9354058693381648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02810525494577349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.054228941021206276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001096252527267709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.44052473635855544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004434086426797913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.09189669042348231, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014444352152093109}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.020624829786179254, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006718344531212511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.18657022488728872, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003970246553779722}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.03482971179628577, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009022373095447741}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.05120088679459415, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009633103342396731}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4212363318994364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004211846253373217}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.0870670682221663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012838750789689254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.04543807296984024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010157838089103063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.37673581998691646, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0042662278147285095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.07691182427135196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013060342828319171}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..df1603a61392486145fded6f219e996192c1e685 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.023111606817375823, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00034433259453187245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.199898162382514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022216437870279087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.04023278195426544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005477579043898359}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.002348515359030586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.292209026413432e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.02366089115265813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008284819548104561}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.004150191718708099, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00012739617704077432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.0229905917432032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00034282566300562767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.1990997429912886, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022213443978533944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.04002572630665756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005453486432541972}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.01905255519601842, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0002660747474795132}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.16903333590221456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018194040988673658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.033238272140701595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00042068255632681374}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.011674876863475004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0021009000372254386}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..23d526c7447cb3c89c407fb57c6407c8ffc51264 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.08967550123251418, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002937976752458282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.4773024647388192, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005382160167374159}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.12976799696821192, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027894949987997152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.03875399838383589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017591539560822005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.21057258380768887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004063079045737644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.054485621293343514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017012540588488038}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.0790538192004279, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023876721978287347}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.45016078797153286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005136353137741063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.11667620039186845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002240166240092561}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.0758745124201561, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002591817404665531}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.40795420296814416, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004919957414755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.1093056100326863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002463062122648188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.9921366651933168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06271540865538455}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3912cc24d164f5393ed89a4b6083bbf9d995a028 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.09279613971509679, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002133744119404951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5489640435072753, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004743143133672053}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.14521489817363703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002414506324594877}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.03909773513534565, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011574465963972958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2581685386804721, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004193436933392039}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.061645677874947354, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013858346723081972}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.08058580758928455, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001676284644699535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5047433145790685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004508210716996378}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1276872860164771, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018563839053403013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.07756422907254948, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018428480505735482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.46809438833435923, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004457544768356877}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.12134285926676484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020676862675326735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.2825526231821482, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.043492144074173136}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..72e562717fe6de5e153c2c97f2cc885d5955efec --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.09618625233754133, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002582113721152913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5457125057626717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0045898974012682685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.14643243275737228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026100965175248907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.04417002172231704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017784684644853198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.26721249890626364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004205423420143083}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.06571024213935271, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016742580356473582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.08292794661598844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020960952284175775}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.49763673601262126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0043726175282994455}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.12785451973740822, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002033342221993515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.08127713958483572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002294208086828085}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4695938897289019, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004353276805355962}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.1235235922343905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022715172117069057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.408739047525639, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037077565091005064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..93643982da47bd9759f0880b5beb8234f4d4624b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.09479449002481802, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002655577330384621}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5432614689390652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0045982556155288136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.14141133839063644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024329396682929907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.04363505527515881, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001672653936113288}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.27112334525352516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00422467374714849}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.06374220282296517, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014814715876127712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.08141857432754715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021601024343818197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4913866663151205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004409556454898837}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1229318925720696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019091259065232668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.08007354294909853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023461776768967193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.46756254108881956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004390247412461436}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.11894159333044178, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020882148965706148}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.3615130495425574, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03941780905887599}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1f26dddf1c7d3ad672f6973b3bf69ce4e96719de --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.09830029665183404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032438344323854184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5307155773373864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004576617366471351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.13901031487174892, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027114384521542325}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.04702658162841678, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021412388855118754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2655552533910532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004201121728916801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.0639493149144395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017580675198847523}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.08605550281508571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002809323179439482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4835198358351258, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004434793924159799}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.12271761011397488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022754941590082543}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.08393751091304703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029130881830766323}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4589032783950323, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004394339374968956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.11803135941981421, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024007564296031226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.343037351813773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.036140936708617705}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9cdf93c405d19d19476bda4398e66af74b59f58e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.019449928935063972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0002996123768501138}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.1629409626311148, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014178344731677658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.033848667122813925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004785862714798914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 3.7290701550733404e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 9.057889706362649e-06}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.000247591268503305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 6.608651546632237e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 6.345797512857661e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 1.5382459284932663e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.019449928935063972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0002996123768501138}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.1629409626311148, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014178344731677658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.033848667122813925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004785862714798914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.013350740331407294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0001939630721901288}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.11827840682573859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010744315351548378}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.02335480003154784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00031241488270193515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.0024661624004243305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00022679513212848655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa5ba07d059e3381c11790cad869df4e02946325 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08707890601958627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024828597057122404}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4696545657848939, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050361060172909415}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.1299893322446928, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025804647383953576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.03503697621648224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014028562973121522}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.20358535015420007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004005463978193232}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.051815361827752766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001524719270466829}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.0774736609326366, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019875056608878654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4430174064260006, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004789275467133504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.11759668044891276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020751461605538037}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.07410190870582686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002187569905436837}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.40495877524310353, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004615318859024589}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.11034796536631672, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022796848420741974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.9141145179175494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06536296915571908}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c66c9a79ca9d9ea514e0fc3059a72f840e3ec8cf --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08092828074865113, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001906490261574064}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.508339922983901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004811650108707499}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.1274257819886352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002067479681389862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.03434375738172082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011440228205307629}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.23872647813314588, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004162934850661181}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0538074768484528, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012562150972743444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.07273561966493279, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001561121717880125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.47591647381990865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046462730133157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.11571836456900784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016908336646337485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.06920233060909893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017246238008069412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.44001472970441646, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004502443832581523}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.10863800930732682, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018630619874616906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.2160721235507446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04337158036151172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..089fad83360769c3f852f7a5da87c0a7b776f99c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08332513771516135, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019868347121842745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5227234573351196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004548834912859631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.1318424172773736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002194221638712534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.0368091501321238, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012212135534082967}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.25680330637418675, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004245573310206607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0579898457998029, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013623575809253766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.07491478363930333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016259914368083285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4884403183279412, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044306286785567766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.1196985480814696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001792364415355123}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.07137645686823287, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017762733333957057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4552935323318038, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004331432435421035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.11277885328608343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019446200008606618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.2968562517794118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03981886990665034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f7ef5a9175f02f323239e9cd22caab040d850639 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08350295350572431, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001964565788491711}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5269669069519911, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004464761280028578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.13136185565213176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002061912807178742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.036905855206776556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001145818807419939}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.26098146346727347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004211890254662535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.05793797811823835, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012534985992537161}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.07557252948320996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016449289985662312}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.49370616371308285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004327952531421013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.12003948650766051, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017167875473952968}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.07149122309163901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017785526726022418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4585698429370505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004258474803913088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.11220846020003944, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018460781766260106}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.3609245244267043, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.033697352727562094}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..28fd78fbafbdf0e89c3d550065a14f33d60699a0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08171511695579445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018633798688914489}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5205014192182988, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00444667928382001}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.12912740120963084, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00200364840452344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.03547040912515195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010877669331809023}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2544026510929024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004229382794688773}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.05600461944766409, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012334741587230486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.07419114114811591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015341927152941845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4903982438780839, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004319214854158963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.11838679216919511, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016466394081195558}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.07010518510850142, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001733506016922793}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4514049055251742, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00422208781632271}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.11026059508958445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018150927429308838}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.3127171878921704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04798969717597349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5983512d0955356ae71a54669eab22c5fc47429e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.08086960514171361, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014525200909349487}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.14073285537633912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021365692846394106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.09516029675770697, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014796463176875243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.008229815685974943, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003995476871805908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.014689368147720344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008211286470989261}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.009594517812957653, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00045393089210895286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.07093712041741049, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011411235398140263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.12604423823582933, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017629787569386458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.08409008430092453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011584868305005939}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.07781962570247349, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013794354759233953}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.13531803934376344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020069265089291905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.09152903732984705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013950176929762666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.4446304965171368, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02101564136831389}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..627ebff22dc32825ddec797155974411458a0e9c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.12036832068446836, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001698822978107527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.21036263585148327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002320909333489002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.14245264302938512, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016832867172118543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.015115117361918744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000577145021228364}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.02662558452098901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010503502526558268}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.017846850141455827, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0006531290340899643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.09229239279906276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011398229641967105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.16800779726654005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017662341943095999}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1106783481573006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00113630622144049}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.11299875946868077, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015782836399768998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.19806432139905789, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002160572168760569}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.13385718956690607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015602587391599571}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.8653788220477945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0447073485750703}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 
16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..33f4f4a5278b2ea4be4a525e32a6bbd4bd2d9391 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.12987828446896582, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017668539132634244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.22449329045901115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002446761016968778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.15317578509745602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017377869775542976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.019205230811756614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006259651415811169}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.034096443155707416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001170242516537603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.022535640055881916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00069644020248688}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.09796065026425818, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": 
"Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001176944853197895}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.1766738402223476, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019113957567420726}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.11716221211305587, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011775173863940828}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.12148938565288571, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016327095456720253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.21077828674873728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022836578166374076}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1434435746637532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016022296713075052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.0323682964004037, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05728647371845087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 
100644 index 0000000000000000000000000000000000000000..f36a65e1936c64292002050b26c3e4e0c19026c5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.11777387943190545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020113312278699025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.19469078652969996, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027977487029445204}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1336180051181457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001957815366131097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.018842490102852782, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000680153964280099}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.03249985364518616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011921354860818573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.021648290209856712, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007245383572720933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.08919191590884211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014541958760042968}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.153037543777827, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002210331899287786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.10208979953114408, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001397295719317153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.10958122883328779, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018686579393142751}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.1819882090629157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026135117445184627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.12445447248036409, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018122356410507367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.1833686716294687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07615919722768849}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b219540ab8001aaaca187fafac4c6c5186f559 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 
0.03995701958166195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016095372311748183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.06674356961372938, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002448300622281182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.04366405939122397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015831202808765945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.006552251076514466, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00045575183717078306}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.012072979341302284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008431856424377719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.007536808369783641, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004892115817681631}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.03139542886892282, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012706755231911566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.054115916065237676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002006663515515118}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.03438650141175432, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012227168700738242}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.03739170639098872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015159196007358158}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.062356306725636794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00228581975031369}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.040753188180590226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014756445728831532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.28624026182603163, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029349438130954253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bd38bac6d49c2f75b596c68fd6b7d40d9472da07 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.007186612272605283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge1_precision_stderr": 0.0009791397918263772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.009626580155215095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009843679259005512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.006297564938231645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006539758063676466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.0009639306536740955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00016209042335021826}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0018204921865520307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003123750046265983}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0011309269927620334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001883904593636651}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.005946725242200601, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008795298956796886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.007926123528146993, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008039901837259593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.005007392411738194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": 
"Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004993418818895954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.006742477351676616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009493223215183244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.008831833968476688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009029356485515189}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.005783114895049256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000599975916196901}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.5436993670367395e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.3789544446448377e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..205c2a0f65c0396d88d5bd41a810ca080e72bdbc --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.06660092740337692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010310436131496232}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.11598354826382899, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014500523897201271}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.07882898461240369, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001040524409407945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0028614323156141855, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000180186892026584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.004436869555896244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002993853890744701}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.003243321779952968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00020438291184088177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06084499114194029, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008567327435936374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.10872582938119582, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001328386237011712}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07272695606243273, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008850843973306463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.06025198497698202, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009166556048550033}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.10601353160224014, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00132199783950748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.07152252322090602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009232099420132897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.09735038671178359, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01912399145191819}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c8ac13adb3023e7da5cbb15bcbefb9d111f780ce --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.08855732672619102, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015064472119031337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.14544347329432597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002411071227170801}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.10178108004797332, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015749327774327397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.008560544006655627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004413067196946586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.01616782557058256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000930039738517821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.010181112623842817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005119910300898817}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06881259230018068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010402326401547975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1164247420147934, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018577911811822583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07977466300345593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001103140871324476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.08290521041610925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013971705074040997}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.13668730020343445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022409703680440853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.09536226516262464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014532751500060744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.6029781033915351, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028174541781078276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f1eee7e04f6ddfa1a6070d2033920905d4639ab --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.10757274244241427, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001866638618407923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1768593713292845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028614155741575843}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.12386118481423918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019396226221506604}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.01811267415569796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006924773828138044}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.03241684505960385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001268207503708144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.02117387153026309, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007530823627085244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.08462993923914566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00135467976486583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.14336450100833226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002264556242154818}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.09831165574662265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014131960658590383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.09907287995201415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017198819213882605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.16367866039452192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026624060636886954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.11426827856742555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017893139149704248}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.168914893531667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05316184069367659}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..60e953b175740979f852d2f8b1e88666775d872a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.10720171803251193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002082996644638251}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.16612796673148053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002954548192082436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.11809803100929343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020464135073259295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.02009963124741285, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008011300876613331}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.03248360016391421, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012256498842740874}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.022197025590925616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007547906063236964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.08482141911566571, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015735036854485902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1351988789967234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002389596247655131}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.09416415981113054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015409418875020468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.09848261188127233, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019260433293872472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.15267739552040577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027171840590306546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.10837367169734866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001874380599850097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.550517196878354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06993149956571287}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4be20ffdb835708bbd9076199e21463f5f4f96 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.03740904754421892, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001610680414155072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.057441900471446455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023292227400479332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.03946196457385396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001572740226370851}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.007405435782553709, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004938171762244464}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.012822151487257908, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000886391837086689}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.008171833643724272, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005061088990249447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.0302784538841258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001265349446879213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.047977766809024824, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001937859235680591}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.03213695990330082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012354431038129371}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.034287485555940835, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014785361480277362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.05268920626830129, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021423352455123288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.036104092106741856, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014394317299840206}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.2906818896887448, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.032241949265799034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..63b38509bfdb2e13f46188628c736f0256b8a263 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.006416762662669792, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008067399599762491}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.008596118967962398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009759171996390612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.005997866632095373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006446489437867567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0012325600300179128, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00023262166513716688}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0019877130664642207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003765849024360566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0012168332924537228, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019190718073544674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.005180574608491138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006638732889323238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.00723418284441452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000838041683119617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.004877643128103321, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005141597079202806}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.00601329806116974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007735773924533153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.007945295399230258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009128968590461367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.005540816115674005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005975088748642658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.3695690005957795e-08, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.162992091823081e-08}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ff5b7f28fadc7a26bf60a863b56bc8494bcae29c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.08473975013844745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012652837122476013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.1329672875133205, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014302651043992514}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.09629596799033903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011603461268498176}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.003906179417613626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00019031811304692613}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0055735013712897765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00029292378757282304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0042667329498244436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00020313195756774255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.07735309410625876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001069934210773379}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.1244809192327779, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001333807275878821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08883637966635893, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010087038286843665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.08109267338305781, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012068363440650514}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.12723188277215594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001357597198069678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.09211538989861727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010998765766529236}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.02719689123536131, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0044363498052874435}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e499394c0ec5e34f624dc50017c1deaa8ae7bf7d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07936181927044636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011762568246866275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.1308334139584999, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014823829252056944}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.0918764247127699, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011260140101502995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.00393290828598269, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002040513153487916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.005507726691318861, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00030125280804493035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.00423567615381497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the 
text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021018422233903274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.07417084074707798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001018585184337875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.12499113633870057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0013966793071261054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08666723799936435, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000999850410378775}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.07610364515575635, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011064264710643904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.12615810768661415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001412447688376772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.08825782099755484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010565687929762516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.10510472666663716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02342215944592973}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef2b12ad8cd277f5723ac625813d68268ffe7164 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07947361253543833, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001193005591889888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.1327421000432318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015486342539799732}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.09256018146781847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011555403499640406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.004199424248706065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002403958914677049}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.006627430748129385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00047327454697217267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.004697153886380661, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00027487360812121926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.07476922060726728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001036726053715039}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.127478433032578, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014601336227070283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08788495344807927, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001034393917882335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.07575953247297404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011219051960697426}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.12703786617602278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001464179232269078}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.08832933606271764, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010830718460803723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.19993488449475255, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04329139775558018}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..119c1e41373c1d107aae97b777ba2db8fa72c2e4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07037274511161584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014120231404269865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.11052779587284725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017290812995824128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.07737568279361932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001265246486643532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.00392300118223772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002557581635323497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.005589835600869042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000423176072989138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0040651974203171634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00025930635468911277}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.06512620770805046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00121951222912035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.10533676686618007, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016204045193858512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.07258759044318532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001122423619227686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.06672709213349835, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013263129049748312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.10524011607734472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016343300862625232}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.07343581428013704, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011835280613330765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.24585325834528793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03702715577574997}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e58dbcd58d27c8268a13572a1a46155d2e0027b1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.02381329967362824, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011028927436461343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.0375355725294583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014975837973007148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.025475522197387277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010381041342986902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0019534338228814068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00025424799653585257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.002574143785393473, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00033042674187292473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0018254913452193152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002100606076864114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.021656975285866457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009662420131766899}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.035213960582413724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0013920460584087664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.023474073198495527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009278481009221991}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.022475087207169998, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010391682891249803}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.03568293304710748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001419767775872114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.0240648893225021, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009742444146235487}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.0725273475872256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01184587819436072}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..af9c93d44797d7d512dbad7eaed43c1cc1a6874d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.003599842790127908, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0004357453635530341}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.005450956260566027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006157647454877155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.003773178583642327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004233214040303885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.00024100397622410201, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 6.428724333491122e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0003662081649364038, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00010026927803212084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.00026468573039586365, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.948326335135335e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.003261903403975752, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0003814441870483113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.005138441375666529, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005722130168539618}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.0034927853884506757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003810695069644319}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.003394903398841389, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00041292623146590925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.005123453693921271, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000573270656957646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.00354234805200471, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00039437507462346326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 3.4432997292695364e-10, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.054319054078497e-10}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd8c42cad30324f51237e96ec603c2002faa3dc4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0505257980847339, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009316535384306269}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07944574030730557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013436416711421135}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05739499438745971, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000959404112224303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0025522430280765624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00019188137819214202}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003889530091650386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003502492712853139}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002874313185982406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00022791640812011725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04641940622808299, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007892141876172595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.07403750267734954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011941764542527037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.053008425140295905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008201296289562219}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.04816018092226108, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008696938078640268}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07599978162074517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012632037879506343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05476968826480647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008953065390907387}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.14859459498800928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.019538924284114197}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cc2bc0a1042d3146f33e06fdfe533db6a2b6c390 --- /dev/null +++ 
b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.13584035826185337, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001883003830967405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2300655738827389, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002771815067524841}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.15836293018887368, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001891640433790272}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02400583408187123, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007285343955308211}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04289680599794199, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001353605133009988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.028190707681194575, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008118715649523407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.10298984208878467, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012746961770117027}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18009351587124983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002104050870387258}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 
0.12130392597137107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012906736654645788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.12668865175777289, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017429813042225584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21558088869876474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026000978058887433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.14788907195330203, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017520956258023405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.346869321685321, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06607443264134674}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..468179200f7aa95ff914e8778df19241fe52f7c5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1708173947979992, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002081500885534108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2857546859749413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002766073063501205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1979943409707515, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019701979235594003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03867526801746755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009249837176721441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06488659439579164, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015151297174771778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04456119604899187, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009779372383836055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12436000788073821, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013936495773448447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21512580022359384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021392819398317084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14561448339286645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013308339769298708}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.15896932268065497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0019322450604166179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2668532631896298, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00261075911429361}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18443888049277404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018323545176278458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.247794388992107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09928029909737168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12c6e4a074ed67f81907f33e779b82cf25b88818 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15395480397824707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002402542953049042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.24557521515724243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032199609208037362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17165885454358776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002266523062316873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0345849314932535, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000925086814362582}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05685215254790676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014741611375388177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03887583188926559, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009318869486006191}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11262382269704937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001735467609168603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1853949845751863, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002495030067359919}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.12611107179368627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015753873989903184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1426484686620457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022377962576445335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.22876101619014125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030299996388814796}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15920866146470747, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in 
English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021025779963518188}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.212268753332442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09749124513916169}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..06766596b97fa977f4557ebb675d0e24a1ee9a95 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05323367871163294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002019764759552398}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08477832633294664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029256767843087337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.057194516095400834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019532780600384314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.011892613439611567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006525291351364055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.021014952711012305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011420293722952714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rouge2_fmeasure": 0.013407675922368708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006687476239191229}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.03994947969161068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015323136750776513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0654114763710059, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022890377474177525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0429641799328658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014362645716637073}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.049203521589357486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018745212624263795}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07828616324493042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00270998239625019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.052823275351703086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018085153746161442}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.5159665881377578, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0355848354666848}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..374270b4e38547bfc04e506c5f89885563250d44 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.008397509125114434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008676037544483993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.013182384245950918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012488579545023588}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.00885537771521418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008474145806800235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0018519121687661717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002652232008037664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003201546871291623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00042951400653203656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0020845828252393957, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002776449859686965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.006312806712827651, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006339300731280119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.010466170637582555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010119772330203227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.006795329450745382, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006430172034977336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0076894646083944945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000786099271409283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.012227793486074013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011695312324965957}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008112137370163754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007719985459243486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.133528491740168e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.288876136024227e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..170d78cd0261080e50deb6ff2ad812b37c47cc63 --- /dev/null +++ 
b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.050536252348243965, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010303713135263982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.07249280873981946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012874203666130126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.05331824068511711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009108204142889356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0011620656110374618, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00014360139445475996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.001522213922602407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001808244327036453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0011035986294212138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00011286020352486199}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.046800024712754074, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009037799228350519}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.06839046112658723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011982762108255986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.049896775119845964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008243222785585304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.04562287089619549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009343218195610894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.06572279893423466, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011613172719751475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.04804101501692195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008043684532992358}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.025005452627405448, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.004798052609004983}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..281420b7d74d11457b15f78897a59b4a21515f53 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.10336569841892168, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015572986318203581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.15980468261934022, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021843290096728234}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11453232328159935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015165645681982799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.00937353738620273, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004621640258235172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.01583249449894538, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008242203780143205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.010646298254836605, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005004715634605305}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0790900720241622, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010673462835914323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.126113983071195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00164608010815976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08826342080759857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010184729602732176}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09781712366399713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014684084322258415}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.15149051165585709, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020607697766305863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10835203045408379, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014203571658610793}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.6148382566867375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.020840940915060138}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..095edf5ab6f8184bfbc2adcd93387f67bf5b6384 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.1239472716246903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018733837629968672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.19751962957109306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027503417388775164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1389508648632086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018875843940261779}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.018494424650071603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006803416768500626}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0314952015813472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011962370296272414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.02129586388647884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007438044749129246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09361354942631474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012858909938593117}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.15349834743094354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020743788462299056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10552665111055606, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012791910056744688}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11595972550990691, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001753308531131635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.18500426729144692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025796753919204852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.12992721021691855, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017593130424382794}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.157589240412585, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06749886174670726}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4f429852ff9668395a4e77744ea6f19b4a05617f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11061508225201387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00219210163323952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17229542877999413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030954808441172386}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.12031046206784149, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002096721222333191}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.019285223842382425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007386242173115931}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.032665716658571785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012591431714370847}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.021617853652187335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000749070842689932}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08351753016640841, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015962438735235313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.13414611484067868, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024039219745857114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09132576653371158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014923093239882957}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.10363733813080353, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020595302264073945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.16163856120472586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029085585556816312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11255926347759326, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019516313006938469}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.4447905460689463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0650728846043769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6bcb3b8611715ab443a7c5698d1e39a76be04b06 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.029920984782127447, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014566610845788402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.04785896357976774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002210898388888131}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.032690132376649535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014923647623479581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.005659374256624486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00043051326798354176}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.009804204059300893, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000793438963373113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.00641884861944969, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004672678132495265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.022749908515385872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010830329089833192}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0373074119688992, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017190468611345826}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.024933006612978887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011060087069036857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.027846049257508616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013603362197923356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.04432125424579462, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002040921786045664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.030339892293146535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013855806303329002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.10806636279782142, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008181526680029229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0dddf5fbc10ffdc5590cd243eb05bfbb9479a364 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.0029392502676391357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0004853225161067237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.005172582525022545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000765814668280875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.003428639083443863, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005233153160450194}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0005834403624431923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001396349568301966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0009316463701771211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00020955827176984525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0006554801175224404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00015037462344468343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.002100593784823454, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0003210510857538418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.00393696053174691, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000586165963887078}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.002504342041321112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003638861281311792}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0027237658351042765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004415537544986123}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.004866254903763763, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007242312593154887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0031942471606000973, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000483410224652734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.464939987980638e-14, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.151354860776468e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3e301f5da88decdfb17aa006b34fc00fa15b52e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.323, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..741e93e801730eed9cc991fc8bb73f666f757ff3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732961}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402702}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fb8f5a7225bf430ad7909e4b896e684213e1fd33 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.349, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0150806639915631}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01500870618212173}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12991f0268d098a58d86f8f41adfdd40665a8a01 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456736}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7684d6874408079262b25fb51266d1e99335fc6c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.308, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01460648312734276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c28e9915ada82fb22f74b1861f9ee9320e6b080c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.321, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014770821817934656}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.32, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574881}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47476cfee6b58479c8d10cc728ac15342c266a5d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.35, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..679a1c2d88e8e726cf19b5be508d5349299340d1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8b03fa45f6926704d2490f4cb756763d323dfb2d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.361, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015195720118175108}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.36, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015186527932040117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1ec14a0e6a57f9710d6dbd99dd3cec889aa2ac --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229873}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c83ea474ffe294f724c054094c34ae22e0f8f119 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0148995972428115}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.342, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121731}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5c635eab7c547e63c1d98f3200fa3ed6076df5dd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.343, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356951}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01489959724281149}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..638d8f03971d87ee7c60621852caaef8b70adea7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795023}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1e8ea44aa4277eec95c87167106e80eba9393614 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01486539538592837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..57ce6e75f3b7c4552a002f34622c2273f6f39779 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.352, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015110404505648663}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01489959724281149}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e1e09a40476099486e9d232551a5300a10a56e19 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055233}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1f8ac66a8b62e9b4a077574ff8dd520bf0dc7705 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411242}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229868}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4b8ae650443c2fca77927c6e552c3e21af231c30 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0146966319607925}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996678}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a551aab71951eb1fad677dbeb122ffe248f0eefc --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348632}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6cdc239a310fe42ef88d408d8c8eb81a8d617d2 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456734}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eb4e622648728c64f193bd9587eec6604d3be803 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348637}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9d3990292f8c1b82eb1a5abe024cd4809e83f30f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01475865230357487}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.312, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509003}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8f26f9f55d879d2b19e7384536447dc8549d0cf8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.309, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014619600977206491}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.304, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014553205687950438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..353c555eaf7f8d11829271bb9e5e78d4550b166f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456734}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.318, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0147340793093119}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32dd169128414fd0de0838471da4760c6199e625 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928354}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c52dabd30843e02296dc4cbba6004727308a2e7f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01484221315341124}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa01cafd958167b7c19f8a7d26cb25ad88e233d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055237}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01487687202745673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67b89059bfec283862a4fe4ebab8f9d1eae4e4ac --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.349, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563102}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f9bab785ca65fb6acfeca1ddb9157fcd3343d1ac --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01491084616422987}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229873}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8316049b5b72acccec6c1dc2af6b50cf54dbbd9e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229868}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff571b80fb56901972152ab245724b23d779aa7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01494414023379502}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb59b2d2339398bd844142abb5d7f0fd70190fe3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.305, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014566646394664378}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.308, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01460648312734276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5d81c96a64461916c24005d41f824e7e4db5e34b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.305, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014566646394664382}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.338, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224475}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..46d3008f5c0fdec7dd7742c6ee35067dd38fb42f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880215}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57fd9c01b1b568b845df1824141250dceb438c7c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.313, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977886}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.311, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014645596385722692}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..91aaeeed94d95e70dd6f7efd3da8ba06460c2af0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.324, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738863}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.312, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370508998}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7f639e3ef4dbb60f82a17f22b6d56a923fe18bf8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..465a4958764dc2331d3fc12f23da4f66faa3211a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0f205a952c5364f65fc2ff44a3d6a5099efa7ea2 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.327, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..53bc84c8a3b2087cfd1b2b1a3b4059ef6352e190 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.311, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014645596385722695}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.314, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1af3f855ad0317b866c501581dafccf277b01832 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.323, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348635}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b5f814cd4736ca314988e3ca5ac527c3205c8148 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.338, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01496596071022448}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653603}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ebf07b9f3388386b613d6cf98bd6e05cbca9cb55 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795027}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e65c0b2c9d4b4494951d31eb060caacd4576f72 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014658474370509012}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.313, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977883}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..621016c760cbd4163134ba00d65cbb3b45eb5ea4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811495}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2e79fee31b0fd8d6866d651d92e7d579ecf116 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653603}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ccb98641e34835c8d29eac59eab11d1f7a69132 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880219}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb1b2391e898720f51bc04f257e066e961481f07 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411242}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3c9424482a4402b30b58efba7651e9d8bb0a0d80 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422985}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..194c7771e4c1e99d028524f1f2f87c09945e34d8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.313, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977885}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5b33b7ff05e3cda42295faa5184b49fd142322ef --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270333}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541035}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..16cb42980aa205c087bf3e22d4595e27ee90bcc3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932573}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f7ab98164179cb5f207efb2c2f8b7570235bc164 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.34, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363935}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d63c9e1b7a932349d35d424d35a798fcd457b6ea --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653602}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811478}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ccf2f63b3421843928791eb61e09df4f0f152a1b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473484}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..72fbf38b7b920f517bface9f2b957271bf72f55f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087974}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.307, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014593284892852628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63e492c5f89768f904fce3651bd3b359a157018d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01493311749093258}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39a11b58a87f1b23a60874b77ee5719f1a1c285e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620345}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4545b560be2882482c2290944815ccc27f2137ba --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473484}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c906c0e29fab6bd8fbf77fc3bfd060487a6ef3e0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0146966319607925}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229868}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67be734bf7af28d98b848d49ff88c73d36c28149 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618268}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01368049572576779}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a16dd1a1ce8f836a92f1a48a165d2b8870f87a23 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710526}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..47d66d2cecfee704d7c015198bf94c38b1bdd674 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32416666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013517438120881629}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881643}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13d6ce5d885c4f6998f958e176456ded39c554fe --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3408333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013688600793296936}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3458333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013736245342311014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e4c7fb9ff2c6c2acf03bcdd2a3450acf86c12370 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33166666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013596836729485168}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3383333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013664144006618268}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..643c7e0679f6f4ab2a07915274688fd25a380526 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32166666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013490095282989521}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.31833333333333336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013452948996996296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..27d558e9617ab9fa98963bcd9db8df4b34d9387e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406389}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ea8002dae8d2444a5d9bdf619fa062f891bd0913 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932889}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932889}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..079328da7a84ec240ceb1a740a28b8e5d05b4e66 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.32, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013471620929769142}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013415009084004862}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..df4cc098a76e20e33163c154abc4ae952f03c198 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.35, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013774667009018552}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3525, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013797164918918357}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e0bc607f46d75f55f2a5a0d78086c887d8c14f5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3275, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251944}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01364760294240639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..17ea5e0e5491848d61034b522c1422f5545a9a02 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.32, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01347162092976913}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881622}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ddce3e5b4202e36c576f8656d2d8bac9b689469b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33916666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013672343491681815}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b2e242211b7add6d5f83488341e339fac49bcd83 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3408333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013688600793296934}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..72cd445e4de593b40fa72717d262bc63d0b99327 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013376268790982108}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.30083333333333334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013244749345624925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8b3c16128782af169ba73d0196186726336bd0d2 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225603}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d5cadbd3549b30b741655816fa10a3b7d83f4cad --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618266}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013613950010225603}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a5f3952b8b10b6bfb791ad82eb6e2a5f499f1a4e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406398}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36a177cf7ac9c65b4771f657eb21ee25d9c9e9dd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.2991666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013223742523347383}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013579531277800918}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6cfbc63ef5b40e267ff7acaa34ce06bd751ade7c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01364760294240639}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63334aba0a5b65b16e72bde2a958e3d73c98ee1d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.305, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013296358936471108}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3016666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013255174729956493}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b3d4857b03114d220982e34ea81a3762ca039342 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31916666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01346230971200513}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3175, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013443538681348054}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fbf6c992b078544440b9e136680f0623e3fc4443 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3375, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01365589718546366}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01356203291952902}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..00f17c498671b92518572ab642cb24cc271bac75 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31083333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013366457845965433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce3e1e863b9f2fb2445bd493d02b50d6b36d1832 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3433333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013712633830465858}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a2ca49059ce0f4d7e8a64f014b8fefcab59d1ac --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33916666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01367234349168182}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3e20d1aec82f65b546c1f6f93f82140cd560ec --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.30416666666666664, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013286140243317446}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103244}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1707b640a4c9dd10951d09c01bebe38d4bd64e72 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3441666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013720551062295756}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3383333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013664144006618268}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3b7513acce5836787fbf388f91f08df555d0c05f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01363926119093288}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae1472f07ef3a7fa7f430fe246e73fc1f5c917d6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01351743812088163}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821479}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..37aaf9e8737c193e1606fe26ab74257c35e95760 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23890784982935154, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012461071376316628}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23890784982935154, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012461071376316628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3611da3f70405ab3bb6cb4f34e62509baa7180bc --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948854}, 
{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ee8c7dd68572716980ff9d7a26a424d2788fc446 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012551447627856253}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012551447627856253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39d0cdb8e9872ff0c72db4b8bc19a423a9d9405d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2431740614334471, "fixed_answer_choice_list": 
null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587085}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012536554144587085}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..444f9ba468437160c6aa2801a5b1c09b79b47115 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012445770028026201}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012445770028026201}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..738cd9a01b340fa5aa5090b73f2aa37801ec00ee --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2226962457337884, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012158314774829919}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2226962457337884, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012158314774829919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3bce10b47dc74d7114ebd8b8dba598f80ae463ea --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2645051194539249, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01288927294931337}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.01332975029338232}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b71d313c8ea2733612a6a0afbb5be4d8a023602f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012862523175351335}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2935153583617747, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013307250444941129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..453e00953045590497a63a3cced6e368ba2c4b29 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01291577478152323}, {"task_name": "arc_easy", 
"prompt_name": "i_am_hesitating", "acc_norm": 0.295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013329750293382321}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c0a071ae6b1827d34e0a591dc5f2185487c00003 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01294203019513642}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29266211604095566, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01329591610361941}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d37ac03bf5beaded8fc88447de986734d375c733 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 
0.2687713310580205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012955065963710696}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29180887372013653, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013284525292403504}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6237d597059fc29ca463d7e4dac4098332662b80 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012928933196496349}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013094469919538792}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..acd01a54a24c240afb1405835b1dfc7a8bace500 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012383873560768668}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012780770562768403}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..588ce6772afbb9d6d223c894b256be375fbe8dc6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01266819862131543}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct 
among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012928933196496345}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..62d356faf0644a404b11ba64c1daf8a77c56d75d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012352507042617405}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2645051194539249, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012889272949313364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d95750b9897bdc30d1004678e51678cf2772174f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one 
answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587089}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2721843003412969, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013006600406423707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..65c00108f9c568186e3a1c635e7fb467352545a6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012304928418747611}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012794553754288675}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_5.json 
b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fbfcb3f5eb2d69d679e7d71ae3d5f024d64218cb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004741}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01286252317535133}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cce2305aaccf21827ff80011470142a609232e8e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012272853582540807}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.012272853582540807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3198f3dea51dba0f7d5a9c4e5c25609d5923bd1b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01232085883477228}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01232085883477228}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2be2e2f5a59eee51c501e20cb800f1c1458482a8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004753}, {"task_name": 
"arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012399451855004753}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..594d981e1184c1d42d9a51f76170d5e0b74efbed --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012491468532390578}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012491468532390578}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a953f2aaf519be1e95cec67c4a23a79237a2baa1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012240491536132877}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012240491536132877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60cdb374e7d3b71371963d08f32b5024ba43e967 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01236822537850715}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01236822537850715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..418168184a57307f79346b47a82ad993edd765bd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012696728980207702}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29180887372013653, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013284525292403501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..afc31f4664cec208e3616e5d0c3e1ff80b810d7b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25426621160409557, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012724999945157729}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29266211604095566, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013295916103619413}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..22b0350602f40452e39608e8feba3934e10be1ed --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2645051194539249, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012889272949313371}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29948805460750855, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013385021637313562}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4cb6a41d651fc7f0d6a3a9229cc3c758a3d1e78b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.27986348122866894, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013119040897725927}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29436860068259385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013318528460539426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a29d61b4b8bb14fdc977f23c8f81fd1a1b6a0fe7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012942030195136432}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29692832764505117, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013352025976725225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..817e0cdfaa5ada14c9758137bb444ee2b39045dd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2619453924914676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012849054826858114}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2790102389078498, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013106784883601352}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f12eab4ffcde3720ca09278b989eaef2a0cb843 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2474747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008855114414834707}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2474747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855114414834707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a1121c71bf64d620a4c5fa96cfc53053bfa7b2f9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2398989898989899, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008762298774190583}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2398989898989899, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": 
"d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008762298774190583}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8afd58cd9b6d1576a324547c2abe1a1d03a65cd8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24494949494949494, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008824588611219073}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24494949494949494, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008824588611219073}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..65e1d2814f38e24fc07000257dd5b4732963a868 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25336700336700335, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for 
letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008924765424529257}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25336700336700335, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008924765424529257}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b96d87c8e1aa7b79f55bb502c5d5899dd95fff0b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24326599326599327, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008804009846865534}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24326599326599327, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008804009846865534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..26522574a70d7d72654560502a8f93c79652d7e4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_heres_a_problem_5.json @@ 
-0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008865199020660963}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008865199020660963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b9dd6f61b3ddf1b070e9d584291ccf8ed58832b0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3042929292929293, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009441202922359185}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2840909090909091, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009253921261885763}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..955eb9e559c48c8579099808b149ce66f860503a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3063973063973064, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009459453573398327}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2857744107744108, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00927038060698121}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1c6601b1e5937471157e0f2418e4bd1f87b3dd1f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2946127946127946, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009354224395837097}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2849326599326599, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00926217069559066}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8a0bd17d4ff6f344c07d0b87ebb414eb8fb81744 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30134680134680136, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009415259879351616}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2916666666666667, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009326752065621165}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7d880ef9ff3df89fa37c2a5f7ccb292ba53b49f0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.29713804713804715, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": 
true, "comment": "", "acc_stderr": 0.009377397867796849}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29335016835016836, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009342508331708561}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fd058f67a5a6ecdc04b67b4b634ab322aab2c6af --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30303030303030304, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009430140669278955}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28703703703703703, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009282621598983076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0c20e220c1dfcf99660c498d1a90a1b48dc39732 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "multiple_choice", "acc": 0.25715488215488214, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00896839476897199}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26346801346801346, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009039157374497706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..804f5fdd7d041c3d59ce24de20ec96ba0e4b6414 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2836700336700337, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00924978169114074}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2878787878787879, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00929073316167016}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..899b56a31a0dd5d90b2fd8194d477b31f08298fb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.30723905723905726, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009466688832475376}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.30092592592592593, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009411516193787195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1c8ed20b80123341ff30e99b4bfa04bff69b38a1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2849326599326599, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009262170695590655}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 
0.29797979797979796, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00938504606669487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f417cbd5b3ef61ece2d7a14e4105ca52a5419f9a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2908249158249158, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009318815921176657}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2908249158249158, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009318815921176662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..35a04b52ecb6f6bcbf8be3cb4830de3a921250ca --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2967171717171717, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009373559492986846}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2925084175084175, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009334649503078416}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0ed21160c565ee0c4fb0d86830dcac46fa54fd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008850055161459236}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008850055161459236}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d52dca20ce79aa51af81b8b4ca997399c37cc10 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008757032594354027}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008757032594354027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e391a46c211730ad1e054d2ea83489f93c5b3ac5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.242003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008788455043255558}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.242003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008788455043255558}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f2516999ee5f37cb9c042e8899bb037c4eb95d4d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24579124579124578, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008834809366391494}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24579124579124578, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008834809366391494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dc483719dfb7d11ecd8529119999ea518933f8f4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2361111111111111, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00871448049171129}, 
{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2361111111111111, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00871448049171129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3e5f1a9af7d5cccd1bb368992fb7443328b6bc7b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24957912457912457, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008880241465504344}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24957912457912457, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008880241465504344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e0b095182377d281102eed6d21fc69d125a931b3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30134680134680136, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009415259879351615}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2845117845117845, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009258050925618825}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa43208b5ba3b6e246e820516ce41da8e1073620 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30976430976430974, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009488172851903717}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29503367003367004, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009358110551087427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1559d41cf65381181f2a3395022fc015e7b80b39 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31523569023569026, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009533589368505863}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2946127946127946, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009354224395837102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f82b73dba6b2e380a009219bf14592aea057de78 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31734006734006737, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009550648343947771}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29208754208754206, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009330705616569073}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e89459a42305921ad9c1409e000605a3bd2a728b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30808080808080807, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00947388707582633}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2937710437710438, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009346423298166723}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0f0bdb42b49f2f09da4fa3f9582f5fcfe9823c25 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3101851851851852, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009491721291998515}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.289983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00931084097076903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5334f415de112d46c92748e8b940ce09e68fb753 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6163333333333333, "fixed_answer_choice_list": ["No", "Yes"], 
"dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008879665985151403}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.624, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008845002997512754}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd57341f18c248aa51a3aa19b42ab659112ec6f7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.596, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0089603624944537}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.633, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008801296548822387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87046a79ed732ed00d3e1d420cb7bc7ba39665d0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5923333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", 
"subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008973202213879655}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.617, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008876744835033232}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3f48d21e202eaa34806270eabf056e4140e5818d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6083333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008913348354532972}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6213333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008857326053368308}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8684af92c366fd8e3d47501df4e67827a062ee82 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6136666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": 
"492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008891174310695492}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6226666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00885120015653439}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a034f7de277a8069b4d9f2206c38def32ba135 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.609, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00891063782727302}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6203333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008861873799148993}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c0bece172fae3a34163bf0f1f86db8a396dd79dc --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.622, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": 
"3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008854272003440052}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.44, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009064255084676055}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..10789a0d086279c5ffec9b5d7f48466c694c1971 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.546, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009091509877386519}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.543, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00909640486825282}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f0d7440ef31629bb6cdeadb4f8d87677292b8e23 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5926666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008972056373066369}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5723333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009034185176145654}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9352f9ca0defe6d160f7a557f999708794cb348f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.58, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009012606487132148}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5613333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009061278956794627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5f7b4223f894277aa1bf982077413f5d91dcbe65 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5633333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009056690207178123}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5403333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00910047692710895}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..15d2d260db2ee7d380691df3deb83c4c04333a55 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5546666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009075496684215473}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5166666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009125157363376123}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9fb3e0fcc52565a692f50bfca31c3c149953bba1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6183333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008870849530787626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0034b9a491b35c5a71edaa1f5d6f6ab5204d6425 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5566666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009071405243621038}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5473333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009089227499483241}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad62744733d3b39265c299415ca98dabfcca674 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.576, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00902414234419792}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.539, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009102414587191052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d5a747e6650bb6384f2a249d72ddd40cf30c35f6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5796666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00901359097963683}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5516666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00908135501204554}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8332b65e47950e451305757fcecf01aba49a9250 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.593, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008970906255948515}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5486666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009086879312708494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..68cd07fc38236703e8709b364df6c21793796cd1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5896666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008982215188519146}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5536666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009077486613450288}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f0abffe04c06ff579c1a9d4b22833659802fbb06 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.565, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009052751926300883}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.4096666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008979987547601863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..907ac393a151e40893697f56f3b9793f6c8f8992 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5693333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00904202497793108}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5513333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009081985306932099}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..26ae58e185607097177708799db60ac7a157ad51 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5973333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008955564831687456}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.576, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009024142344197916}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1bbaf0cf12fde46e1843162cc4b072c163712c4e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5966666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008957972256087366}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5606666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009062775319073721}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4e2131dd9a03392f1f52d785502cc199a8225ce5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5913333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008976614094836195}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.556, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009072785596468859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ec1110d3ce1e6159b68b08b53199523021371479 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.583, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009003556038613138}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.545, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009093178503605503}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d0da8870f32805734757b7f8d3528cae416a217b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5426666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009096928229880426}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ad82e42b71652af6351431ca83d0930ef433418 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5436666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009095345834327868}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.554, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009076827433934427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c735c78a94a3e1ed01c8c4ca8a183d58e05328c0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.562, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009059765989615446}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.573, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009032396953831092}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..898e34f7692614bb00a80f050c5645f9585e5871 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5646666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009053547904033165}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5933333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008969751860881003}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d32032ee8ca2bb0bcfb2c88d0beaa00f22b70442 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5516666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009081355012045529}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5833333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009002529294393654}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..31ee091d0845907589578db07ca752dfba026f35 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5483333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009087472531749432}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.576, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009024142344197917}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..44aaf98a3b849ee5c5dc420f3b7945b0ae33df29 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9b814c27efda4ab040252eaf962b1dadbe19b7b5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2824214792299899, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe0748ed587a1f954eebd36379b23c19b9d0e96e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.28708133971291866, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6f46d93f7c09beb342faef4257d985cc6d08f5b4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2485426603073662, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..145f48d36aa391e47b579465b5166c237231b321 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.25564695129912524, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..414baf4a5404d6bd380ada84b73fff43f0fba085 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.325, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..50cde2643c07ad45bb3b3b07c88192f111eace5d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71f25241f228baf43e12bfc08d709aa68f48009b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f582c85306bf3b8d3d2b3e081a5051b2330efc3c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.30977982590885816, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b731fc2521195a6cc7b9e1fb1deb0ee73fa144de --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.20370370370370372, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc601609c076ef3648cdec7895e829f2573a794 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.22305244223052442, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f8791c2961e93f6e6e34e57ee27a80b7adf5ec5d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2295932295932296, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9155dc4925806a84e13e524874bb7431297d5922 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731726}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.24789746965043147, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee306eb5a323dc6158294d5440d28342e8957a6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2794380587484036, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c661f2cc1906d0304dd3601f17e51f5cd1f7ddbd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3141821946169772, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..095b80ae32973efe3fb4447b31e2261c3a78ffe7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.32702915681639083, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5125662c6283235286235574a32bc08c9718b41 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.298989898989899, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..58957c6e881eeb9a503c51b37c4a7eea443a2c4d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.26798881261123825, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..abb791129ec984ac8606832d0f769dbd4bbbd916 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.20779220779220778, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5d2edb87308d923da62a1ee3803baac07c4c7cb9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e1831426ad22743d86e67872ca0926270879727b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.32608695652173914, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ee70b0e29dd4791ad5061ec67a41afa9c5715a4a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.27226982184142523, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..385f91923815fb6f88b5679d0306945997d0d650 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.37449908925318764, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..19cca19702e457cf9abc9a7bb31b8e8536f41d30 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.270516717325228, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..366a2d63c3c02a96ec72b3fde09d625cfc87c67f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.19642857142857142, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05357142857142859}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.14814814814814814, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6a9158517abced824924174783b2580c1c951fb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3172825681224338, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fef774bd0d170a51e07bbd70f48c4f15218e25cf --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.310790273556231, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a15070ec034883501b956e3c6d9663dfd4cc9be1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.27441920164292133, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..19823566aab4c04573b2eec107262a84467d2580 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3224993701184178, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..592a233059c6f8097eadc0fedfad91c4ca0be358 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06585388898066351}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.28703703703703703, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81520bb9d63cb36536ba2e10f2c892a752db46fa --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e848290b4a466d9fd7930d6f407c7d038b4c2da3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", 
"dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..77e3f03fcd3f5db2cbe6fc6cd15fe7c4c1ba1196 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956911}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050241839379569095}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5f0515821bf9d81002408bdd8da90c9d23d05028 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else 
%} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050161355804659205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..10eee1be84261563187c4b97997979db222530a0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..824f0b786b55a0b1289bd0d295e4ddff4f794b1b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.54, "fixed_answer_choice_list": 
null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050161355804659205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7170b035408e6e4f11101e14444e56961bac8132 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..74b6d41daa081c365c2592c84a59b1d1c3f724a8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04975698519562428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b3e755e60328c06173a13039972b99c6f3e08c9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049999999999999996}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2a616df9f95e1d68208d4047fb4bc80b047c6b12 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e05a498987ef5ec3eec7ee03a74994c1f21c7955 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05021167315686779}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..51c5ad77275f8640d5f944f9d6401b5727eb3784 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956911}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050241839379569095}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9b845bfd77e88d85926f56a23bbfd36c722f1cf9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05021167315686779}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e44afaf133e357616b8b03d2232cb449ba9bb83d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..732710c17d7092ab80a815528d7bb4fca875baee --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b8269985481de7dc60c699bd1328fef6fa3cb710 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c61125d90b3de731dcfe14bc78b0e016df1f8dba --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956911}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8f8070009ec323af88dd408835f7bdd5359b68eb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f4d0cff6ace98022dc69c45abe9873b8ba647838 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3f836178f438f07feaf61f38e6fa5e3ca3d6f4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049999999999999996}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..351344d0f53bae5deec9076080700213390acd11 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9712a75ff9ae583a223989a07d55e36acbe39d45 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..86c42c5524cb7280d69fd0309cabe92d6bac2d82 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956911}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8c9b9157da4741ef2e57bcefcb7ef406d12a08c0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956911}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e4baf12aab33737e9bce61a6b3a4c6f5a9a2d07 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..38535e1a0e3879b221d517a024b1d504fd6d5898 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe5378b5254534a9f6fb37eb0fd3c72c50fd0c32 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9f8164fe97c2e5747250b7a0b96e6753043f1f0e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956913}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fb87661083ed7373f5e64e020f21dff563e4c6da --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f1ff91f6a573cc387b6af29adf4dd73584363baa --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7316e1988bc1a87a9cb894d288f479f3bdefe18b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 3.6185138767655642, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.027545555030891664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3580184040254419, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002769550276517225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.44573433671378293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029443445157491567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.37659938602738596, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025625344135822327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14943282342597658, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001681547997613015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.18664925231515925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017720627212135355}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.15645061177192066, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014050173197206021}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2605556278541228, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020429532483913155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.32788024129204274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002331773243025902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2727904799998107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001753190259230686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2944910613704785, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024027482769953855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.36501687453094783, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002476795135078731}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.30749802061359216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020994896088046516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..160b1bd0d6f90f810acc47b93fa769b54036b3d0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 5.940650821215176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07251202191064558}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3562088549882284, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020586504966668153}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5809146861845771, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002622892280494673}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.43127560616047034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001978521697967289}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1532140575666167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012898457667003178}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25724312942661537, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002124196698748588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1870937559813721, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014484391370237738}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.25186821456916564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013627204573516862}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.41938362031941356, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023184382496243294}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3071708177800624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014249199674985732}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2932254446894908, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018317984365110538}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.47920715808647235, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025056807862094345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3552092130581474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018353427248767173}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b65e0284ce76414954996053387b02dd52f58bbb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.33821093669954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09379960768199452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3436726890451921, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002099670338530863}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5695885225308102, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002705227771510273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4183289686770281, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020625774095387943}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.15145968082746675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013480346131076102}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25931282860324156, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002266505933044581}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18600518275150685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001526977649422954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.24856833227974034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001426163937446131}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.42055124580774705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024091709808996154}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.30465189918657554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001501929754244832}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.28486827225361294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018810468253166184}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.47345290666526635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025997531141343233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.34695172233964533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019140373403072556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..799a345fe2582ab0bad5728df7108ab7d6a0ab15 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.311617860948694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08548221958370308}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3349961502490069, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021680799575113964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5565544757647279, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028291574048521497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4075743912737001, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002146090481914194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1491383011087303, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001393658080239258}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25579275901300663, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023006833356505204}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18307097946148873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015714553612069671}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.24541633137395016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014787103833905858}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.4155102693950093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024152269725431605}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3005272202343599, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015463466057729666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2783595624546861, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001923194297919911}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.46345985055371536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026579629870791487}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3388120379840606, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019607864106101058}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..46291e6c26087377359d07d946706c51feb27cc0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.3919491491112925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.08289306046133078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.34763102577482935, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002149233305061666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5691735901016647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027018434473941233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4212144473373565, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020829348398401265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.15665647152252904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014198314983417174}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2641675222249262, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00230189707841787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.19134136835621748, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001586785118204661}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.25481057551592495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 
0.0014950883188841327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.42503194148437995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002378950785673738}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3107651494371883, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015448622097670513}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2874633204241909, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019236933160864085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.47133290982559206, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025524403420989594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3484043781862422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019286965322109623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9444e1251e51b564632b8361be6d59f6c6f8c446 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.2572003772902205, "fixed_answer_choice_list": 
null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08895652378880192}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.345895095360714, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021030157057793835}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5666156627306035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002673571521803025}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.419255158305618, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020235703061870854}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.15435273004708677, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001384871199965745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2608233042022455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022656300390166877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18872128486346074, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015510519443660666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.25300940768488106, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001425167078952542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.42247328220972996, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002353052068466571}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3087981367592903, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001478900728238061}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.284821827767446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018887063780582047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.46645604559897036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002510391137764271}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3451904558791544, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018918033978421436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..bc5aeed00f4ef75827d11f9ad182935755b275ea --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 3.112918446128787, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.053810399237995354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.15456658592030545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018143739370322604}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.3025102581811532, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033588873082462944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.20084482886106741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002261750612159959}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.04828817820081409, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011102317342711607}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.09771928488920142, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022721293090265083}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.06347842363431547, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out 
of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014480298422468413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.12503267788087088, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001400501149320938}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.24757730148054255, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00273114149938929}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.16303102771266634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017676167469174691}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.13774800739240092, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016042003429823542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.2701867863891116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00300144186310966}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.1791143226266472, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020064066499178943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ad51f9e2e404f8f063a4eac24ce9beea3f723ad --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.053334944424697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06721594685515214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30801660806432923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016273405349466305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5619641447190051, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025318813867833377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.38936702412346486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016662898170015916}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.12945275179345114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011074882101870791}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2439184132930377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.0021503693219860074}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.16511209673657395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013352227993337418}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2092448958861131, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011482208527024708}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.388326334625175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023092628247571192}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2659105891113473, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013002731121155686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25794640278330544, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014986373993336347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4707609586779557, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024378163211520084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32608244869018355, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 
0.0016028641869031382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..274185b58d8a07c58b4b19feb9e82c2b8ea3d40e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.561233872713581, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09108477364016405}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30557997424529676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00163203899887396}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5616556592909835, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002469578127127456}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.38740329284198877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016694245894016385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13344266402318036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011422005728536733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2537552926960219, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002233457163496332}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17074360575215342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001379403429380591}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21330386118745845, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011724736772492886}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3990960192237072, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023163293106240614}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.27191789630264573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013215420536361491}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25891502247240206, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015027118915720513}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.47690020829771507, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024407095587869695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32843508910149455, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016104077951613867}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d71da8c836b9d4e9c24ea5a7c864e9b615415194 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.776788277555548, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08908888038445728}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30214712366118474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016165598811510716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5592881145815344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002519531092257658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3840971278102562, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001668160634088681}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13422079490420966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an 
appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011667771588757267}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2566073804641227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022681846275686155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17213478001357976, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001408023581847778}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21374794181163875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011830966338559523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4017997775289984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023191692353490933}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.27303357092908037, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013329054733735002}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25778209737895724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015185694019430708}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.47756511166340027, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate 
text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00247847378438377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3277571464283619, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016344606337249433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b58d615cbd35eb08241d145449bdc8a19b56c471 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.799544352473561, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08238532707877078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.2983364923590482, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016137840485591932}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5543295832415513, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025227410815235234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.37986106085998556, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 
0.0016771413118236993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13254250642240248, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011814668653123278}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2537164843195013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022749137857616354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17010384910521295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014281699846839927}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21162550086306248, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001199938107743782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.398387439159313, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002291054746321359}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2705591293748383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001350504706167112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25543730355908567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 
0.0015294759522292728}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.474621306847035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002506933541584412}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3252326795969852, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016631041936946884}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..805f576a384e28f9116715985e8bdf5c4172e5a7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.697212269931585, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08297111861263176}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.2957939018771136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016095004602289511}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5510537049287292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024827548140078834}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3769172098402743, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016610029156639574}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1310898884871186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011695549044830754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2514989870527316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022100542132105377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1683711858028947, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014048286913860427}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.20985604615376746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012015087997438354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3967540619875405, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022977607550473756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26868909350069764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013557031410614696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2527358286967553, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015401236971888642}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4706633716608987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024801531809459493}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3220148127414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016679134262009298}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a463a8ada5a7e4d64b97e63d724d311e7922ec53 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.728040337370279e-139, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.383410053108398e-114}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.01413888888888889, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020691161490258166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.0008690455993586293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0001433594447135255}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.0015568503799152772, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00024935374592081596}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0006693121693121694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003915966570899454}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 6.965488215488215e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 3.5859820215685834e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.00012067093428409366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.188322773568673e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.013305555555555555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019973445104400247}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.0007596001007290696, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00012365940229831348}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.00136520023174339, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00021513459627798125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.01375, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0020331190505576643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.0008096001007290697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00013118909454719753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.001454791799595827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00022889798881739265}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0dc4f6bc217768195048590a91e1e8f70e4ff9a4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.20327154719905477, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.034768468483106676}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.11777046155460656, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", 
"prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004787552105977667}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.0755797825871043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003197252899893255}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.06799149288514703, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027079375902244007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.025207113777389382, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001302556335227448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.029482308327976877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014690441197124206}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.025195913355673966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012304784432672295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.09889828492871724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004441805820458746}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.0535759232563342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022760149883595216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.048776083503696964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019127958949913073}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.10804282075733199, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} 
{% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004604581172532683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.06371930488469042, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002700349737694304}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.057898257679435405, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002300168304929039}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5d290352073f43928e08b1623df52da8b2bb22ad --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.5836328567888602, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11778913760335698}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"generate_gramatically_correct_text", "rouge1_precision": 0.16142810684796663, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004922963050007353}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.12928310552643027, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038524381489268497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.11549730510595205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003274787299760805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.04496956013107435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016446318482885244}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.051485578046763486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017825906801569658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.04447784117945149, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015076383673862696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.12870133893762628, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004434980395614415}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.09129308646670774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027323522213050914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.08223474440316005, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023266800965344265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.1434849944924965, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004641427545467661}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.1085877089549346, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032584779260767995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.0974072227519707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002774866384923383}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2f16f7293ec16166f1d7388e9ec4d57282ccee --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.2251973658832964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') 
%}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06840209821605239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.15582501118061967, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0050057405436943385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.11693432609433732, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036830042145809486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.10350129867546688, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030845729308722892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.037407232865964785, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014744828664782648}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.045276097550827864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": 
"Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001668077617467975}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.038284747118588126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001383633589370129}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.12825581895473886, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0046484472676466704}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.08317479654292953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002625824446891428}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.07447530948404624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002203151007472917}, {"task_name": "e2e_nlg_cleaned", 
"prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.14099199993249983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0047989121526212306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.09896314553373009, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003133393959775989}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.08797882063149828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026214171075166757}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8e1b5d471977acc9430b859931239d2aa9a20fe --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.151782007884036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and 
grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10917316383418346}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.153541975103279, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005017944717364684}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.11318510222695441, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036272890057710877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.10057563196631636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030484749132734744}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.03679979821425693, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014791535592911546}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rouge2_recall": 0.04434331636886536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016832005869182524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.037516989850184534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013843301579119466}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.12610380196461451, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004649593887639469}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.08015880317670764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002581090415922211}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.07193779825133709, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if 
value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021569276149122152}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.13842028827055886, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004798912895625669}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.0951361419739092, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003071944475077794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.0849355679057312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002576383143938577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4666e9740a9565cfaf7aae0d6cb4948956f397ff --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.2641292736786365, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.097056472591506}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.15290909151602036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0049595466535822955}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.11499927704768495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035939559105208365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.10178181807506882, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030246507145682004}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.03683534296753499, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif 
%}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014422357682860986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.045310106634696305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016718535230670977}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.038242180726931196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013857098786122237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.12582895602125432, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004611397478517857}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.0820526177378216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00257106175276383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.07332298026105327, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and 
grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002162653775276333}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.1378091723714546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00475897658619402}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.09649871874135414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003040230110947172}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.0858643495813317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025590711901703174}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6578f8a2a92673a572f3e1644951eab56eee13fb --- /dev/null +++ 
b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.02601889547824242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008845990174481217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.013188289488289925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003020110162685144}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.02331912229418711, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005487407492538043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.016342850382717815, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00034912236129064063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.00018882696164487857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 4.0143613745290394e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.0003925072247489244, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 9.925690772584578e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.00024104025657346095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.3010616942847675e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.013188289488289925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0003020110162685144}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.02331912229418711, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005487407492538043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.016342850382717815, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00034912236129064063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.012772945572946016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000289959796992291}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.022577540598345845, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005237847970514886}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.015820908294670494, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00033227991604236137}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..540960e6b201c15e78ef25730b2697c53e04fbd9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.11194167971178, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11325409958385195}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.44662580344767955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029145470980427935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.403325138761631, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029812540605776657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.396708212066539, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023271634805327288}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.19849995481108837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023301034372897886}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.17476709268624893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019834376005625296}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1714205638298909, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017391694364972787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3259862976774872, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025491424613272398}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.28928330619087117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002329063084463768}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.28469304359904984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018043411811584805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.37806486217321694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002787004218176944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3367501905387613, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002641205675752919}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3323030796627126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002111384569072981}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..40ba026b44cf5a7e5d1f7ee9eb0bb571d86b0b20 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.765851233592166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1147637687545087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5029109698404333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032326251122724503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.399072749631299, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002770754484292409}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4194853660757534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002222220598248002}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.23608052397188775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002567411684370418}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18324767332759007, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001989894281113984}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.19259169221915515, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018423181170468268}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3660346353864222, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028533265797324394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.28763635547225785, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002249079446710024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3027130154416743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001900798692043736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4152150772992765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030631524445173153}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.32804484742896467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002520109021821381}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.34512069878778673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00213278119755698}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d5fb4856af0dc4d6436f0cf082d5ce623d8e06ff --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.504414399066166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14144404460789148}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.509958754752616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003154730928607417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.3954899202585752, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002608601302312822}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.42179412730593185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021527795098671134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.24242098353513022, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002484157206893866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18360253637850166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001904297402323581}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.19636018570824587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018014064871279597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.36777401786978037, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002648283289980144}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.28409076327843025, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002115488452238855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.30275824983777616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018031245552577217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4185549498251625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029283489596877298}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.32342169868683124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023588979812351725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3452111361624014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020448977561221436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..089c1a66a7d99d39d4ff279c1624d61d9cd9eb79 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.453461006006084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20323399299325623}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5152975398825912, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032876121566522126}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.3875757012647283, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002563136912882847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4184745538314975, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002152424221911221}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.24767323400172267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002578983235298861}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.1802765312255684, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018268424519338505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.19590832872090894, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017695553874619732}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.37129359217610464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027272181980460375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2780682094337028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020775510109816452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.30013711995116676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001799774395616833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4223690533102043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003005111083307187}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3169024351381635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023241001320442878}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.34206938848652774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020120604230570572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..53665959f0df468aacd847ff3a82595bf1140dcf --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.336987597938899, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20513507856533955}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5143545157562858, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003337934175013788}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.3857868235592206, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024923995035469678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4170315564856164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002124259039598598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.24631251817723201, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002657555991369566}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.17787346956272435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001792217462473695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.19402158147865167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017876994973534497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.37299125515052484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002815154394732632}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.27850203471081203, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002038628599689064}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3009842944910936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018055725861154817}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.42379837225389067, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031034115053880863}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.31679500554979756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002259000761426462}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3425854571657328, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020124639191903327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1957f914440292df2dc7e139e6b4a8ef5cb73719 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 3.026054687943056, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05078028430795098}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.22893008098167747, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023230957141361385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.3777465242623155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028593004842754027}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.27711271390608916, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024014333114825595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.09022137807900418, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012595960763224963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.14929791491865801, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rouge2_recall_stderr": 0.001792572395692284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.10910465326076894, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013890979482078735}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.16625817127303685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015302425277458882}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.2807767212855276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021268958542967034}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.20293981930119875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001618501844111904}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.18691135016064478, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019403851673907941}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3092909346574639, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002464400666534724}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.22644058640190284, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002021283150451286}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e7e0ca8e0adc1c2497d4f3b4c822f75a38e17c5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.1092098281513625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09127976996292099}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.38095564749467725, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019709425802198264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5860861386660807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002426219681779596}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4510092881121515, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017948907933983346}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.1695045775140401, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013385932546957934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.26780052036509366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020869517300861743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20219167803744306, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rouge2_fmeasure_stderr": 0.0014562059203023273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.26311648134376764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014102722543714375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.412622808220604, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023307564982918876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3135694240329392, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014545694851811255}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3132751166335577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018346757269402277}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4820736892660319, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023982212141781824}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.37081917705524653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017754950366532115}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b54c86309ce5f0593c42b836f0f36840875ca02d --- /dev/null +++ 
b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.46596372404059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10222019609135197}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3651355772456116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020414630989659278}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.576357756634015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002576181875133805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.43663869577593034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019277436539416571}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16361081730624932, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013840748749673782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.26604599291958914, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002256079964522369}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19722529213201134, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015363235060857502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2602719118470187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014547965301518198}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.41863229087687953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024024801780310424}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.31324149645192695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015109155091120891}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3042042275101151, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018792439139020645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.48113522705415146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025570057038425613}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.36392101007128824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018682328209929703}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a75b944cd9434043515db2d63cf6ea788ae4a1cf --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.660859618805935, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "bleu_stderr": 0.1172038453417659}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3587661880944985, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002029335942530353}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5745232266800461, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026290745515900977}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.43141055859111643, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019365140286472272}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16212842829928545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014186202913670232}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2670249903635504, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023104076885023827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.1964954395976402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015922666319257745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2554771790178779, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014430344926745968}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.41621281205529004, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002379747308022119}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3090193571542357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014996989848492066}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.29982900805480683, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018862992882486876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4809216581836253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002619984298616209}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3606858214661092, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019006216836532906}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ee3b808392860bba9bf82225479d32157684bb45 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.669912151694831, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08998324955735416}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.35581234182428756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": 
"", "rouge1_precision_stderr": 0.002046080023554725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5730282795070204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002647698915295993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4288209301236101, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019594115330927517}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16088582394762457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014414061079427547}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.26635743558551805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00235484941364112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19536984000862256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016184104384078227}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25423407055295205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001471223089557672}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4166450671427576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024110898290359713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3082343858102462, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015342038958960327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2978062975462721, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019203204157120337}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4794333800307403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026149511981629395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.35880379298869464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019316365194543348}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2e116fbbd0a870044ea2da9ba3aed1f5853b4f36 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.620027445143791, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09375792130378542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.34835218897272274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020272801988729847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5642232234850835, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rouge1_recall_stderr": 0.0026596537106805445}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4206980580109448, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019612026897740434}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15719137843889044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014304675402805396}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.26134576822206673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002331449346276716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19119099944111612, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016112209219818218}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2505811728785357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001468887250836978}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4121293895675098, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023983911022547304}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3042241306518907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015396594533973271}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2925850043674371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001886060348908665}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4736084330903114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025599032117973853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.35323293802974065, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018984796157355765}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bc0728e03d73a206ea91ad6698a3842f3f047f30 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.07843610918389116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016304546568532293}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.19781222980051757, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004008447460929873}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.11088128154225567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002247020389730472}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.009871857582884632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": 
"{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005640034860620715}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.026368970171206544, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015701889057261075}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.014155568509608755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008063057663091102}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.06563283750134696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012425342106169393}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.16622607422512797, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031008472748429656}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.09287467603067165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017134113448546992}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0633625318403226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001306257704460161}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.1611657589875393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033067458850410307}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.08976009975925459, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018118631890276285}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.5037467911105132, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07923156278606476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e6bb84e4256a696d84e49e1a7170d913d4021cd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.12349083765294645, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020272360845469154}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.24222254243586877, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044424282559846874}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15514646838340018, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002450478109255688}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.015138644522028519, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008835351646857368}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.034641410753845345, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019099475070161016}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.020262527556005907, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011049454084872373}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.09008731584721652, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014860613149287796}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.17489515829679106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003107522060120389}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11242576970395425, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017048075155665426}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09704873972752216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016068769402850923}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.19197702048862725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036060489138808275}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12222801015667094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001959356143537563}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "bleu": 0.8932555367691859, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08607396085989544}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..18eb2774f5631fc94109178a4e60a9f9fd06cab5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.12268381637109076, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018383239534197776}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2846467249093633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00416866407812642}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1672594497220971, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023821447431805398}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.020208437649204568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008690875508991531}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.04992195461086151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021341735292668943}, 
{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.02824595859604695, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011938469906633742}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0902464384634346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012744889966160597}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.21079924235160477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029910133249198756}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12312979455198288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016331251581310881}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09655299851334746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001434721313601341}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22560901395675043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033495485137766405}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1318735139607911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018667917007005078}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.1080989461946913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": 
true, "comment": "", "bleu_stderr": 0.09146307320028801}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f3446fd781d756ddf095d38abc8f6ec1d75c04be --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11891262113570582, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002001687206629081}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.27367870029835756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004475568220518732}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16149874282108523, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002582011824364168}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.020146490797565537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008642878920700437}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.048883434139436875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021010331707940486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.02800561543388405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple 
idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011878939240017126}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08847153927567653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014199450476196438}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.205201991606198, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003311790274919139}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12036250889852927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001844426823256787}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09480824337780196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016157369586947718}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.21941969701456296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036949095354463033}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1289105526954833, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020915120619956393}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.0644281139673455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05948687397505724}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..80c235c4aa87f5b7bd7373b4ef981ce145b89e17 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0354039096330574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024221017075653785}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.06348261494972335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003911537053406857}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.040556158583151695, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002414780159795799}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.005222818308406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006123821346103362}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.011105721412738316, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001297198171401599}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.00666835063292078, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0007560756618639585}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.02786040640367174, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001905350850283381}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.04948373109395915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003054509322886882}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03166574075281406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018736700983275614}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.02824398060982596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020175634460573052}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.050498368281708256, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031803074224818545}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.032061891009607074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019225657173298427}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.5155503095519125, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12577156331553185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dcaa930c2d35e26d13b2a11cb718d758f033848e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.002572898799313894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014841881904327362}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.0001111004565947103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 6.627016158645234e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.0002124229718569341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00012635147814425192}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.002572898799313894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": 
"", "rougeL_precision_stderr": 0.0014841881904327362}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0001111004565947103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 6.627016158645234e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.0002124229718569341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00012635147814425192}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.002572898799313894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014841881904327362}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.0001111004565947103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 6.627016158645234e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.0002124229718569341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00012635147814425192}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a92512e4f5210405497c53df052ed28c9ff6e56b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.05503216443294283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013208178441723422}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.12247694126975207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025016532931361178}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.07476921388091318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001653767538430365}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0044345537535997155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004672948221132119}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.009160898916001237, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008442058248417855}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.005848067139995684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005763476196672857}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.04930420474602141, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010839279734555082}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.11045765316293539, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002067309095667932}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.06711829058828381, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 
0.0013521377427549437}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.04612657642022773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010535188227501662}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.1036251864997193, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020318246258696476}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.06282572428962137, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013169186728595223}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.261262554550647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05001884278243581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..160cab10e1d601224062c27bb6eeb1a8f2954ef1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.07325117374129199, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001655255942277255}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.1712083046666863, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036789822383315267}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.10028369295745565, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": 
"", "rouge1_fmeasure_stderr": 0.0021540097009297766}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.008743555836383905, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006295019423085548}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.021042178818604606, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015442126736674505}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.012072025290438592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008659837789447237}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.06221015702227314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001239278126124422}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.1469982161156331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028600463911305847}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.08551310402778783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001624161272115942}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.05955098102853854, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012927157425887115}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.14057793619440717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029763151410751565}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.08177586301155866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016972607545623106}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.5215024206550787, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03819786214468247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..be4638de668f137f3f5c5f7b5c7cc98984a1b421 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.10254403634811139, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00205888122861127}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.24531036433761338, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004615810458504819}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.14215569897506622, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002716440843767019}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.019657431817826233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000956296408061839}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.048701420734218445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023261449730351148}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.02751335673945438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013158018001875164}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0829632148581498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001546655224020721}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.1999850784660946, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003529529330015122}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1152398964081201, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020348120567768926}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.08247426765333944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016467303434733415}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.1991641708522746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003807983414613027}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.11464083134883725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021942880329705087}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.127011706781588, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08800278654546138}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..43f8afe0eabfdc1657574f2abc2f54bc9489c530 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.1271432311032113, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023074163775109917}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.29372316623222805, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048620902876557745}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.1724090676689418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028418088279796444}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.029429750673113986, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011851736679672697}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.07034873459931881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002719393016178684}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0402095932041227, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015367221854734703}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.10042286402298024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017760282981670063}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.23390892681593342, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003893287643962479}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1365099491658134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022085318962482355}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.10075735870532844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018958298832118272}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2342919714250497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041467607949485135}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1368078332815587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002368380322788649}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.722308165092786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11202608038530598}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0a5acb300ff0def73b3de6ec191eff4f7772e409 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.039100730854070335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024339409571454023}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.07309487784533099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004271503462792883}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.04736351863009491, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002774562583687205}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.008641952844867724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008209970490511706}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.017044130872016457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015311516528441142}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.010845224152235416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009811807847928044}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.031181187159859413, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001963515304654915}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.05798493820733094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033712132693060827}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.037416996326406736, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002165687085012485}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.031826221071430495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002021894152923463}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.05922841807653992, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034905888872791228}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.03817823692527911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022299777911451387}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.7851146582603888, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14248563320706192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1b0d37806f2fd6ed639dc45775e235b2a2f8cfbd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0024406380399721135, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006673287893985838}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.002125069578891824, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006354178152671431}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0022081044921040066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006238927634391291}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0003296901951174981, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00018804946463358912}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0003008969679774049, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00018767733482335345}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0003107051777238192, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001860944713122803}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0019949852219552553, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005366470253705943}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0017177557215678593, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00049942930432405}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0017991228311771584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004996511492676782}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.002116903629112286, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005752535478594655}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.001816496355313458, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005356117222057922}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0019064317416731434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005368859371026533}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.5125237582746943e-42, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0979696174822401e-35}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3648ebef82c58256aa71cbb1b03384b5079cdec5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.08057623396604993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018427190672612415}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.1903611222785915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004119319495363413}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.11144536072130062, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002449613618907836}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.012439638488265747, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007674450472725088}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.029923546174765742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018325311907880977}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01730052045113504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010550283614850532}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06836163772266587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001418824283099982}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.16290273051352785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", 
"prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032456160924942976}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.09482469984719238, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001891115330305303}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.06720724706425169, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001458820899362241}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.16056752379802697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003380030301216589}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.09329416183941738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019565438808381327}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.711911214189282, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.062271095680873176}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa732e5a5c238ef3dc8feffcbd43399c5455617e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.08658830544513134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018612611645494558}, {"task_name": "gem_xsum", 
"prompt_name": "article_DOC_summary", "rouge1_recall": 0.21204874097915127, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004437751832600217}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.12151658666281231, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025542911124592704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.013510304634606875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007312649740318255}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.03415295300257043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019425669982816587}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.019132118327200527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010395559026960023}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0702987343925042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013468612933160927}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.17295883718094748, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033149082764143117}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.09875044840716671, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018572603815356456}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.07169769773353042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001454220219767976}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.17642520944692217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035711244979865823}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.10073542075586238, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002008424522536892}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.7485653629026496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10161315249240252}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7c7d1f9636ea147c1474d60dbf374613c5fcf39 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.10919972863971208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018326070695569962}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.27085077984418787, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004340987074007443}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.15372555310846955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024917036123975646}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.018660104842681158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008521301906071727}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.048121178106264206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002236763054096929}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.026545543337132424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001202505861016018}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.08598679476507894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001347479595633704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.21491972476150967, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033319697703370205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.12124448179130719, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018406701231741705}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.08936454412712036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00147707143314301}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.22313932236999878, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003637762226705496}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.12600811759241443, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002023272430841333}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.0047358326681721, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07696741647689843}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3820f5d4bb34cf0427c6fb5cf169a7e4a97dc62e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.12156176985300436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019916743006548566}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2919486745230574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004594202930152972}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.16788870363340208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026015869149755492}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.023390561110934703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009482124699139898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05931283277905219, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00253961685376344}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03291830334125208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013348595001679636}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.09410425031338661, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001505615984367887}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.22729829891507616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035793671647219765}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.12998891345659275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00194502365003499}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.09794367331761664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016077634502078913}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.23679450093245694, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003856459984346274}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.13536430596241408, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020963457328233175}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.3876528749760366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09352517366139018}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8619b226c8f60d900cb802e8c35f017ef8c4e457 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04061391536633297, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025885117246031656}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0774019664424454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004501264779451914}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.04817132624367883, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002710318861619294}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.007571424737600509, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000774569805719568}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.017306355810365114, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017078169253319931}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.010104068388385765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009810253225945517}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03226926320041072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021853447839680425}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0606544963878778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035695265285015203}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.03758941214391744, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002107269506833207}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03361123521194327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002255713167385482}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.063038545340123, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003730201727070476}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.03925168824333697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022220799806552142}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.8091606018729823, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13365493953263705}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dc4c17560cad5ae9a5036e5310119fe71a833d10 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0025292500918997793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006843948420078455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0020532297175197265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005556296741534733}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002218415772902317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005977307451849004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0004376650603065697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00024169751059179606}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00040004436924525715, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002447488200268485}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00041371259854665804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002415623180229552}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0021231909904583543, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005742362353931501}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0017609197535564964, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004851127145778472}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0018828340816919485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005121954193145257}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002170837264519722, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005838940116621144}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0017952250708806817, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004910386484249093}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0019227239855572795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005197852399034371}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.9417748605436574e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.644365126953672e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93e9b3cb4f03c9e14a1dd4b151c4e9b8998eca73 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1290022798344525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002385695780968086}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.281977709942812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 
0.004346065008819319}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.16881791798821513, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025746455343428143}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.02404571804293375, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013659743034430886}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.052449257583153115, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002298683613553151}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.031013676801335422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013438213374447965}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.0991345122176881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001904974022092299}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.21710334307827317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032407820193998097}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1293936279197813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001905181791064322}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.10327457231643225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001973038870062356}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2277601179019926, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035824212720264757}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.13530856271788863, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002063773976743065}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.2025114531043062, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10655129877623105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c78b04bcb6c1eff02a4b778c4dcfdfe3f9f7063 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14417853642749137, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018611534066647365}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3537153376225196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042546837674507666}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20187019098994882, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00244120776332977}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.030641973891149483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011156075040613734}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", 
"rouge2_recall": 0.07800524098991497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029228930462181354}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04334620232538617, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015695593858469068}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10744022317808914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013648772722477944}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26585677142562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033591403270810637}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1507291074053435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018184345906770076}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11450326924869379, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001534569154151368}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2828354708969096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003689314769244396}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16062701136729154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020501392930297666}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.7762050506045142, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "bleu_stderr": 0.09926295387249034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7071b4c790d7bb0b5d11648ece7264af7b7f4ac3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1415629115103802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017838032550705376}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.35091090779004147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0041843712980091045}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19932155852386563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002392922806012122}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.030702156454896785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010693945263455453}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07873922100816551, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002781171908981175}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04362070001507444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015090115501029922}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10636784579398613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013043402766199064}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2663335312559493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032882907730531676}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15012616545978386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017748529391718238}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11293750566369144, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014708469401565362}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2820820207716993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036056577642723567}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15932750811184002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00199892928658887}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.6727033378910847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09495317261707087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7d12ea7c5f77a2fda5890ad890876f7b6dc00c28 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1366930941690243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019671239516403935}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.32595135506801376, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004477539106095352}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.1884376588525654, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025642932469730627}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.028528787233145586, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010703763072091093}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07151102797558972, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027297730367125982}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0400453211123096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014862606183809399}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1040074617013233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015105673758063752}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2486131025320305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035056965604027203}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14317246926611424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} 
Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019379403187274123}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11019224321987507, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016599707159991781}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2636677319513756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038844849580001823}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15181862614650385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021601749082742227}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.742293158445559, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12475047173558831}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..66d3d9fed9a69c2f395c01068e3a1a25e86010ce --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.04410024857440917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002722944168692114}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.07974432748836109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004470156036803333}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0510345184963839, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002788963063519097}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.008214763901131556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008388994526412292}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.017137458752784145, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016302525508184434}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.010522073701869125, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009826679169524012}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.03366378122590164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020732284962604146}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.06128142643464567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003508572574499959}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.039029603911429886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021514536166429943}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.03523949695288221, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021552641230410848}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0645342916738449, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 
0.003702351211849086}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0410270223606163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002268750341839602}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.8218145273207486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13259402458841246}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cef44a92ff4f9a6178849c4e9e78868336afb9e6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0030006953279340686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008260104673619008}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.002448875117757725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006686883097147919}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0026601966494432523, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007275199319288471}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0005102355407571833, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00024289107214722606}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0004277053874723131, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00019999235661580537}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.00046275158053195667, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021785531794482434}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.002398796026490401, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000661743079139523}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0020040874122394833, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005526887153193406}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.002154351416462934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005935536504835335}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.00239564296423634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006561431092058282}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0019841067166653605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005417182019917762}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.002139637125943983, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005838831183619806}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 4.273566994720392e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.8291126911657586e-32}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1160284a54cc824356f5a25178f4e5405775ddd2 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.14203799200266776, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002270644021652014}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.31732904475268425, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004576952267589019}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.1900087776056167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026590712025527704}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.030454613592282354, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001210755597735052}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07004943298066989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027142097244891622}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.040900489822348056, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015444912268875982}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 
0.1084493048877211, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017320556174949715}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.24390681149682772, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035304448560530047}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14526081266341737, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020046277742893984}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.11212112563171714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018151655419957806}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.25327587259397216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038632056834617744}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.15054379734755471, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021753294084953185}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.6188434485166334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1109564339936777}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.json 
b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4a68905d721c62ba5cb4ab773cd44d452fcf93 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1310768758781413, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017542917965564532}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.324216765100875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004101611205461987}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18445302591356055, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002358252600127643}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.02727413002222952, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010077697140453462}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06992767890558657, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026846374203769387}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.038774277981477374, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014338449153380509}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10150380205667145, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001323267745441165}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2537245165326843, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003368126070672136}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14321716258429631, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001812335103355609}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10624902236675653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014584079648567682}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.26509353636861205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003626121575001854}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14987577117769335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001995803271934798}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.5775644045947161, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10747802490972798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd1c8d413523bcba12e46e777678636d3247130a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.12929139605322415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017250930103395053}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.32127297765523716, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039219092067371624}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18223568358253522, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023053601425562413}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.02577874922325936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009382634954700045}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06615069752192394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024396021514475836}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.03664914264570665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013235856869957899}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.09993852104012416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001292895388479902}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2507415234860173, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00317071813365378}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1411906750067624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", 
"prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017542193090228139}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10524886017120061, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014473708528824547}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2635160093607098, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034669204083299546}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14863975004627103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001961623937053356}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.4631699049107234, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0818224941441452}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc67416c3162df5ec60f8018bb694cee7dedfd7c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1270012394870458, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002091745941086545}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.30424480950743027, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: 
{{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004523774432666432}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.17573577441569346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026465445466935253}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.026605921898331948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011008712275775547}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06560029901447437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002618665050517068}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.03701973106444136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014575936915393465}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.09771059146617143, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016171559842817049}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2357515770528734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035751298993674996}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.135384466992445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020400308616851347}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10278375996682501, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001730936323308653}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.24848166333986804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038955439908868214}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1425837105636293, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022180825213510558}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.6024997408404598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08124844005963922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..38aea9f0cea2f4e11fa7f1b448d035a0b8e57641 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.03513411483539431, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002220905791930144}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.06807789519735524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004165774223715686}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.04277432877434612, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_fmeasure_stderr": 0.0025508570289566007}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.007057485487635765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009007492112467273}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.014720343410307037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015318015560624846}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.008786196844590121, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009097928996719041}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.02768868248284088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018118702472072936}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.05338300755867666, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033015217413937302}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.03335136680279893, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019990065744291984}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.029185508102300205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00188906175149616}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.05628446465745351, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 
0.0034924975937847573}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0352809944078966, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002126337470382871}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.7057325975464012, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1666309026142423}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bc1aa41f75e2165cafa167a6e3f4a875534c4e7b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.003716409376786735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012903356644372312}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0004836451099249516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00016932836035127237}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0008504247165137163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00029663122917791953}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0004288164665523156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00042881646655231734}, {"task_name": "gem_xsum", 
"prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 3.430531732418525e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 3.430531732418596e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 6.352836541515787e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.352836541515829e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011921960276713975}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0004359988358635832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00014796403981705004}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0007687453895513705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000260905903643579}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011921960276713975}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0004359988358635832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00014796403981705004}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0007687453895513705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000260905903643579}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b15559a857be9c4773b8f14742c8254daa4827fa --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 4.4726418073213345, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.16753723606834903}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.09990759568043395, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003962742821805288}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.542668484521527, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.008163857356083524}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.13738094874969342, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0040877314692411865}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.07078028109191174, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0034244977285782584}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.3897797823742886, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008526350760099893}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.09706102035374112, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.003653466658553787}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.09777072760562562, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0039192045347804325}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.532792582986373, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.008177843897950418}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.13447315238265775, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.004050384508000915}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.09637631448880932, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", 
"prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003951999111601633}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.5181791996400547, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.008308241749147611}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.13173120941865174, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.004084755897159187}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b562976bd9096135aad0d4f6df11622a4320ba3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.388997984440001, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.22030624130684912}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.1899985239367049, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005524798334171625}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6197096800293467, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, 
sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007218975234253726}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.23339725988760335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.005577890928986144}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.13527730400642915, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00492492204254613}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.45578433847724403, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008317705400555226}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.16839814753926893, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.005178370714369835}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.18193802149181904, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005417257807881834}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6024769760148035, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007410768687993907}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2248230330602144, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.005533593484296289}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.18398509430200422, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0054807247743772465}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.5984789556418103, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007439193497312767}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2260074903815481, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005576688947640353}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1b86eed481b2b61323c00173c9c4fc2564ad518f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 7.1654235804393975, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.23664442530069865}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.2632982490541137, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0072621786329556}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6328871785091988, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007011851144328955}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.28818203262436376, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006789855255273055}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.19631851485753235, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.006522555304684875}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.47001882425100405, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008218114595065874}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.21700191007059494, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.006362729074282588}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.2526725103593678, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected 
solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007156069139226169}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.611025032648354, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.00729448166920107}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.27789781089889687, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006777757204322281}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.2555628679056802, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.007187190219846719}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6156889934973935, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007241268875757391}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.28052899465966125, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.00677957486590257}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..1e01c6ccf1516c67812270444e3ff067381e9ee0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 7.130010006945033, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.36614351972675196}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.27187069677018444, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.007724489648815434}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6409983231082317, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0071549123595310085}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.28765257778776016, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007122033948855405}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.2074993154694547, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.006958791199014906}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.48909595674061107, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008272314038685013}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.2220313726729203, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.006635540925947997}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.2622707060445547, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007591102476600222}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6217153240581279, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007407513056831873}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.27918830024031255, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007101399798328581}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.2651656938002711, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.007644202201817951}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.626997130842089, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007343317727382677}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2814597792431154, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: 
{{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007102157344044307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..404d931f55e8a50ac00c940c24cf9bc760b200e3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.664095500884982, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.34838973765349207}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.2587985355155675, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.007619203655004035}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6552543491315841, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007002192904681263}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.2771124149282886, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007080361587320621}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.19927034297156443, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} 
\nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0068409334865776284}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5044582337397387, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008192238508307557}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.21583669822052345, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.006557201662967979}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.25108837349831326, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007533159919158992}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6368030343144712, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007249065063061643}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2697811327830668, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007067126106733718}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.253246129669511, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0075494739014644666}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 
0.6429352137151435, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007173543257338698}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2717897500548224, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007057987573730667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff5b74c626ff9f6e075694ad2b3aac90ef9befa --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.446672886311619, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.27464231120207433}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.2528718402860418, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.007719845208329688}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.654561878141819, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.00698855116841065}, {"task_name": "piqa", "prompt_name": "Correct the solution", 
"rouge1_fmeasure": 0.26736214869813785, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0070988275428792045}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.193812416956008, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.006844516632300937}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5042421877334644, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008241858798974178}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.20868674330105244, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.006522205599689379}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.24481322454808105, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007592775252281026}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6363050209515578, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0072450011340579445}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2599202410891578, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct 
solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007057224366690571}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.24690879630896684, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.007607566463992895}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6431100118567753, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007155463175240526}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2621481175778254, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007056258826347817}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..14a701386b5f3fe007737efe100f5d1b913d4d57 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", 
"Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..963a9cb1185e1b4c8228112393f559dd54376deb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664055982032837}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664055982032837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c673ba85e76048ea0e3aa0a017f1e33da26ce075 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5223068552774756, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011654208652596476}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5223068552774756, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011654208652596476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4732b747a69ee43e4041c4b2b52959a21eb39f8b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5092491838955386, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663828032649183}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5092491838955386, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663828032649183}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b1d161302398537368588b21e0c7ff2a7c73a59 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5282916213275299, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011647134172749322}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5282916213275299, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011647134172749322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc09c7ab8cba337f80e8358229c180881c5b194 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5114254624591947, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 
2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662778026451659}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5114254624591947, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662778026451659}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4f294fb69326c978a12a3cc82bb83df383ad698f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.19326617690121023, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.00752510845276588}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.021769865204723387, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005817240957449695}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.23632094446041846, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004083513329156124}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03752514813814976, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008846335161188192}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003405753331635085, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00016739880607996655}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03960148154546242, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019531504378148563}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005928136888518339, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00027560748473478435}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.019577427737286695, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00047793881065340873}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.21917768067787397, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037979740764717976}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.0339518071889011, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007449293744359491}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01809711784394688, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004916087758295677}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.20397788862519056, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003704689919592339}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.031229364613330524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007424902988435048}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9c9e36ac7087eb85f08cb730cf882cf2d6aab5f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.17941583254637414, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.008838972109532342}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.02104978239320399, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006182804474426174}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.22374374255453272, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004184853646640156}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.036115788468476816, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008787800610938908}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003222523734541128, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00015290781225805715}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.04029314519755627, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020801960108103577}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005682715949708656, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00026165286218016116}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01937672404519423, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0004938315065636547}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.21137283250925631, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003919038757731567}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.033515515939669824, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007571377468473982}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01691374585639976, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004598604234910933}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.18918805907421266, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037299738776066143}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02922662157472046, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006947111027433623}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..242629932b110cdffbb0f66ed5b463ab60944278 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.17745296114678968, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.008624930392648838}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.02112776839925051, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.000877980468137592}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.218206359468982, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004101332930154146}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03499069784081937, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008446359024973105}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0032849283759100656, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00018537681002876464}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03820330073739487, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020212152218120854}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005621916396892083, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002732592116855481}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01981043631160736, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0007993400917579435}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.2071458409919012, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003867415965532175}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.032907905075395594, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007674706272733912}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.017213819703019412, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0007926417088864289}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1841964822860343, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003656569397088453}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.028356416929466385, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006881727791999624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_3.json 
b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7fe00a9465fb172e33cf45f99d85ead889869093 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.18436487057820314, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010658373126425667}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.02057704871830096, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007494006749779765}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.21357292385095153, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004127038723114807}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.034174719119795666, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008567973159331906}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0033630628167333936, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00028514960802026207}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03759233299677987, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020872224529434385}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005486989401606149, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.000276704602860595}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.019175227864817394, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0006222647397159332}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.2029780421179166, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003897679155093511}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.0321009603843197, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000772867121470867}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016726612502688427, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0006097435643352546}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.18052347991125617, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003650693342311041}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02777350418405329, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007032861760892736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8479f4b54d2dfcc048ec852d32093bbc4a6091d6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1662496800572863, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.009003816484519792}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019771160862529565, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005967888743923156}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.21013503593016225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004080380870732406}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.033492958359078145, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008122527510855113}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002967361151580668, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014505452013749797}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03712110581081379, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020199022271893104}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005250361302057742, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.000250744194605345}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018431702367967623, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005086664104353652}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.20000526433171545, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038644792190701566}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03142149026212096, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007237635478632389}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016047371965998675, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00051159998028551}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1770149433781237, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035827141690138965}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02710335704553083, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006613328324069825}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d0977f9279eb2b27d5c0f6015ebddebd2ee8b9fe --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.17909655705334981, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.01001948544264451}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019621328243135686, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007677978709294303}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.20744825708534467, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004062728081792751}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.0329209856904888, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008425698682080111}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003330941009610586, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0002884450625712378}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03734761717066073, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019901336224895593}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005515135528910162, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.000282385997180886}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018273289410454367, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005985872356689969}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19766569572595405, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038632884293854372}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.030971756359625915, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007421166076215399}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01596955228121598, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0006686144046345224}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1762042254326998, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036326407496276346}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02675062591564841, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006935141958106178}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..18d487bb764fcdeac64227f000acaa15621d3fca --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6002cddc6dfc24eefa7940e2537e9079b083bf72 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5076169749727966, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664470424044976}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5076169749727966, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664470424044976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3537a9181c969984dc40c72dcc9ae91cd2ff6b3a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665824165343952}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5, "fixed_answer_choice_list": ["1", "2"], 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665824165343952}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dd5212885e88da666c103905dba7b948df16fc65 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.515778019586507, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011660014400426185}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.515778019586507, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011660014400426185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..96a5524d30a22a6ba8a2df27cdc8195e497024f6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5228509249183896, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01165363483240117}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5228509249183896, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01165363483240117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..78a360ec2b86c5d7225d8137c444c5e0dc528f90 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_0.json 
b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..221daf874c70db11855caa191d7bea9dc8f8b89b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5565832426550599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011590883373666863}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5418933623503809, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011624803747232126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b3b9ed38661bc0c5a7a7f56a66d3c54027ffeb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5685527747551686, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01155565729886461}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5674646354733406, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559142916063143}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c8af23804d7789fe49c5321a57c3811949bc7fcd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5718171926006529, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544859155318844}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5723612622415669, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01154300962328283}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0e5351e2640553213ff094f9d564ba426fa79ae1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5663764961915125, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562571737707342}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5723612622415669, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011543009623282828}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ea60d194b8d6ffa3e1f8c304b59a69afac79b864 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5865070729053319, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011489895831821135}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5745375408052231, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153546884082453}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce442f41bc3af0d2bd4d1ab87f539f4aed8f1fbd --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5848748639825898, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011496520442659124}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5761697497279652, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011529663270276293}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8af01ec596b082b912a0a13730be798bea745064 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.613, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015410011955493933}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.543, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01576069159013639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1b3019b879e611d5d1182804cb7d36fa0fe6ea7f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.663, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653591}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.622, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015341165254026642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 
100644 index 0000000000000000000000000000000000000000..8609f2adcccabbdff640c7a2b55cc7086cd84957 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.673, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01484221315341124}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.637, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015213890444671287}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a751d1297022950f7a972d769c092035f31848e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.662, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224472}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.65, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444236}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fa660e0d6c2b37960ec43465affc2b33defa178c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", 
"prompt_name": "Direct Question (Closed Book)", "acc": 0.671, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928369}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.66, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..595a3913c3df8fec9889c6cf6c8736940fc56a81 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.682, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.678, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..85300af613f077c930bc7629e70a0c89b3e86f2b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.83, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: 
{{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011884495834541665}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.741, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013860415257527911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c70c150503a98a2b43945ebae41099669fad5df --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.846, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011419913065098698}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.794, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012795613612786525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..99494308f768c3111ff4541f525661589da08aae --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.853, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", 
"acc_stderr": 0.011203415395160335}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.805, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012535235623319325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ec3f80445f18e244eb992e2a9b66519007b4c077 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.856, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011107987548939149}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.804, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012559527926707345}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..247e1511ff8badb0694ffa9111c248f21aa16f37 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.849, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01132816522334168}, {"task_name": "sciq", "prompt_name": "Direct Question", 
"acc_norm": 0.81, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012411851354816324}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..acaa8af20561c9a4ad057ff163777a506e715deb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.849, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011328165223341678}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.816, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012259457340938598}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8644d05690cf120eb65dd6950827d0e91ad7d96 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.287, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], 
[3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014312087053809965}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.315, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0146966319607925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d1836c9490fa7f79d4fe8717a702f2f817d25d4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.378, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015341165254026649}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.358, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 
0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015167928865407559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..66c2a163f4f45206f90eb06d449eb96c8166ac22 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.372, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015292149942040577}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015275252316519362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..eb245032c626319d50be37868e4c41716813f78d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.349, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015080663991563102}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.364, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015222868840522024}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b82108042f00f147ef713e946ef26a2b37a0cb64 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], 
[1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014933117490932575}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.358, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015167928865407559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..884872433e5991a133fe2673a915480c092ce536 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.362, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015204840912919498}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.356, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], 
[0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015149042659306625}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..42477a5992936292547a212b44884104f91bba3a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.349, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563104}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.339, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620349}], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..75f2dbdbf601d836e2098a3c61731a0cff8df734 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.392, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01544585946377129}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015356947477797579}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0c2722582fe64448cce07b322be7aff650a9479 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.363, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015213890444671278}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.386, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015402637476784376}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..eea876d8987b4dd98be77f90f03217a116377f40 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.363, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n 
[2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015213890444671285}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.363, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015213890444671283}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..871f200015ca90144dc62e83728c4d7155b0e061 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.319, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473487}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", 
"acc_norm": 0.345, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4737705b24b69e420b539ca8a91ba7f9a3f110cf --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.333, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229852}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.346, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided 
answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01505026612756445}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7df47662052c074a70ad56cc4b47551d9dd168c2 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.342, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121728}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.346, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564445}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at 
end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8bb9318c7f67ecb3439bcfea9ca51b9fc0200d20 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.378, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015341165254026644}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.371, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01528373621182319}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..930691d6568440f49cab633e22b432eb40e1354f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.344, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 
0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015029633724408948}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.323, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1b11079953772658155374faf4dac7d1424280ef --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.329, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928369}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.362, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015204840912919501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..732388e4d282ef18a8fbb41cb653780663b9e10d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", 
"acc_norm_stderr": 0.014933117490932577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..353993376b968c7af9431c391f4bdc4db4a00b16 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.327, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411249}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.333, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..969853c45b8779da08c8aee5df1474933ae92990 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4719401389631213, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544210396951672}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5050774986638161, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561836054238783}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0cb6a0ae8ca8d45d531a708d84753dcedf48de5b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4521646178514164, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011509395748220104}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4767504008551577, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011549925483927456}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d28ffe2309b002a0eb4e457c0584de9dc39fa3d3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4510956707642972, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011506993144185188}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011530479981182628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9a89b6970ce5607f05066d95d2b1940c82d5ca1c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298178}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011535764881641411}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ebaada8f6f850fc69e778420552e64b79e88bd4b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.45537145911277394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011516282203726655}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011530479981182626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e166ee9285b60e6103f13924ebb2f9ce85c5687f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298173}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47033671833244256, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011542066509767012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e88b2e971d4be057014222cbf1eb637b363efde3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4906467129877071, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560409019420364}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.532870122928915, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011537420054210306}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c57e5008fbe7b06f109d7398b837b283ad7047fb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story 
:\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011524715486240657}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4917156600748263, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560845076525713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..197a21624900870c9559747c229096add37eac1d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4623196151790486, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011529552555884571}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.47995724211651525, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011553138977961008}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..251a44385af4a6701732b6579a69f3f83bd37323 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.45269909139497594, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011510576955232206}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4756814537680385, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548748301487319}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a84daf32054eebcb0cdca2ce3e0c3632ca275990 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011527657726586461}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.467129877071085, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", 
"subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011537420054210297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..129393b1b0027542c6ad4a9bc1b6487c802b29e5 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.467129877071085, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011537420054210303}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011528611805439891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..97fe0c95c71fbf02051b4d4f874c69c26ab7810c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..87994628cdf6c9b9ea7af45b85a52ae965d998c1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cca0ed93baf59b99fc9833ac9a079fc98579fde7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dd83a2029df35026548d448e3b26c6636a8f2a4b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..81e883412f9473681ff2260f080c1ce15a4a8f83 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..79bce00b2868fe0c9bc34f12db6a8736eb23ef54 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9aea9233f7841cf6279e79da4d17b278b0e2bbe1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4831640833778728, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555875693960773}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.51309460181721, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558466383367178}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..673262a58311a467e91e96ccb95bdbfaaf062165 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4494922501336184, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011503288699799176}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.47033671833244256, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01154206650976701}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..31287f452c1045302596b46c7e663354ec71d22c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4478888295029396, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011499463505491369}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.45056119722073756, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01150577173876986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..df481421d98726a5a7b398c757acf7af9f8a7b2c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4494922501336184, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011503288699799176}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4569748797434527, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01151954486592806}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ebc5cffb36436272c840dfa508a7ffa3a283d1be --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.44200962052378406, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011484402719452577}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.45323356493853556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011511744771088355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b795a77e3ac606452db34c0fe18a1c6bd903ea46 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.45056119722073756, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01150577173876986}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.45430251202565475, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011514040245583501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec91c30ebedceea7198853dcad43d28b1017e91c --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.49706039551042225, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156223242154194}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5259219668626403, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546883081384903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2330310103cd71b2a1b9e5595b02cb1efa6b45e4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153229486915312}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.49438802779262425, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561703928784337}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a21d8fd8a8d7f80e25b816b0710d9b3401dbeb7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.45911277391769106, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011523708060182082}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4746125066809193, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011547518083754583}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..250185724e3e3fe1b60f5e4ba24432374bfba98a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4521646178514164, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011509395748220108}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..06477ebf7d4ead1238392c03c2042ed11e6a3db8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4510956707642972, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011506993144185188}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4633885622661678, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011531394084549621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f40d87ee64adadb8c78ffe9f001c651089e1bbd8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298173}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546234813777397}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc5b3aee9efa9a38160d9c09c12add3230024f18 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..908be0e3dd7788505738fe0607d843b3a715a051 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.4729241877256318, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143713}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4657039711191336, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..70d1d1a7bf0abd6f358d2040e3150766a0d03917 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.51985559566787, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..817b16817d5de3b5bb6c3e21e3e7a7919785e15b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4657039711191336, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8ee4cc99b60cfb5db663bcef52d7590925f33fec --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.4620938628158845, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..31fe269d4697a55a0e29a884eedd83f754828e6e --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.4548736462093863, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029973636495415252}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4620938628158845, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..abd53cc28a20028e3c753f3d9ecb4d5712e9bdd6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5226eb6abf7898c0a307604a122fa1da5922e1d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..181878bdc2a4a39f589f37bd3a7f2d53c9f7c364 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fa561dfb8d56c2122113ab18e308a4aadad0042f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6aae59376f7c6a5230ebad037302ac92b37168d3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..646949d0b2cb386daa368eb8f53b7b19be0b1ed8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4693140794223827, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.4729241877256318, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d760541fe4e2cf9a8758de2827d9a6ee5721ebae --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f9fd599851153999eabf8e48329b8ca240fc6606 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92b69b44f48b76c21735fbdc02af4a245a0548bc --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ca13ea13ffc8d63d723cb09b562f748dd84d6820 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a2b127f416bb3a87507035e000155074f9887c7a --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300727231673172}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bc953087da756fde2a987eaea56becf27c67544d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8fff3b07d2ae5c4ae48daadf7057f86cd6bbf836 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03009469812323996}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6687da51f12d9e08c326e70ca5b6491ddcd821 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7bf486311e848d2d1c91046b6014fe4acd168189 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300523034631437}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..58bb027ef44a08bed81222ba96e1e23b0d048f82 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1f87d802f361e3d0db6ff79dfb905e061a6ef0d3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6289f005aa38ce28e23c56f38ee997ad7cf20c0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.4693140794223827, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bff5f10bdbd6ea9263c9a0ba0d1fdd0fb1692101 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373324}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0a9ca3a7b5e425b7d54e2829eccee495f02cc475 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9682f3a8249d9b97218f9b8647b4619d7be2b0b0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..092db5e174cebaa81bfb12e5384ec89dd0b752ea --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f2f57416857d6872ef1bd9f376afb04de47e4d59 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300727231673172}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b67ace993025763ba352fb374bd5fcb805b530b6 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.4729241877256318, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56cf6b2b550d8dd98005e9ba3c00eed68b79deb3 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c60cfcdd25a7e88b602457a9c83405f5da0b2e8f --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404880419985932}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd914d8414c11e1e87d271cebb306c76c7066afb --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..31f94a885a94cd3dd4dbda28c59819ef1c52a2c7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else 
%} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367592}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e5d7454e4d8e1393ca1ca9b8f1edb3d425fe17b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4925019731649566, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405090552122858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2caabd6802889a8590e455d55b7fcab82a2093be --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": 
"winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076887}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..625c1b711fa84a8858c276d83940d47cdccf5ad9 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.494869771112865, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e663eb0147d4bd3efa28e8e481730ba3d1539304 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.494869771112865, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.494869771112865, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051745961790516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..43d0fb46bd2393b612664ca0293fa78d198146b1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.49329123914759276, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051220692330349}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.505130228887135, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051745961790513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..17c8a23676999eea0efa476245f06d18822570b1 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4988161010260458, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052446290529024}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5185477505919495, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014042813708888378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..34677dc19e9c992e9455f5fdb2074f7c0c811415 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5098658247829518, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049749833367589}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5169692186266772, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014044390401612967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..77ee920d02e2ebb5591ac23abdb2572176d20aa7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5019731649565904, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052376259225636}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5146014206787688, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014046492383275842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8973d713461b06b0ab8a3ebcbaafb40adb3150fc --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915867}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4877663772691397, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048278820405612}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a976cbe48e7a34cd242806e576a83e705c8ccce7 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330349}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.47908445146014206, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014040185494212943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eb744f16d4367dc65165ad9f906f87de31fe35e0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824197}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440419}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7ace943bc599949be704aa554cf930a70173f2df --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048804199859316}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4925019731649566, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050905521228587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5e2d4daa1b96df8f0ddc1ca8fc743b8bd1305590 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4877663772691397, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405616}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48303078137332284, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404439040161297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3f208f4ca532745d60d6ccdbde0071b6de3d34ba --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4925019731649566, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228584}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..326e84313a1d6c5618fb8eb335f61a9e77a0d890 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367592}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c3966d6e976b1d730fb57ac168112a249930072b --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", 
"subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050170094497704}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140492945362904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8dbba40fac7cf346a98d2d2bdfc91a90e9dc1cf8 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330346}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405174596179051}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..eb107553f9c17204f1b158a4cd9b85d55a2b1f4d --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 
0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050905521228573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f47ce46aa0c4abf7a9fa1886f4dcd06ca45ea975 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225632}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..28edffff3b1c67048d6537b70784e1530017a86b --- /dev/null +++ 
b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049294536290403}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.48066298342541436, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014041972733712976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_0.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8f62387d37f21f571ef0e4c59b395d96774a0ad4 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014043619596174964}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405213114691586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_1.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db9eb6b6c1d06b296d1c8639f00fd5eda58b0200 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915873}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_2.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3e757258f54d0775a26fd1fc2330d563509e1170 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225629}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_3.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c09d3095823ed86731295160cf95cd5d30ac0af0 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5209155485398579, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014040185494212947}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_4.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a2392e1f93ea0cdf5795efc82d3a68b35b9a5217 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5193370165745856, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014041972733712965}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_5.json b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b0d2983b2abf88a35d546af782c37ddbbaeb0963 --- /dev/null +++ b/4b284b12bc4/eval/agg.4b284b12bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050170094497707}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529009}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f9b39faec71b76b4dbbb9424ff3229bddd7d951 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9946da6639d26f1ddc0c35dde3183a56b283513b72584fc6212f5cd87223275f +size 8199590 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd2f728fb7b0ba1907e839c4375301f8bc81d36f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae180baf49d15678287f9ea4f8a2531ce2c2506ccb7480d2f3c82ce422f399b +size 5025263 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5695f85b30cc00f2273f01b2b6da8a8822c81773 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a571714ef7f663c8ab42e99f87515216b8ce6c7f3572ce6d1ee0531ddb85a1ec +size 41587812 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5beb79abf062d4808f9f48d1ec930844c1a44e3f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6289feadaf4c1f4a766875265d1183f0053f2ee7545913e44971d651793c046 +size 27340604 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ddca0e7677ba113f11e2a64e89b5ca4158ffe9c --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e6231545ce2754e80158e0c76a12835f3f302485d949d4d51ac01e13867053 +size 7746498 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bf12f3f4f03d00e9a708bc16b532123528b2210 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff5889424cd58fbc7a46c4d9a97631ae02081b3b3c3fc46e743330d8f336dfcf +size 8648664 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7f8a10b1e80f7b8c1cf2451fc92c3cc136f95ff --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3a815c1718d5bd82accf25fab41c8d62df752a0eef4b291dae18ffd5827709f +size 5107172 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..420b11a1a5552784aff84affb2fc8bbf421b2ee6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e2ebd6a5abe4868eae5c85edad026cc3d70f381b618d22118a8a4cfb7c211e +size 3517553 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05c50837683e31f7da010ea6f72b6a5a3148a083 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db83f776483e8636afcdd3f81685e32b465dc4faa72689a118cd8b41134d8230 +size 28099344 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fbe373f3929340204ae8c66f205cc0828e39973 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c74ffe6ae79897ee299d5fbcba75e654f8e989339369d414b3fec7683ef0154 +size 23061220 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4fa0dc32fbe2970521645d892619a72201e8b525 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5916ba339b71fded585255a34c77a522cdac0295fbdfefaed580aef897710ad4 +size 
6748573 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0334525e64e202c0243f0095929292613f377289 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f70190841ee252299f41df80ccc9060da59ddcdba2483e4b8d34e36d4cfb51ed +size 7596628 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..782e52d8ba8c225d307794770356349b58593498 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b5167b0bf02875cfd1094683e01f12562eb58bad10bb55df6206a62736f8d7 +size 9783962 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a92dd8e815b361e1c7e0a8ae89f30b358d42994 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b256de9db21c642a1d7e9a4b30949cdb6f330b604fc9613475aa383648bc7e +size 5102351 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7079a8390bb91dfab2b125d1aca4ee82a13e4860 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05206b9dbc72471e2afc6b659f4f3d6cc2dd9324b645e8eb4f53674a0b6720c +size 34507902 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c876c0811c2863d515b513675752da73209e8348 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c235a59582c806d6773b6514d9ca0f188803c837c60b9de02ccde8d4968e07a2 +size 25758980 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..654c907ee08f41c0f3d65a17ed9ba04202926b5f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a91a956b9480602fd8f1e22a3c6c6065cf64d1a95fbd97df966863565a2a5ce3 +size 7127389 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e12f4b70a94c6d2d85414c733396445071f8dfa3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f5c2ebd70be8d07a1a6e417eb031df5ae1d12ad2a8d1a8997695281bdd8e50 +size 7798668 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ad8d45347b5421f6f9f09ee857721f2c3dd0f9e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50a640deb3a65bb9c1c2036328c84f2159a49baac41678a8d958c441fdb20d23 +size 9400660 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84a61c5e914ef4f1c7080bf5f2b4e140d5d09f5c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:797aca0428e8724d2b96c25677b5a63c0077819c0cb5be2aad3b048b19c3917f +size 5186724 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..775c28b3220d2eee113bc2059583c4a885ac3bde --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc8d4eb5584e749d5a5a3172e56812afe53728b6fff135b5f2f1b54683b5fab +size 42856023 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..483666e19910d70ba510acff84d6ae1617aa019b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:126d6bc30db361d76979f0d98f14a905409f8b512702a0f4ab488bfd168201f1 +size 28084988 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5a205f2d12bd92c7a2757f351783ffc80c0568a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827ae4de2edfef91b47e62317e65dfe6cd941bebab0541155d4405a0155f8cd0 +size 7925326 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1294da1e11fdefc9c9b3bd0a10fd2c6560834620 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4d5cf3de76c67808cbed06a5c51e2307c669f96b6e6d82544ecbbda7884395 +size 8815630 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbcf5419a5c63bd41d924568bad614ad905dfd68 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3c356f58eede4d95dfe484811ef6aec5379526547a01daf12617b9f99158fd9 +size 11862626 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ff0790bc13344d1074430ac8b1869444859b2bc --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73dff4f524fb7cc38b0bcd1e50b9d0d36b2d87ac4a17a91b021b7d2b81e84032 +size 6690618 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e43a1e949ccc12109b2a468b56b484e2a51ac4d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e1a87db713eb03b3f2d58f934405629cf89ec00307426e4aca73248534e4e92 +size 56820372 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63e123d65140aed3a62b7dca6190211ceb22366b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d0d19c15a4203e66347982372a4e05346b4ce7be360c6752209698a74f50d9 +size 37992144 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb776afaa699df856514d8b346d5600ac8720363 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7fa420701eb0fc21121ee9b7499d0ea4012eef46ddd3b82614be99849b9fbae +size 10844736 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db717be216ee1a95495e7a6939544f3ca52dfca5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:186e2c333e0f6b63cc5ab500409bc802c6b11a6c42a2a9b1b8d1b6fbba0dba57 +size 12169298 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a84a941686eea5f661a5edd4e01bb085116e58e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb43059e88bc9a8a638baf36acbdbb86503922e1e55b27a9ae12169f0f0c844 +size 15840572 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ac6311fb1c21ef5b1bd7b733d8d6a0f729c7a49 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fac114a18ffec4ab711f03c8ee46e242ad80ec75769649f553ed950fb713d9e +size 13604690 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8715a0659cd1af51ed3b22399e7897ee4db517c6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caac5f0e5e9d67c9ec08d55673794b754c5b2ca1420f5dcbaf0990012dc07f92 +size 134760626 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81401fe015c1f48e1fd3f559f48e655870a48eab --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7404171c06f98300eb98973ec865cf6bdbe21a7b9fc44c799f4bb84c31ca43fa +size 98879556 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48c3bbf167b44d5706b39ce1fa704026d1fa38a4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1771407c25b90b84260886cb98abe0a726012a355bd88b949610183660146d6 +size 29902009 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5568d7bcd96a7c224bcaa6e1bb57f0fc883d1bbd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d73b5d8d482cc6bdc7437d69460352d562edcc50304a43ad66b684c90c5e95 +size 35294242 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..d0e37e685a4984e0e651a3fffbc11644b96bec0d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbd4cb2a96206afe9fe0e106e9959ef8bb807ca65b2dd2d23262f42b93bde47 +size 7890555 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..064b7d0d43c8f986636589453b3d5eb2e3e98a70 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44b1bb8084348d8d906fea83304e92baf8d52ede545ab6471ea7657e4242769 +size 13634427 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8733228a3835a957912cfb371b78803f439ab547 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6304729365ffdd3739d50be046c06a9733575a0534cfb42ea45702fbd10571c2 +size 135078027 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0ab828b62102ae4277d4c50d108d7ed6d1ad523 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db2cffb424062fc9a000aa460c187a57ead6a7aeef6490790116e36a34c9ee31 +size 99149808 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba4c5ee5121a2a07c77d05cebe2c0b16bd8d041b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc8466f576cc20df95bc8b24f1b4f889b0c0fecd177a8a9b56a47a6f5dad08dd +size 30013854 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ba2277a50a53af407e596c89e1c606e21f18127 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2026a4e14758ac3ef607fba442677d4620fa35fa39c8d21cc1ad17375d32d57 +size 35439020 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..124eb4b08f705dbbf298c84e9e2f51bf71b5fa92 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b98bce042db037a0e3e3f28bbffa3fa40baef91b44c9f323d16256443808433f +size 7917188 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cda6b6eae07c1740a16e3394d71d300c8a1489cd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:893be793810de741d650af400657cf757d7fbcf1f55f3123c5b98e13de2d6457 +size 13711730 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d97f4249080621bf6140ec9169157cd44d63086 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a971dbee500816e8d724c337e8fc8296f40dbf111f54be757af52fc3fc8aed72 +size 135943752 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..953d98e897835e11537967b5198200c934702d83 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226115e470f29216b57c6a7288affdffb20aca49ccfc97d8b48a39d92e1db49a +size 99646292 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f3171dae6630e09a15385402192d32bbf2356de --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3254e4e1c7b6acdb83eaf0f49f5d85558133b809f8056b6ea4bf1643854a5de2 +size 30136335 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0458cec0c80904b80489287d8289277a473054d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e83389ba0c4adcba43787541b7609ac8cfae3c74b1cac9dedc65e91038150f4 +size 35581944 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f97c96a80d6eda711689639a8b8bb4d891e2bd1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd08655c7804979d2d42b0b795b3304a232de32da2e1aec18e7cff1eb7a9c1b9 +size 15079552 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8777e07de9403b4734c8e436ed4539b7ad45a958 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76308bf919382d2e39434fa022215098c31b1e87b35acf237b7f4dbfdfeef8c4 +size 13283194 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e5930dee220740817f40cf6e044463ff1070811 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5010b0ec484264a38d1f64d8648aacab0f9b961c7f95a79ef6f803ed72714e +size 132322512 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49d7b7487b034e19d979990dcc1e13e0900748ef --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b4f0a5190dbb5d3ae70b3b3ac60deb41f83100534dad87e4b1901d4e7b6d88 +size 97320016 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5abfba424e7fa7121fa4d83f6f08d789be9b2901 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1c9699fe288786d06af2da10adc5b93c7ce100c6966d3937ab56ffd48d4fe9e +size 29475168 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8906207569508744d8cfdbc3fc72726c58a2f05a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c44facdd87e2a3c2977ff99254f6c7de72bc0939fb92acf70b1ed18ffe6e6c40 +size 34800340 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7be9a2a0598c5fc3ada871f8b82930756e686644 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a99549ba61b346d16e9a9ad2c2f95172d0ff5b28611338f39ce182a58eb38362 +size 8205489 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f24e5a636b085410866842afd7601f760451be2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef3702f60186750573ab5f8d4c10ec75d0a25c088bbd95338cd746449b7bc2b9 +size 14069208 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78f83e37f24590d7d5c7c12e0f50c2d66f12de01 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32e96d086da40edade9de5a2bad7aada0bb12ef0c0b7c01b29a60df71326d19 +size 139178102 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca65d512e8268d965b8cbcadf5b1d6302268347b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10e625f03d81598a251fc7627d1cd028591defb28a983bab50d6902791656e05 +size 101885548 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b4b333b39cdd5c72126724bacfa8da19c31de8c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d43c9dc81d438b6e505cbcbc26a08fa4bb851cdf5b8ef0ed85919dd27b5f5a95 +size 30857914 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df8818f23e615b60d55625e46df244195de91f0a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6419f96197e541feca9d70e92348263700b084b83a2ff9990ff3dd877cd94cee +size 36472757 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..317da5a9ba9e410d2f8ae3a30566df8f9b28c930 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61fe4219b6fac720b73f8c3f4a133c764c44a749c9d66d8d7da35128b01b50cd +size 993114 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92af5e34951fa71118fce40af5e7d087d718d355 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6f0c5f30d16257a1987d91ee6172c1a01e1eac6995839ee43581b1ecd150d +size 1451961 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c9e0c72b4b3f3fdedf9bd5891793c88f44e0642 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2506541298383c6247f12733d6513465651a406512e28d7ba92cf89953a41da3 +size 1910555 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28b80510274fe6f88f6b4e6ceb59b008dfb2c8c7 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81de2103e6f61e3e79b73cd538702e9a36d45a5263564ed6a701b473d642a1a +size 4735906 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5518643c99c291eef883f173ff7cc6cbc90d765b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2663142397fa1ceff4b27abda8c5433556a9bf3c2fcdf33dfc878041c4cc13be +size 2822355 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82d4469ded716007b4e45a27c2d48bfeb4367d89 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:113a772182ebbc72378a1ac50796aa976ae13a468c4740c5a359a659bd561f92 +size 3278616 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d547a4de266bd3cf593ae5e569aaa2c566c9ecda --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690fbff917364e834bd75674605aa260eadd58013b949c4593a3d5d2ab3599d8 +size 1203113 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..999b27dda32776debd3d6d43656cd6b1764fc1a3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:358aae6218028e109622ff0cb063f605c5242296839b13f879fe10c69eba426c +size 2304108 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..317015a962bf49f39bfd321ca40479272bfb3922 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2fa55e7f40f191a65ac59a9b07fa4434e699df9f86a85ce62864ff2ca6f8afe7 +size 5704710 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97ef66b7a51c58c7f4eab95c8574771a6c8cf96a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d85715fb4caf9b6d5abb1125977206fa91302b9e683a9bd0bf8afcb30c0d0d0 +size 3397829 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e50fbaa307a6d40c414c2f337c4d1b593b7d02d8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daec0ea6b682b11c1edcc06faa95eeeedd7992c768593644b0ef54b9b2778bb5 +size 3945156 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1549ef51d2e1055f0f12e670edb18e0e9b67e859 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd35b2645682802eb3146507acfe89535d227a6d45224039457452e90c13d2f +size 1008260 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40ae2e008aea41c1f109e307906c520ac6f640c7 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5605f50b0565d56f98a5780b37694482a721660037745b04c66f5ffee508fb2 +size 1478706 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2703e0c02d24253ea86ce46fca327361437ae89 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd3296b4b76c5a807b4b71ba49e9a52795e3a5eb08f63eb12b01422e23a35e6c +size 1949766 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b600b48cec3d7e2a6de757b43388409f387ae31e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c46e44329fc885119e27c21a5ec3fbdabc72f60e3c41b9e72982576737ec7e +size 4838662 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e2fdcd4b85ffb0691936160b8fb5fcfaab51559 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:29cfde6fb4c8bd4d22bc97502f88dad68528b8255aa5e31aacae7ad89b0293fc +size 2885876 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79a87892007b46ec2b19a70f1b86af7687ab77cb --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ab77bec5324f3fad50b6329a957f5e8e6a463d18257f3d6e24574a59cc5f98 +size 3354436 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ba13f9aeddf0dae8861d7ffaab849e6fa74b283 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6535f025fbf66e8f4ba9e03bb3115ae9cfaf0b29e99a0af78d740ee96282ee54 +size 1161498 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cbf0aa2096729bd16193d7244318caa8f5be554 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58411e5a43ca9b551c9821dedc9f248166a56bc1060a01d0f83e652c8d445f11 +size 1668635 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76ceb2748a8d550f55ba6272e11d6d4fdcac835b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0d46be097de9a03d61160efa377fe3dd2c6506eb630f81857cebc13b7c869a +size 2176641 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc3c9e55d7b1c8e8030c235594e2b621b8b30966 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e346dadcc892b3b3a26f1fa0a33e1d1c2a1289641da65120aee19450083d85a +size 5366690 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea5a2e70ebbdf398469f5ecddb236bdc4c03a6a1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ed030a800a832de6cd9cd379572487a46bb7588b4b87328d84c85be2e8383c +size 3187300 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..0c5f9c027c49ba2b2574034f8a5b15620bc152a5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec738a74ec9ef3528469fe3a16096eb6b45804217a7bb537461c30e2bfed5fcc +size 3693297 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb66e734e1d5beec70a1dc30e760c44bd50ed202 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bea8214e554ef376a8bbf4489028a7084a98ea6ef31adb6512668a6b94e25d1 +size 1026895 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8af8be1262eb2399bbc08b460762f3b279d77212 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f1f881f1a32798c17d43da1f136064c10b7204cc08fd072fbb07ceef41d6ac6 +size 1503678 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7b56d7f96b44ce22f4b60923f0abf9f7b5c21be --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c419e7b4b38130d45a34d245189d35cb2c8ad00b5e3df3a57a39bbb4da6c9bdb +size 1980774 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2e4e65a3dcf7757855953790768b071a6e732cd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b12b47dfb2e5cc1135195838a00025ef799d2e9ff8a3b21edf1608e8665037 +size 4912586 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..849b8cdb63ad04080e3208c418b0d8a10060e831 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c029e8c7f4647240537fa0b36844bb02ff0cddc81abd54d3132e527c112107 +size 2928833 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd02dd8807f448919625a417daf2ec55ba9e70d1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91fa61715576d1f1ede8ccfcb9c9b5c6a843845dd86fa95303a562622d8393eb +size 3403262 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_0.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37cf51abd9831d984d3c350ff772f52c1ac993a1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b72c7800a07a4b8a6341f96df16ca66f16937fa2986a732ca3a57acaccdaa57 +size 993518 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5f9bd40b8871c84f8dcd2948cee3c1ce4be41af --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd46735c128473f81897359f1241d561af29221f49e3c2ac90243aa4de83755e +size 1447006 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0bf110dbd7864e0803bf612e4d5e883282cb5a38 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcdeb0fe297660116f0a449ed0001b85b5b9841e18c10c89b93d3f3698f8f3c0 +size 1900438 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..576d4766c0ff49c327640e884cbd1ace88e9edc8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:431ef07835058aae22b087efe23922eff0b1a90dec3a31708c897ef65c2c8a27 +size 4700758 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b70bf808c3e6a8d699d7444377ab30722827205 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36db649cbcc51428754a11fbb44650f5300846919bc498012dcf857b7b758553 +size 2800079 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd1f9992deb49599513a4bad17e4f124a610a553 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22e53e46d40a299555a6edc397291c8b3b6afba277bd7e582db207ef6ef8e0d0 +size 3250645 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e0b84f31a5eb3eab3280ac4bd32f009bdbfad10 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8655412b9a42f8604bc975b2360d64f3b46a6129e38e9e556f686cdd8572250f +size 1203513 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_1.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..984870f1d296a9c3a5a95a47eec9b7d1f17f5e52 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06aa8f233a7d3b093816395859c4bf79b3873061a8440e15426b7ce9c78d2e3a +size 2294165 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1e91380bee63f4f86802d55e7c4c7da06483833 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615813ec790a8ba87382f79017b93b3c5ed3fd73e99f29fb5fd8a53a6cf21823 +size 5670142 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ebce69a119caed0e9b4be88e053366c6f56a8ed --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bce4912dfcdf26490fff651f08f5810fb5c31a1d3fe6fd140d232e256e74be +size 3375905 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5beda6fa20d6cdbf10c73ed46af40c9e66db0477 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5c451ef746ca5824cd292bc2b9e765bb025c226ee03b333073594c1f46fcee2 +size 3917622 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f627f9db8bb6d592b73ddc1a14a532cd6d7b508b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bb731141eef88bc5e77d1c3b75b0ce829dd9977fa782af2ce624fdd4cdbe6a6 +size 1008562 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a412e437dbb33e8a1f12571498f802edcc283863 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547ac037939ace1e314d241980e5d800b409b741c36cabd7e1e162b30c59df53 +size 1474101 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..029e696b93195da3131e69eb10df40d492a719b0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d24d4f1ed5e0551ed3bcc0dbc151854dd2a7dbc5d9b7cc37e70ee45bb9df6605 +size 1939985 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ab1c1650a2825d3a076beb33ce078356d10200b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d4c7f1295b6e278bb9e06ca07ace13343bdf13d2a78ef224df11577e94d15ac +size 4804038 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d08edf561714ca09044b39113c006857ae081fab --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcaee2d213c2c385b8e0bf57c846fa033f3fec73c65c149aa353c4ee8573f16 +size 2863935 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d03c1a2d8662d907b69433a3ebe17c5c3c2e3e60 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d9b097a5ebfb4a4b37025ea788b25216284977380a44bd78de30f3b34383fa +size 3326710 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17b873a2cc1ec3a7d47b636f2458f731d37868e4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec210a0915021f8a8f95c763806eb2b463e9205400f817c5498321e5dd1e1f2 +size 1161940 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba2b5a5ad1bd6abc5f756bb69fce273fbe0846ef --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1b2081f17b9ae0b827f879e09a23b786fd7671654ce2da922d709b2230a004 +size 1663779 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1b4f44c27b55e1113123158c2868703c0819826 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e57da405002d5603f5e7169536de07ac02ca3010598094af4f295eeeeed76f93 +size 2166218 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1cc02967ef0152b59f5af9beca14e0e68494ac2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d45da67bf2aafffbd8ceaed977e5eaa68030875ac584943332706ba1b33a3b8d +size 5330658 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fa96c9d1ae3d78c15787a3e0c67dc26d6f039d2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c004850440f8b9bee55ba941b3a5d7c44de0a3aef015daba7a498e16a1db539 +size 3164367 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb38bf72bcf344ae4100bcb26080553b23268574 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60ed9659b1c5675bbe5969802d1b5124614327de55a044e2ae88d3e2554549ba +size 3664588 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1915246fcfd219f71792e7d7a772a9705786cd5a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8686d106b66bdb57348da589a2d466fe96039f4da54de46a3df7d07d1b44ca +size 1027252 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cfdebd5835399ab0713c0af2f0ffa7691937c148 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d34a44837b964f3e26aba4c5ecf18e56cdb13c6effaabf83b32c34dcbc8da2b +size 1499087 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36329851d3fe8357df0b67a51bdb2aa09d609069 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683d6e36284214bf54d7f7430b90254b9593f6bd484bc280e4c153c4070d0393 +size 1970979 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3dcfc1ddf09ae55e286cc280cf4519547b22d213 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c69860e28bae9bf031c3e43daa16a5a046db5e091c33487e9d84473f1715fbc4 +size 4878010 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17aca0d54ff0b4a3f15edb99afbd186389153eb6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19cd08265675c54fa7c5b1b63796fe0432e78621fefa7e2211c7e6759ff2a86 +size 2906840 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..686ba885d61e9aee3ece97d4213479742405f578 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1410bbfd2455fa9615d0de5813e0babf6b4d01eff66d8f0d20d5cb53fbeb0d1 +size 3375504 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d09fc3f55d890cf26915e7261132c60da402e90f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719c8195c8c7b703c6e4338a8fa02e4f25c3aac4f121b323e56f32a620f11756 +size 1169175 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54eeac5141aec75cb9ef8c1f9f448212886a8f96 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9544277a8fad1a92787bf70c622f38bc8185376b2f9e9dc20be3ec084245fbab +size 1698559 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7ee712b859603433ff88b14f2db4d79574fcd56 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b121b35b8615158b2ef1fb84b246248fe816d7f1837fb9e05d351df0e05d7c12 +size 2217540 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..407664af22aee0af4433b1d910eef364642bc853 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51b4eac3de63938c12521c44109fddfd67cea80a2252a602cedbfda2f6ba250d +size 5461672 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b19eb80fcfdaf9e388838b64fb617abb68b1c68c --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4b8636b03ec0e1d638c6c7f2c3b035c215b6be653c797276541655a4d9e14b7 +size 3248344 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af121c9a9e1cea5b77b9e24fb9fb165fdb76fa96 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5578e532a4c674b3ab076026813204a67bb6de464ea2cd4790f56bf68828f84 +size 3776901 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c90bfa3764476fbedfb463777eebf713e129ae73 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e2b461841b729dceaf438eafff4762ec1097bb4b4f36643cce19a1c57963e7 +size 1421159 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f245b4428d9c62c747d4fc14ed350e538a669cc7 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f0277f730631c38176c93a3f3d5fc2905f3d981877bf796d4624a9f0934cc9 +size 2061857 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce95c828d3d75be63e171b7c006579c42c72eda1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1999ba16abf646eb55a984bfca87aa57e8b4876c0ea2ec310318b82a199cc473 +size 2689259 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f0bac27b4b545e378872fc366fffb957c1079275 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae2a61eb6d24eb1f9bbf0470becd70c18c351fcb3a001d28a88323e5b503be0 +size 6623712 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2cacf8e3a4357d3fbe161100ceefbfd79f0ee345 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff55f937b270bdefb8938c30f3866457be291a50435c25069c1cf859865363d3 +size 3938647 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0656a5939e08b7a30c87ef48a034792281e0b07d --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf88e07839d995179ca4618316b7331591a5b039421d29ca8e57548309455fd +size 4576548 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49222b63d88af0ad5fd933ed2782601c254d8cb0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d3ded4e6069ad092f948e537f1f840dd1bfb9b90e26d6accbf7170c230f14e6 +size 1186774 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79b4df653a29d49723aba9fd8350c2f9fa03ec94 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d69a97f688c1e61341f6c81cfd995f2a7fa9533d7c0374b70e2f4ca513826b2 +size 1730808 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..039d9c69981cd216244a3cf59b583ffbeda5dd11 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ecacb2a29e966975683b4ee73e6ff1c9e0431d385c8ce66f1d6095fc7b9790c +size 2264724 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32b4ec5613f47d1d02a90dccbb6254e2c0b62e20 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c21c2122f46e0129498c31f1729c23cce926f9b85e92f850c74ca48f1542a701 +size 5584996 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69eacc6c87dc95e37fb1ad455234774e0a19c25d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09510cf22c2c8921c43ee0dfd066d09483c06bedee99a4bcd414c91b4b369c94 +size 3324606 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87f5ea7bc9c8bde74db2cdc5891e1764bb359816 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3e8d22dbb2326499b628848cead634e86d5857e00a3526602188fc83686fb0 +size 3867545 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ee98d487e8be8942a33719cf64d8a83acff61a5 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9181dda4a8c025f12e1b6b5ec3a56d3d36960dc854835c80caceebbb83629ca +size 1371155 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..692fe59a1a2a29b5d5cff000d47de2a4f220ef1b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9131619989dfb597396e0ae7df4e1e844f81205529c35b43a380baea2f8853 +size 1959139 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32e4eb58bfc5e53dbaaa43eb807fbf605384537d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9b6627cd8a1c544a8deda86e7f6abbc50575b9c2d8264024e7c57f16af91e9 +size 2537440 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aeccb135af95754e58c153daaae592a4fae0d417 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:054110c79065f7ed62473c0944e3216912deb121d30776959f293fca16210642 +size 6220722 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..425f4af39518a531b3b28516876fbca929f04592 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de24cec470cffc25c786fe54d5ba23a6be9bff6c7590445f338c841b519e0154 +size 3687586 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81c1d160cdb28bb9bf379aaa9fcdf94c64dee1d5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d36dfe3a8c980c22e6e1b03cb105203a5291b4b03bd4f209568d999f933e5690 +size 4275753 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc86932f5b278a24677e5d947a0379e3d9f0cb1b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b7e78a847e8c50b08735ded3d80fc88ddbbd234939272f0d679d619032ae1b1 +size 1209420 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..733fe939611a182ea013d4d8356c45ccf6b143c9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64fe26512eb40aeb3575b39d75deadc251fea6837a0d8b86eb5195b0e528c6d7 +size 1760786 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..767ba143c08032b18ae9fe0ea7bf2c5df608eee6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a79fc75824a0715b92cb90f4b1d70ef030d8d2c8f89c3c82a2102eb4cec60e67 +size 2301855 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65ae7bd6ebeef05c31fdeabef6bdaee131d9a831 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b60cb885a626e43e7fa1ee6c0d1a742606f413605d222ee108c7a45a266cf04 +size 5673656 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d27f1ebd97408b8454504c9b4fc7d2e4939cc403 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:246219dc03f503080f1c409612d528b83a038037078e1775196d895b5b9ce046 +size 3376059 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da953b49c177e0450939eb3244ce45059532c434 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c76525e7e39cb82ca0534805116981ff9a07002058d3954cfd6895a7ff58ede +size 3926169 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97ababdc578acec1ec3e60170a2bfa14c2b3c08a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb4562a58131f6e1018cc95ba8885e01d04f2feca41f5bc38d995e9bdf713392 +size 1216777 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34696e78cbc9db72094a3656c678b11727a2359d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2b614cc97506bf1b1046c8923804cb7db3f8e210eb12c1b0c9f95eb1289fdbe8 +size 1670632 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d00a63aab4ebe65092e8dafc3af29f7b4bf82392 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1905b5b0e77253a39bad619a3aa2ad219cc1e1a2639017c6d64f4c38fdac2f13 +size 4240168 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4903bd66898d47ad299cd4271a16a6052a08d366 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3867d09a316aec6da08b0ebaac2f5d46ec139c23f9f48312303faa3bcab442d5 +size 2577615 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3dcfa11db1f7d97d7e2f5c05916f4fc4a244273c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:910bbf24472c88aeb673b4d6d67f6623b323d8cbc059de8200f017b67d2035ba +size 3027517 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..704c87fa709b9b077c4ea85855090288c30acfb4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6052ff0db963e072b962741f0be78c8f664f00d8398eb31e09a16f03ee46aa +size 3479076 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3a2aecf9313d9acc852e98a1645e0009216a30b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cd4717241002004224354fde005e57376bd96fb26d33d0468797c9f100a9575 +size 1458158 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05db0bcfbd8854c8b7a0cb1a100b49e4cf35e898 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3a8a34768e9d68b2868929435aa7d5d704a184969abf73fdc9b54a6d4ed45bf +size 1960814 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e14fc2c90d29bb79d9410c361081c64cd115ac76 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe476f521f10897e405fec8bb05d0b735636cbc22db94d25708d469b8ff7445 +size 4914494 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd6996dfb9cfa291b933829b0a59c49b18b0fea8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaef1ea08bf782a1045a87658e5aa5bd3434523bd29537454c9cf7afac4cda89 +size 2963045 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54170f92a370c2ee7aa4be73b4be8c97dab08213 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e290857730db124171e1896594d6d23358b46a050959eaa2d119eb630104ff +size 3460909 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58fc5e0b41ab66a7b192ace26919c65704a9d686 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ef4ded57a1391c065d7369c6abba0d26e2792ad1ed7fffbd3631b30498262a +size 3960011 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6bfbe343f5e35ea0a15e6c9b572f310bfeeddd7 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe59bfc4b2cb48cd9c55046c42228f1e65ca9fbb25bcc8d04eb5db4275a19eeb +size 1505611 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c895d6e5a117125821b7605ff9a04783c0f59926 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:293626570b53adf2700b811e807c9b47e219992be9c896cb86bd1d1e551e4ff4 +size 2033023 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ebbc7d19fcc0b26430bba84167eb9d84ac67301 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6e064b3fd0f713bcee582be98284665bf57558e1cfb4d259996051fd556361 +size 5109428 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..5cd3615243941454171d9dd3106703462a5b05ba --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67fec44f0fc342eb136ce015e85e2a6bb5ce9d19eb4171d8f646ce5b74cc644c +size 3084887 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0168ceb35f70514d1941e6233f1e85a7462cc30 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4676f6fa58172a9041e3cd400881d17e00e1b55a346261046e4d2b99f7a0e7 +size 3607270 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d89f168213fc62892597819695c124804bf9217 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aaa81971706d3803ab4c000d75e2ef63de20ed385fe751b2a44cef2f8d14b74 +size 4131134 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ca54fbf21d8ecd2fbe9f9d3fc87659962f72384 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f759f9f9812c73bcaf9aaa11019cc432b92bd8ccedba6d26d7c5f4e72177ff +size 1202714 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9aa9b0d259d8d33c594bc77c01c78829d4a2c861 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2f0920b71780c5cbcf559bd4e910e28d9068b60beca8ece48eff72e280bccb2 +size 1638992 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4765807a40f73d5af0321fbd648466e5da01e437 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938a043d856fe52cfdfe8d8f4437e8d270a71bbf86afa77b9ecafce4b7405a50 +size 4141728 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c376020df0421d232a0d7900aa9ecd2a37d7c087 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:550f67a242ee68f158d3c1751580d8fcab2f23b5743669935f15a60be4b161eb +size 2510815 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5639cf32fc2ac8a186dd93f1ffb33105a0adca6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8228553aaabd782ac93fa124b9a48cf0071f67ae52bdf61deb51d8fdfa35c78a +size 2943137 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..503b339c04736446b77cc1ba5f707922ea2412a1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a41909854ecd4745bdc426f0a65aa7d57371ffa3b5ba7a17fc02ed48cfd87a +size 3377116 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5387e92314214f1f49a606b997f23b44d2259676 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4433b5f16993765a682d9784d5b0cb92e411d7f9fde123131f6d18deaf1bf7dc +size 1187362 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0b9d9cc5cba607e47e3828dd30afd71f2909bdd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ffaff526f21b2279bd86d44c0b459a72607dcbe53e0722e7ac78da8d646c2e0 +size 1557235 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e7ec70c7c6084a50cdde5518e22ecae578d0e48 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0990e1725c136346ff19dd4ff886889f28cf7a5eae5f15687ff14591c4bfba77 +size 3843234 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c57430b38994bd5503c856c3e1e8138cbb744a10 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b1f1f5ea9b5e175bff6e7ce6feec9df1ec1c01945b03aa5b678a2e946752d1d +size 2294841 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2df8d6b2252157d4e7ad104699c41ec99effa1cf --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898c08909654048bdd3dd96f0abf380636554b851305712bb85a329399b91ac9 +size 2660279 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec804183bbd9e531035711fa9e3e8c18bbd09398 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00bfcce38eb12463a14ebade1d43c0014368894078dee773e0efa5667497bc52 +size 3027008 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07b8b6579e1e71924f23858c632220526537734d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa25e6f1e0fde11b88d41c2702564a8fab702723030a45b0b512097885febc8 +size 2351241 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1d95af0521d20bad49cde991b6a91795350a230 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53491516cffd5f527c1793f475585642663037ff451274de25fec5cd34a56792 +size 3174164 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..257766eb276285f4517199e3edc135487fe2ee05 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:391911f330598540d4f0693a27362d8c878b1c30b70c23fe724780cfe3488b22 +size 8017892 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..742a56175815335952fc0835ec3bea006eb2f4e3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96844769df6ce53a232c4f76177f204152a16622defa76be67eed007df7b5e9f +size 4831612 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..717c6280b4f0ab8a3e505452e4f3993db4f505fb --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e7788e2c803146b8abeb2ea1e86173fdb726d37760a1f4ed0b6a68b9546b17d +size 5662356 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e2c6c0878ab9a85977a1f6d8193eb497a5f88ae --- 
/dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2c3a60da1d4f5efe945464a34d25390dc5e13ca7f0f4b2f4e515b5c3866cd4 +size 6494729 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b1192f830b4e861fd08138a5098c1377850f2b6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8ca94616cb758761c52c498e1e9f2b22044abd02c5494c11d8dd4ba9b2d0da1 +size 2745992 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..790e9c65e52ceb14a8dd569bff69edcf06ff1cbf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a279aaaed725a4083940fd29b3b586538653c1f88f4fd4e00a380d0caa40eb9 +size 3649219 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ae543c05d4b84ccb481e09629f83dd84cc86cb1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c485712d6842cc87128380c911880a4320c98a3196b3be452caffd9f084f5a83 +size 9132992 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1510b884c179b3ff88b453252322452eb7711f5d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6304a6b4c78d52996e2559afec08f309cb14e571bc7ec5dd3e94953fe9368b8a +size 5469546 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99dbe488ead3914bf7e54cb8bcd4f5090fe0e58a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a920354a9fca37abed0c205c30ba9f1bbf9064cd03baf00cd8c3ea6a050f931 +size 6381679 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..487be34b218a662e3f787652e678193a02c53706 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cee1a04f94d003b642f4c4897f6436260fce672096966824991c882ca808c21 +size 7294026 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..6734c37f8afb15a8202b3a3a4cc31ee5697793d9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7949cf51b0c543397c7ec4fb2c07108bab1bff37021b9364c2199ce9106769d5 +size 2843647 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0ddd0ed0cb9f555c113d66c89c7b3cefefbf259 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebedd6ed04765453762924f40ca985cc2c4bbe44bfdecb687ca1262f8214ee6c +size 3796696 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd40cb2d5e031dfc14ce86622d50cbbdfc95d00a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4520f2f304f4c88221c35c5f47930914e91a7861b78d31d8e2c9fde162a5d8fe +size 9529380 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74d6bb82d918ffbf4629370748cf5a1484836d55 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4ba2345289f45a5c3930c673df4faa41c53642e3c301b65a81fa39c6d18972 +size 5717597 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..635145a74ac8d9c7611f5717a3acef5f97f9b389 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b902d08e8d48bfffa44aa372aaac2e5dc4ce4fa9c3fb23398ce8dfbd8e2cf9e +size 6679554 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ac99ac2e09aef254cd9eaff2753ecf68edabb03 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:267f2d33b1795bdc14f46ee25b05486ded3bb59936bf9d2de07ba31e14061215 +size 7641858 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9589a817b3a732d6e07fe6bf0c5fd048758a449 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f18d10cef02afc7665c3d73775331f4d88907cc5176141e46a5d0ca19c930c05 +size 2322732 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b129670226df889e8d7e3b4619cf2d2c25bffcf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510c6d6c8d81401d4dd2d8a0c845825e76848c968785ebbacaaf0b9941d3f34b +size 3110021 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..952a2cc4287797839e699df4aade3e819c422024 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28a13ce1f40faca42a9eeebfcc2cd3e9fddfbcd67e8a0d93923f2fe0d7fc8375 +size 7818320 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef7d112f3081b5a467d10ab404d98357ba4c6835 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa41ecdef6ac78f088278faf7395ff7447529a3ffb7e355f87bcee2b1f255d92 +size 4696184 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12e8f7ce0d2f50f713f018c5671f374e91093e93 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef53c3c9705b7e066390458887217a5be66c8cad5d3f02b02d967ee3d0447e2f +size 5491290 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..670ec9fc473f75e7b7b29a662eda007b43b46870 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db87b427b28c45eabf9dcfc0735ca979cc08c1ed895e6fc06ee05f60391e132b +size 6288023 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e596de321aacab737a173fcd47c10daab2c2b0f9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73273dce733643f8b21948a688906b0e4b5a7ddd22936b6074727855366bb195 +size 2197211 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5468a365fadd7e3ef00a8ce37a0b7b00bb5e0384 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:17eb8759d91e1576fd4614f6089462ac2bd5fb90cb25ddf36d159680334872db +size 2831504 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2cde9ef36138414d73f628e29c180048d1f3d73 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7411eb71dbe556b3eac0e47e5705ceeb4c961e6c0e713cd6a1431f6e903668a +size 6960192 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3742e9be8c6bede453bed142259f48c24398904d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29043f9770d5a9d15b6a0b727c2a6a516e069545b5a01ff50dcec27115456eb +size 4115043 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26001bb3099c2863f0743b8dbd847d1ec6075c21 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d8f5d50ae5e09a6d277e56836071a194ef1efc57a9011da88ed718874851db8 +size 4758750 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67e5da0d50010c9d7c4893679c51cce13a35572f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97d43880148cc77eb955bb390d0ec478e4178e80b7adb2f9f77ace753404117e +size 5402595 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6f0414e32c96d1d4baa8f0b680edd25be1ca6c4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d2f1672fdb38ed88d5c4e03b5fcdf77b29392ab57335edc6e3e750a2781e80 +size 3642984 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc8c1e73752df63df41dd43d7b6f04ba66453537 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba853d4e22643cd858a3a656c42d6b3fc9997160da779d7aae507c3f82aab647 +size 5657596 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fa46a690e69c68ba78450031029cf9341d3343a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38f6519550e1de0313adb3b9c84539b4badf20b68b540fe85f2a220c87cb445d +size 23083911 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2fb8f42afd42b9a8532d4aeb9ac88a27821edf5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a4fb2e11c7dc091029ba7fd3c71357a1aef291e35280e737b8bcededf25fbc +size 19457786 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20bc002b4a0d676aa7f14545b9079cfdd9ecc4e1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd684bbad9e4b33618c1099bbe4c5e99d4618f7626557f5cdee833c6aa640b59 +size 11769664 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96993248e5ded7743a3a6e02a115ea2a1f3999ed --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8d49aaa197828e7f46c64fb0eefcbfc129dc3eee0db8b8965f1a17f926cec2f +size 13791213 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe9628214092f3193a3b5bb818c1a5245eca6faf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c208231beb22feca4827b120cb3e39c9abf97f6287a1d2c2f07db82cf27b611b +size 3984675 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f16ae277b2f7d5d65cae3317ba7d4a29a3585665 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e1207705fcc5604f4ebd6f6e3e47d80a57473c5a62f9091aefc6e60710fe24f +size 6168092 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18b5a5eeb16b662b3c64fb4e0553608402bf9e45 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:008f2da420b2d3cfc210e7a057556b3b0539006977ad0f7eee7d110305b40374 +size 33487048 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b752d86bc94ee169c2ba8643a460145324c5907 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d57da4a8204bae2a1cdb2cbe68ed8c7e6d533043b8c844d2d0bc874373ee331 +size 21146356 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_4.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f84aaa2f0f07aaabbe9f6cce9f15595c0495291 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eda96d101f6204c5f3b5cbd84be36b77b783ae17f64e7eda518cde875f00271d +size 12781167 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1851c88cc4bb9a043f738cf0d805b1597ee3d86b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5feb2588cf77936cd903f1335c03d3acd857d534df2174fed90014257dd83b26 +size 14970076 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..475a2601271d69a2608d2d2b078dfe1912faea3f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db765113e5bfeacf774c20ff15295cb944d77bbc72db9aadcb64840730889d77 +size 6260954 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54e8d71e3aa9c02a5016abbe130090cfd170f96c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51b6bfd334c8f340fff26d5502030bea49ba012fb13e3337f139ac61eb87d5d4 +size 34003928 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2543ab50644814163927c1133c91b2d2f0ad59ec --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53efd4eed9c56007eef5ebb93672da088517e968c3e63cac13a26cb31f4e6dcb +size 21476274 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ba2bacc8bc394d232b5918a270147ba578f84e9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5964c82363d14c09eed12e0c233478fd9bc848fb7e6155596a775d6afbc8e3a1 +size 12981928 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..ff8fad1d8c618ac728cb1b98ac29680b124ef349 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c725094f97e73a7eceb7af47ec42784d323e210c3c041bba9cce83182c893d21 +size 15206723 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..196af27a8ec45255feb44d306ac555b96065f164 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8db3b5f7a75ebb026b0968f52a416d008b86d8474539942b77cbb7f48af4cf +size 3664266 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..813bafea5bbfd1d3d1dffed0bf405186ef1c9e85 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8724637fad67df4300e7a567a251bf7035bb49bbd6adf94563e07ba849c0d97 +size 5687714 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7f2bc611d2071164dcd45c5bd067090780997a0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d445b259034ce8c8f2a1c202ddfbd216ba58ce7fcaef4a86319197037e18cdbd +size 23198022 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52c5b4b52c75e9a299234d9c33912653bd0faecd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd33a692db4ba161cba5da6d0cfbbf70f91b3a04baeeba046fa978a44cffdc96 +size 19550032 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13c3f49d95d31952c0f0da564766b15e53e2d9f6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92607560ae2706138ecb95d0d59305e14955d93266c405e667dd07fb3a781fb7 +size 11823903 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6863b5a8657f7e4b793651319a240afe9bc7d28d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47c89d4234cf78d258f87d9f1d2132941f8ec6aedc23de594a8edd2efd546166 +size 13853777 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89136f80c40a3d1fc707b122a61ba98a41ee025a --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4808382f6fe46bd7f10b0c0758dd26b81c37070a5f806732e2122315846292d4 +size 3861409 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21db2eb95f567fe2f6f48f659af117a48bb81039 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6063dade196e238b529e8f8f190d8118e73f825825a287f724d1715d5d036b3d +size 5990521 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38ec1971a2690dd8e31f9a14893885e83eee7043 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c05c5653fabcd56801b52a4fb72cd574bbe06c6c55ceb4f99841f538e8f439b0 +size 24425328 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..57e0f8a3e7da85a55404ef41a701ecfde2ae20f4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02ccc50b6e0f98f4957c4c66df8eb6172c05005e757c51b14d704576441d6feb +size 20579584 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c741050e984dd8dcf2c17631a590245bd5b32b1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b881512166b05d5687b21014763d551f4e3ee9b37823bb66c40db5977abfb736 +size 12444446 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ef71d4971626be67cb9c503e98a8599291ef964 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297253446dc9e31e40a583b22fb5f37ce2564df09360239e770a84d9a4be6448 +size 14580027 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d96a99663a32fc932eaeeccacc7462560e2e5877 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d6126480f6c7250b854a223be1446b3fbc839e24f5a67e46dea97a3fc38418 +size 55148 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0e74af0809a0864c383172f2f4131ba2697286c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:59d4fd1b4f1bdb49e047a97da2ef7a6a33c7b3793eebb54e40fd3e219417d37a +size 77971 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e40f0e2490524065a2fe8360dd744b38d7cbe29d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f9bd9cd9b71c38dd7db81730e52e438bc882bd178bbcf5885cb96c85e6e785 +size 99574 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e528b1bea7ceb277e2f1917cb3f3435945a6abdf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c52ecb02deca8b69d17f2249730bfb903e9be2bf279f6f4009cdcaa13b09df2 +size 120727 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eed5323b9b4d3e49937d6b01020033545b6e63a4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74d6d37aea7293cf0f9b0b26d009c100da337d8ff61c382265d2bd569842ad2 +size 142847 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c16e6d7951850ab644b7b6657372eda1179197f4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0609b94c0f2b869d9922fe7adf27e349fc33f179d9d6b2366534219530c391 +size 163681 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7457622d788b9624906e845167dfa27a3588fc47 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c609fbbef8b2f6d9a75eecacef06ec55dee8e5254f645f9939e9edf86c37c84 +size 120811 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..541ef7e8ac95e3e351de4fe7d06012e14f8c32a8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9af11be38c6f5f74afab94f965e817cd38976a1db4e75c10d61f61b39666fc9 +size 147033 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99b97120ed4d5239989b52c9b35c2456313d293b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa50b890611f372b1c3826300c5eafb02f8bed0f1a528a7fe720fc9f0ac9bd1d +size 174216 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bb7cfb217941eb047c954d1d305f6d227814ed54 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5371712d83096eca11f78767d93fa6a5c15c37d2d4c8a759c6a8e72d5c91058 +size 200119 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed05c6b044c73c31d1873cbb583429a8730aea28 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a57977217d93610e7c0474deba3226d16af40b0838e18bd199f2febc9c660c0 +size 56326 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..511522e973f2835af1b9bc196180a35e5550d8d2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d575a912a2f729762d58c3e3c7a5784207b6a0862f32242b8940a008f809bb0a +size 79784 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db6bfffa73cdb625349c1792c91848bca2431efd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b79f83156feb6eaaeb1931b711994b6012009f9f3c915a3ebef4b007856daf +size 102072 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc185f69105cdc0631cc3ff56e5d46168b1354df --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca65ab17385e2d36b84371bf3fa0fd085ac2218fb7c4e7518ec07e0d74fc3d54 +size 123886 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_4.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..ffda5a5ffbe787982260f4e2e74368dbed8df470 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:154b254177a6af0a3767a6f23e997e73ee7837b5d913b1030c194f03e5832416 +size 146662 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4e3b9b92487663a773dd40c5259e9f447076725 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6796936897bc1b2b30286976bdeea1dde825127f0fb684f757009f3acd0e2b6 +size 168152 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..012f765dfbd468825ac77d18552a64ffc1b7b50b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a85393c8775db66ef95bd6a202dd01f5dde86c855eadfd8146df5653e73efd1 +size 63987 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..359574cbe38cafff662442ac82adc74eca6d9ab9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae71f03468e292dd477a7b0ae2cbc114794347c14c8b24b2bb874d06a111f761 +size 89654 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c26d98c05c8a3cfd1c84c525c212570ba2e34ab7 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5f631b5ffce8aa325d140de04d3134bc274b28edfe6e8f33213dee3a145891 +size 114133 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1cd8080d746a40ccba1085099b230a028f9d9c81 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92fdb91b28a98d8f80de1ba8de830cb256439605df3b4ea36d47189d5e296112 +size 138152 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..814322a170e0a02aab99f1f555ee4e296ffc6a99 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a6842b089292a4c57724828730aaeaf9be96daede2645b699da18096251a8c3 +size 163154 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5110579481bcb4614e783c9f74d49271a294c936 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b0815b33eeef434b6a46ca3a6403e054d3bfdc3973db9a1e0a619fbf6f0c0fc +size 186824 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69394380407ac5c85597872595496457a267d30c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:121d5b3273f675eae5e4487185af600ab4afb41e0e7e4180ddecbc3a033e6ec1 +size 57326 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c18e9742e75c44224c86194d0c221a1b67fff989 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd1665abce524cc6bc721400352c5dbdb797840a3306f75bf378642fad27b4ed +size 81124 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..600114fef7e0d82b99b02e644841879c1e86ce24 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0dd70bcb51c29188699dfb1e3e56d838e470f84ac589fbadd164ec799dadec0 +size 103749 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c53292677bcc2e5b277d1d7b60ba6e9af49de571 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37feb31f16e5a5014ba251b71de0b5e342fc0c0b5352ed4252a6f11786c5950d +size 125905 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54ce5d3de529c4ee42c0f20027736cd2e331275f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9940610fba72ede44e43d468d72de73229bfbd9195e2917b4e972345f37763 +size 149022 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b4c5de86e06308aaab8d9080697de0f49ed957e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd81ba1ec17f05d48f70c115a01dc2f9a1b4625945ff38be1207e21cdc5f3d0 +size 170845 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e843c651b10fcb733af3a1bf03130fd27d563086 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be853833fdc66a1fe02c70e628988b5a84e8cd0074e378d4de22c804771da4e +size 92237 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28cc9e135b3c79d4329499025966a9cd27a7239e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b2f497c669d13514d6874332c16cfd4017bd58c0ed53c697f29afd4b78574a3 +size 111526 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ca18c727177292f4f64a2d0dfcba754886e4d42 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf13e6ddb34d91b39854c3ed70d9d6aac6bff6fb266f3316239fb9defe3616a +size 132085 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cbf491edc25ad1c7bcbc059e28794bddf1784087 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30de9470eae2b52c436ea6a798c805b66e6a287c65e95b66fe606d47c7df86a +size 152234 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4fafea89758347bd4e1b37ad5523b350ff780806 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5dda2797dffda5d72d30790519d38d4fa060100b31c5620a1deee6dbd84430e +size 172053 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10c943cd95d821fbbae04411b97827537b2fe01f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a47ea34f123b5c7f4aa66fcb69ca4d75abca7c31779d1121042c2d5b1c7921f3 +size 191971 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..167013f2eba69997170d8819c6ce2290f70e7ae0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774e1e7f60a8f125e73ef9c2b2d9ef5729d5c212f3dece485b872e992167b96b +size 87856 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..14489f3bcc3e2c7545d93eaa7afd1f72b63fed89 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef616b52312d5f6b739fd629306323975d9bade81cd0e1473509ee4f72ef3dd +size 105087 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5656f0d9324714cd07d39e9d24ec175e53a370c9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc5127968c9be7d7a75560fad17e8873329dc74dce873c46e68faac491414c3 +size 123406 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9775e1d36b8ecaa4e61e3717b56e3a02ea604a5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3701e7a00f9734b6b072697a6bb9e66e7a00d6f17c230329aa343d6f5c5e47a1 +size 141425 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..067e0e4362c61c42b92c777026fe453b3c51314b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ca4ff2dabd6bc28e2af054f9bb92084c4e534048d9c50c1524827376d7efc7 +size 159084 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0275d838ed6a522c81cf045b307a69824e3b4647 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4508d7d9b2e3f3355358d014f96b61da6c1cfe3e93db7a90ef69714c83d6f6d8 +size 176870 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9effb96cc2a02bb4dfecb428ade189230340250c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b0ad510f132f5d4cbae852270ddecedc78a9fd923ce0ad9a0cb87fd1f4f467e +size 85168 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..196025e39f90f885a60c0967f79e028aca3fa285 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:635860ddbf5cdf8c30093864ed8d2edfe116f4144c11b0d95de5ab806f271c54 +size 101250 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da184e13ba3831c29f584446860ddd9511a45210 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cba328565db1809c863eaaa50a1663ae8bd7b23fb3fffb37e1906ea20ee1b9ae +size 118456 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..807339c9751fd7992839202b6bde2831dc00e552 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d7066d8229632d47049c09d1dadf0a4abda0cee5b3b48ee5f7b2aacb2fa8cfd +size 135382 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdf080c6e3073e17f07be6dc2a0b1e23e58aa53b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb99780c23a4f1b7adf6fa50ab46c940e454f3647147174fa03241e867273941 +size 151923 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a00654dd5db2ee53d80ce9fd294799158c545cfc --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ec8b268e78b28d4bc041a729bf2d15fed3f513c5b0c80e3eba684d1917cb2f +size 168709 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02c0b5286c20c496fc3938036a0f347ba68627de --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86469a758daf827ca2af709e224bfd6d19c7c166b5f6073e7d64f6fee8af3ea +size 96747 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9452fab74ee245028c82172373e0b6309172dda1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6983e417e29a0b59b8db8792f0d82d070b4b9501f3a7a4d20468c1329a896bfc +size 118281 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9cad9dd9d2230fdd9792410d1f7519d69457918 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a5f85358cc8714a1b8a1e4c6056868371c4d89dc246f57d1e3d64ae7be84b1 +size 140919 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cf6b9b43e82b1cd8736b3e6b562ee8740203f38 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da9c6a53180fff453ab772653c8358310f5434ae1b113532a556f049a72ee12f +size 163216 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cafcb9b8e35730c24ac38fde035a3f439afc348e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74b6ed771f5a21629df5fcc26841fe52582c80dfbafb73ceb1085fd839f29b9e +size 185166 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6568ae3701e508e9d5b584822d586ac1b3850d12 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed5a2d26f10ed18b4d4a037a03a577d1e4126d8bfb21ebb33f52f2239779cba +size 207261 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f24ad7650a30e2c992d63f713b7240fdcbf20cd4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa1cc27706e13e23dab9d42cf5cee61af15a3a9b1431bcc2446596a61747dd1 +size 95748 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d84501e51ffe907dd2da5ccf7ca8961b292d6ef6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f613c4a8a25afedaddb0549ac1bcf35d79c5a84881b96b077de5d26d6acada16 +size 115655 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3d56761c2f32f022d85f99cae9232acc0300c1d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39374a9689657a08be5a19df1f75076a88adbaf9a41a751ad5145afe6f5c3e55 +size 136664 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68f02317235c825e7facffa43a542baf656dc6e2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d58afe400e8714b3f498f330340f234e652f439bcd9e974a9a9e1e23293fe21 +size 157383 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c49a7b9677ad6a3a9fe2a1d71d84bef56b000b21 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bde237c17b5528ab54e15a4ecb7562afdaca204839a4808efff96e29d74d22 
+size 177724 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21a66915c862a9ca957de3deb11972011f9cde0b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed783e1886d69c74138a3ea5757a604703a9b3c5b2991d49fc013bd95907cb9 +size 198309 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b09b15c722acd6042a7a6fb7c818fe2b4b1415d2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a786db881d1dce9ab75c2ba3660b11dd600b77b543d0780527e5ee56aaa340c +size 3322900 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb7cc80c52febc66b18bc9b707bdb7030b41a65d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93df70db4a7aea686297582c25ef514e10cf29df69eff0f2b5a66372ffc316f9 +size 4369815 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3176bc4c95f29439f7f51c8e245c7a81052530b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb6cc8e2ce6770399ea7cae7a775d930332c5fea23c162d4922ec0233533f70 +size 15985617 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82d98f166f461449f31e3c0dadedf983eccff4ac --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a01fa5e77798ec1660c167e7a2036b78de1745f9854e3f12b76bcd2ac5efd01a +size 18824067 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db34c92ba0cf812f0c322d3e1a665e28a0d81e11 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430bf072284518578ebf319e665cfa148d4704073ea99655994c8bb64cb22058 +size 7193069 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c7d2e66929a715fa8be295613f0d825492042a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7de432057ae8419416de5f01bbbf01ecd1579351fc1d96a1180f96115951158b +size 8132660 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ff22a0104ce7f0d041792bc1621e3e354a7df94 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82d5adef2f380765cc430c91975067cbf8b62888b71a872034f6001c9fc45283 +size 3406001 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..969f5e95e39535af1828a064f267f51472b6c62c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48dc78650dde321aab61430c76f5de1ae01b7c0390d3e27716a39573a754948a +size 4350409 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cee2a0afa3aa5647e6c5bbb43545a4416b26b847 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f67c27a51b5e944a845a0b8ab1b4b811b1dc81a53aa3b298b1d403369447b470 +size 15794175 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7492a51d2cc414c1d0e0db63bbc851f140beccd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02db2cd8f97f4fddcd9f1f1e43d7b91bbf725b394434e51f6ad7eb83303e16dd +size 18521397 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..190f215efbb15737a9c0d44415d8d1e5353aa7f1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:722699b7c5f65f14ed6766304809b4e08e20cb2324e55ef055afc3fcefb3af6c +size 7081664 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22939316cd5706e6b5dc4469eab41a47ade26eed --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3b07b72ad1bd7a2ec56021551e6b217ce59e725e200140ec8c2bf673cc1ec4 +size 7999088 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c223313788557ef420fe19a70a01d015fa7d9dfa --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3153b41fbf99e5ae40a46d8d9dc4737342b0716ba0d5cccc22d473198b2ed4fb +size 3597189 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ceeecca51ee87f688df68e3ceddb19969b7f3286 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f6e150d92e7cde86263c09f24f63a6ee457a17f7c5dfe4586e825f60a3b731 +size 4720877 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3f2761584cf25571e3fe9eeae2042af355f705f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bead86783f6a8e17dd4c1fb4ee29b74bcf3f42e6f56d03f418204de20a166f61 +size 17519124 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9917bd1e75e755ebc383b5dbdb38fa0402d10595 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a3cb4b76f8caa6f7b1dc0c63768e3c2426132044b86b6eb738f2a41fdf51315 +size 6877145 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dce9642eddc257b22789d6944bc1de47eeff0b79 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb2c11202ab0d04e15fca7e996ca4350008f3f88a897cc3e0b88a1e3940aa24 +size 7917019 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb29400973f2f2344cc45ce07fee75af03e29751 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04ac97c0c3fa38ce5dca3ca64869bf18f7f7123537224e19c668c4deca654098 +size 8975317 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..703c353afdc61bd13abe8d1926caf1553ab6c27f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4a0d2b6d6daa98c2d44463f3ec58c5711eece36f31e81c1ed8ae62dce434ce +size 4510806 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f896dfe80db8ad70ba89bbc52b709b6e1b55f01d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fe947bc8baede3b6b46336828e8318ef7b420bc580849912195956a583708f +size 5087249 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86df5f436db60b0bc58994a1f3135f7124015930 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d7485f4e70f081311ce33a2bfa223bddb0971c4a6fe69af6a305c641189c8e +size 18371487 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07acd057139ee82afdff61844237a0e13efd6259 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06bb1b17065d751f95869fd9bbcc009db8edda80fadf742458f34ee82cd5856b +size 21596541 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8193cf921be041c1c578cce6c13f1b9a44f3353 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:434e739552c628e94192ab69864d7cf1533541c43f803f5cffc3a3c4c9542bc5 +size 8269063 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ead8b31cc4431f7964d74b996cc307f32739d4c8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a36eaca2c7972464020cf091427c65ba257fcb2ad58d5da6015087b59daff68 +size 9358844 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40d60ea3ecbd630b86e9fc9657f1ec58c1408946 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b239edbf34ae0cc237aad2bae9c17b69fb3525e0a72d05e76850eddb8d2b2a16 +size 3133399 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfac35664fd3d0c1304aa154cccb158c39402b22 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b36e1c57519aae19b1476eecf50c4c6548e5e42a8ad18d9768dd91e13a6fae +size 3898074 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02b660d9698dd9ba7953a6191d9980f06a453340 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e9b5474053450af0c96f4fccee163ef856bd71446eb973dd42f3db89791d55d +size 14074752 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b32aea833abb908c07accb4955f7586a2a7de6f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7908f8925532f3edff913574101b970ed99baeb00a2df6462faf6bb8f16d0389 +size 5483373 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84725ebe595322316e710cde61bbf429a1c7c9b3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d62dfdbf57dafc3169c62b542652924ca4bdeb4ce0e4409b5eec50305ec01478 +size 6272932 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60a5bb49388e9469c89e5251cc50379e5c34c31e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b561b44b690d93f2618056cbbe8bb4e84a87ab23dbe89bff9eaaf890f8d66c8e +size 7069563 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9eef0b4e9307fc12122ef07744ae764cd9bd3559 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280d2831de4469d147be8a7b230f392d192fd182ac9949a11669648b01bc87f6 +size 2889272 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b60c60ce3c5878612160e281732b5c3cb0b7475d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c79e15bd5a476cc83647a7b60c6f64031cc81c376915f038fdec9bb031c2db +size 5125703 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82a764ebc270ea38ff14439fc11ed974039bc790 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef28c5560f4797f55afc159ac5afd3dd0f187d20139627223cf6a519d11433f +size 22411875 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..293416e10f2f11d2be79c23f650656cdf590604a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9685c92ae41be6685d5bdb008690d9347e5c61921885b7169271dd66069aa5e9 +size 29293971 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07a78e919ac2ff408d67490db1cfcf30cdef746a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6649ccf8af23e0a204a1b924425c5ece88b922fc5089466610835a721d0e505a +size 11820519 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0906bcd8a8d746d851cc9a208a85c838b8be6baf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b177a23359c2fe8d9cb8ecc8e0ee87c6a7110dfb20bfcca188836b0e241565 +size 14076785 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..092ace025dbdc3e758381ea34254cb16452eea2d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbad4c88c1e399acc37a90e5fe395b7ce7fd48eb7f3ead1196e92932e61936f4 +size 2702198 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f369c7e723616ba2609eb50136fa13cc940e51a3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:989c3fd57cdd71d996ce42ae89b5988a73d87fee09c05b9a8988c208a7752142 +size 4991605 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b15bfe9409ca87271189e8cd0191109030e9ce20 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81f28867a2e6933f03339e4eae8788b9145dcd1cec7600e81ff1e05442206d41 +size 21870801 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f19a8f545da2670460ffdaa74fc81c34f8557983 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81bb988616a14a14d996f8a929a3316b2df3bf1761c3fed4195361fe526ba0af +size 28686741 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..787182321b4bd6fd4da480f06e775636fbb4cdf4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ebe41cd77bae5ba18b9fd923041ade986204041f709475c55dd899524b2d3e +size 11579851 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19724bf595a5858854fcb8e8afd64fe553b120ba --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb3c5e22e5e44b54b5f7fb16ce37e2b2e18f02c9768c1c8f26c3aa8ad443345 +size 13794741 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53de47079138cc2a73a0bee82eea1206aaa413b6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d9cca021bdcbf156f91fd2c0797a1d25bce075aefb8eb18980e09e0ef75fbf +size 2821174 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf1aa6805a240a64c8ffb284cc035f2c6fdb4878 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:283599e6efc5d77825d91a22c55d59211b1a18cf888a8e6ecc958953bfbc47a9 +size 5098611 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4688c26c7526b74d89ac2ed8b8720e49baab71a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9b7a761e13dfd073c1af2cbb5c53264d09e8e9121c16be9b247574b7490a0d9a +size 29499832 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1eef4dbbfc0afd5d6d2e9e7950c25ef57203d2a2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfef6501f32604a34c71c3a7fab5b7014b98866c1ceb067117b8b6a801c9c727 +size 28927074 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34448622bfbfbfbd09c09851929c92e43df2b738 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:463f5806efd66cf66eca79742b638c3304c608c451a9ed7a7f9f290b1716cc44 +size 11671792 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4279863f87be43e164889890e44aa636c2ff86ed --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8071db815ede967d07e51938fad96947495cb2913f3cd246bf5d6ea617476c90 +size 13897478 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..959112d1d17f1ec18c2e4395daec701f512e9ff1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:505779e52fb64c5912da1f090c89cd236396a4c892950576cddc6ca8ab9c2e65 +size 2808080 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15c1a344483131558a42a1b2b22e4156b0b0c4e1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84956ee091c7cea01e14bee0aabc6e00812533a6f4ec69168d1629a2d75b4c7a +size 5091978 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1851799c6f8328bba06ac5d6f8702711afe9f270 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0a2dc85294bdd250db0ad31e633e0979574da290e89ef1fe21f4908a583ae34 +size 29460040 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87cc5fee4fa46941f06c8fb72967fe37851c8dc4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:db2f98e0fe141169a84b7e6724cbe1563f9e623f57e4a24f6cf1aaa7e49e318d +size 28889859 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9690a60f73c93d75b07698a4f93df94dcd5c114 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3de2a23d73247a8a8338b516a8db21f0fe0cc266b15d6e91d877b7e6646e1101 +size 11657253 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..298a2fd1bab74f0c50b0c79d4e2b670ec8394594 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecd0ddc6d6642508ad38bb0ddcb9266b0013004a38919c6f0a3098a868e773d1 +size 13882297 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5791f40f1d4f3c7c983195084eb27f0c7954e3ee --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72680eada12a926785ea90d13b5ea5aeabc087b8bc9d32f64e21ac5657291fdc +size 2857736 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb4cf12a34ea5e00637cd9d7cb8e878d76380bdd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43d44b2a137ee01c874fa811c51f56808047018e8797ed72cc0c56f98155d330 +size 5156290 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29b9a70c80ff872304676650ae63c9c3ae8501ae --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f80dee25b07598fe7eec70a96842aa7382510f9e69215fc35d8754fff5d016c +size 22339353 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4637a4ad1b06699e01eccb854c24cdb5fb783f79 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf5d69948b956cae216faeff5027f38469284e3c7455a3447fcc5fcd533e5f49 +size 29184831 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..bef5448f9f237101331ee0bf25722038bda46ceb --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f57bd9b515fde6ef360e4ac8751267bdeb6281ce837ee9bc62bc530f4076abc1 +size 11770387 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ced67dfff5ca1a9fd0ef38497a9112e7cfd1e2e2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5807331899fd7871e9a8104f04d44037f12061ebef1feafba72549931f4179ea +size 14018655 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a71fe1d8279ff673e1f68e3738d117321f2adfbd --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1318e728c0b25e25f02ed386deab6aa0629a2523fb08bab3c7a901dbc9a5eca2 +size 6902168 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22ce2e1c36374dd17049383ccd9ad1f1ff02f0b1 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83594ee4cfef1e6d33ba17ae1c8bc165f8bd9af1be17a8b0227a341b2e5c9794 +size 3658348 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c221ea000e3fb1582260bab2fb04abbea2e41ec8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88a1765e250334295f2d7c4dc4d219716e1764a07c5e43f13304d440cc373f1e +size 25446336 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0a0ccd3be3192ef667356fd82322f67df31efb9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71f1a6a4a6efc6fc97df0bd27f4df9b20a66d4ea4a65a640eb19b2ddf99a9424 +size 19961000 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5b38bfb9342346500fc5385cbb2f03b13146b83 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7fa413520d232d30c34a2fedd6cc7b451cd58a914adeba4a0fc4f28f65176c +size 5815858 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_5.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d0f69a5fc96af51a9d824466d9a0901659e5e1a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d89367f643b257c7e433b62f27377e8bcc5f7f8463df04f6174d3bb433a05ef +size 6567267 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..406711f558ab2cecd7507ed7fe5d76e27de88475 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26cec60fbb27b005aed8e09d5e036b19f7f84e9b3927d9a3d50f9d8acdf8f6b5 +size 2900909 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4811d46b3bc13b9ca76bf2beba9af34643746617 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1a64008024aa7a9fb6a2e31d1244ee6cbc76b7bee094bd94209ddfc5495c6b6 +size 7367822 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db1f03d7b58a836daeba050e793d9e9b162c7fa0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6b7621ff7ec2d6a002447f07d612ab1e77d76e3cd539441b0934b9e5cce049a +size 8916744 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5de54e079e5524bd2a28955ecbe548630a25a3ba --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89755520784f11c5e52997ce8719fe60917a79d1d2e8f9c48426f720ea3f1e25 +size 5254516 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0957bbd4ab31c50489a47a6890c3e1a386f58c8c --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9df4d9549e530dbdf6ddaeed0772e342cfad5df1750079185e6063b0550f578a +size 6040677 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d244ddc97639552735c7e217752441ee89e78be --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef9f4ea14304ef5aa4a9103d0d877ace14a474ac1960e249724ff2003295de7 +size 6073052 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2971c66d7dd8c5be6248ad8885a060d1ce3d7c69 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52d9948c237625dbe69c7e54d66070ef3b3963459a584ee0029158a8a8757d17 +size 3331004 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa23d870e6004c5efcda158e2eb753560ed438e5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff2aa99c049ebf70dee1025ddb649e1da42583b4a5b9e1d9aa67ed658127960 +size 21502674 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a90f0313692e726cf30fd3a1ff0184eceefbcb62 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:177a043e240edb35a0d9c3658046a26bbf81163fa1087ae636d8f0e20de53caf +size 15379876 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8255bf0a7b0d5288f09c11b963c451373fab9719 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:700687f75db9bf34404ca6ff5fd27f68a34fe500f8e945409c75be4a99a7db51 +size 4108060 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51f36cb8ed44c4e7d8251adafbcfd525a1d28647 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1388ecad37fb3e47b307626103414ecc6f7fc08c2df9e099a7df86069d371fc3 +size 4391606 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db38eeae18cb274507f443e92f90086d85bf6131 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fe6d1ec7b37bae5b3662ab78623e555447052a034a47a8948c3965986ff33ce +size 1864129 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3516ec995e580961f8cea653222e1f4e6862632 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0f356ba12eda7d82fe11d667384724322ebcee197bce784081ba811d957c8f +size 2557203 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bf6aef3564ca63835bc8fe6b6ef1f51cbb6f467 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586e4afcdac7d3600e2b41755afc81ecd6baebbeab0e0cc598c561c61b789f28 +size 6496610 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc82262addcbf7a4dc34c5aa7f876a47e0238601 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51a7954aa9cb496d2a345a49e51ea24c9c4273018315989fdea0427bea06a9d9 +size 7861732 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..928301da76b21ff7ecbcb96c3f4de4454fa47f35 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d71827d798003f7c1af018355d8cf33b9d5d4d9bf632f1e6ce51a251217641 +size 4635110 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca9636edc1a2667b7fadd9061c1756052ad56af0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfcf752b005a0267513c0b037e1a2804bd38c7263e15e4f1bee55d715f20c1f8 +size 5329371 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b048bdf12a9688b7669bf91b0034f0ec8c0ac706 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50acd65d8d663490814d0ab0f23a99b15dad7336f6d5ac0db8603c16a7aa3375 +size 2257061 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_1.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..85fd027180a4be3ed082be356e7fa894e7d12f6a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c60dd0a62e092e82e317dcf2f212f7a33b51af6c017e6ebd0d196cfb7db1f98 +size 3016919 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10681c0cc987577ac1bca04d1eb3603a182ce744 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae419ea07a495223fa5ca783093b260886166e5828c1421ea93788d81c0d6057 +size 7547358 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..137b94ff11b6c2663b5f23dda57b1bdc3a7944fb --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b92c9be569badf15b9bfb65f950238355dcfc2d1a6fbb459da633dbc914adea +size 9036766 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9c8e5c329a69e9b562f5d9cb35225398ad8873a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e68ea1698f5c947de349b49e7729445394ee6fa934a44d42d7b031a0e4acef +size 5293271 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d6f572a05a08558029e3adf56343fc26b1c7460f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd32089e42c40f57ba2f96e7d17742cfba8cb8b75643c5dffd3f6b5cb1ccfe7 +size 6054420 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de22fcf192d9017939a076c4d0937064c3d7a463 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba6511ab0f0672472304db1788f09a0809fe67af21eb65647c9bcba06972dc6 +size 639966 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..179d9329d425e9ffb4c17b9a4b5f80d352a0738f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8d6ca0bea5e2bb5f3bd9305d408d9aaf80c9bb2192195542fa12b5d3b25bbdaf +size 755083 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a36e19156dccd0263e303519a921a05348db213 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29792a67002018d13cc9ce6eb41fb658f4b38885bc25417c905433fdf25555fb +size 871295 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93dc446bf6863ff4d5837f4fd9c502cae8c48090 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49925567b7242dc3d338244101bae8f0c94b10e2b534cc3cb89c74b91017001 +size 985642 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1330e5e91d3e452b2478b8992f6ab29db9b68f38 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2929608b74f02738d3ab1d87401eaa8a9abd0c3e8c7745966819bf0277d3bbf4 +size 1098369 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0bc708dfa7139010189e939bb3b5f3f4a219f147 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e04497fe68d646c2fc757711bf5e744a4400403164f2c21d886b9470697b37d +size 1213552 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28575bc8e7551a9d6b8f3567cc5d4c9003417f14 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ffdd3ebd47f162608516ef841d1eb79e3258f1be8803c3728df34907a9bbc4c +size 1182464 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbc59feae97b1b5961287942cea46e0b0050068c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:659ba3a6e3b3829bd5db38afd48b4c6a6270f9251c8fb4521b445d0fe79e9c90 +size 1779263 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a64d18cca31f88437270bc697f4b1fb94529682 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116115655997f20885914aa38879dab823e46548b7d82a0c732dbc26eae7b685 +size 4776922 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..678c0806a1a1d41d1b134b637b868e4bf9dc5633 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728c0f9b6a47a409f84c6a1d169a0fbb61309582f2fd6742e0b6a7bb7de1f1c5 +size 5946786 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78993f173208b2cda6865f7e889e4f47faf27dda --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f75f7aee157c382925cb9dc1b9701a698d9830d44d40326319786a97c167f9 +size 3555731 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb176a29a607eb23c67a8894c22ac9543279c2ed --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb29d3e54629c6846f2fbdead3b696f5d6dede5bb3ca65a911153d48dcde94fd +size 4144574 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d5388b5d0671ca09533b497987f499df7f11592 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adcfef3c513a5f73643efe3184adab2b8e5a0f93ece964c3a9e600513c2d64be +size 1328688 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ccad3952447ac687331a3e7f96aaf341ce74a21 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff5bed76b99f62f081ba643c2f97232af4b08878d5a2551b9293e020ffb657d +size 1524424 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60a99f2157c7a9c8373b661399766cad16e6db18 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a64376e43dc94431295cb0f5ecabe97974f39ed7e9a59c832db4d6d54c32ba6b +size 3440996 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..629e4b61ff67ed95a8e24b42bce53b1b60e04ed6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a28368815649d4c4eb59166dc162bd6f131ce41ff5e468140654d9ca2724aae +size 3831476 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8695f483ecf59d19e29135315fc9df50588ba0d8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b48bb19303b620c95c6696d0ab5eaa3583759d8edc151adf3994d2466dc570ce +size 2107422 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b31fa0092ba8b155fe153fb89128beb1af7c7ac3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc6c55569122e64f11d7a69b14f2e3b142a0c10d54d9abc98f04b762f0bb4783 +size 2300813 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b535dc9d884cc2e78554ec8b5fc9ec69809dcdf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7118cb5b880e8f5bcae77ec04cde801388f55f8336ea8db5e0ff5ad0fce2c65 +size 1934749 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aabf6267977037440f30e9fc669a12face163f89 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4784e1593320aae252bc9d0dea27456ef499a588db6369e34357ea1ec8273557 +size 2636121 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d480ebccdf539743ca3d9979589e7416c7e3a955 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720e580324d97875f2daaf47e9a041945cc55984453f20efc3a6ddd1d89aae27 +size 6698194 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9aec3c5a55a9d17bff0a54d57ed88150c2d4987a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a728b2072fdf2fb3c7fcfe342aae16fcddc437f13d7793a9d06d50b76645c21d +size 8077524 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..038e2bdf17ad0f2ac84f2799e6a58994e3253456 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec68c156617e2fa953554a3d95d9c64004c2450fa8cc87688f14dc69baa1503f +size 4724236 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..486911016769bbe2cf809ed381cfc8467afa3e44 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc9cbf02b9b6c1f9fabf6e2f7058d2f0a6f0ef14dbfd90cd350aedbbfb445a3 +size 5415273 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63acb03de55c889f12389a7407c4da0b823de5db --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2492f3d10b235f126bec62fc806a84c2af4f9e240e202e457b3ee75cf5e1469 +size 1869831 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b686ce4258da162144a7a9eb4badd1231eef39d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7784b84c710096348872b74e4e541ddafc259c1c126c01a08f6e33e99e501b +size 2545199 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47a3e795c34deae8c0915fa270b60e6e1e66731e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecf8490cabc92e6167a487da0651e3a1b0422ee576d89004c3f679b3178563ef +size 6464338 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7d5bb28233b899bea852058dbbd9caac924ac3f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904c88a62412c2a606b5b8510b98ad1694d065a96e7d1eb75d22cfce4edcb5da +size 7791652 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f95401175bbcc6784253685b1a2a87f58c0c34f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:26b3efb86f0b566ee0023224782874a155e92ae3ffe0ecc7ea8946b6b70045b0 +size 4555158 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a87659a78cb2d5dcd5d94fb692a57f52ce673f3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0d8147355093f037b26d69e34088a230eb381807ceb865cdfcc393de9b77951 +size 5220221 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65e3a6817a09ffd0bfd444730c149a2764449871 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b618b8bd6595199c6b98fd02c91a5106e3080dbc15dc45331408ebb9f6f67f +size 2214191 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5eb4d2023498112e15290afe9410b867f7dea1f0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bc47325cf77c9c9975a16f0725a3560bc86bf031b41da4a36df6d9ffd874aae +size 2974471 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..085a877c180594fddec8685d37f45cf5b709df03 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e2fe67e380ade1e2e294ae397860577d81aa9608ba33022ae425fd87df2e7e +size 3726901 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b821c5e3cb0cec2c614ca7d53d09efec221c8e0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e8b0dc97f06f3697d9bd968936485e66600de8bf691c591e9f49b28549636d8 +size 4478822 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..534a21f5302f7f731b981a9eaa8d1eb9a6a8198d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc144fb79da0df2c4a82de7c475055e917866cc9e35eca42c3db79b65b2a2eb +size 5230349 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..12d7ddd21dcdba10301aa26cb9e08632e41f4fc4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ecbd3b752fb3626c8010286b3b868e4700c9abc7a8cf664b330c5ae84ed0edb +size 5980863 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ab79bee1b0c84d1bc9b40d16a8d0b1f12033d02 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5739532b4dd5075d128e202ea9cfe5ed6a3daed22ecd87bbf2f193dad0240b2f +size 2361677 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91a5b67893b4b8709ca34d93147fa7f493ef9b6e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12a21b0e1c5a356ff7024085106db19c92a55234e966979c322025726ee8b245 +size 3193447 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6326cb390ff473a55907077c58e9120df0187a37 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155f853d424526b1308a5cc9ccfbf886341e14e74798085194b292123ddc224a +size 8033252 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3043baadeaa7a4eb25ab66cc1ddaa769621b646 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8002cde30fc077fe3924745073dfba206d2494e7e4cde33c861fa61bb49dd1 +size 4839254 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a73ca03df498d97c26d2e1f79887842a169ebe68 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106db892c435498fa90ae74140f3ef7c378bd4e5c06c20b4dab16961bc602011 +size 5662014 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a4f680717eb2a8aa06064eeb5b577deb85ad89a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9c55440de94a65d7d32cc29c22e2d2cd7d8373e8f65d44446f7543e9e61d516d +size 6483438 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26c40161e47685f2d70ccf9ee8c66a57c3e795e2 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad7c2b4abfaf31748ea5228119bb8371e96de438fa2ee542625ea08270af868 +size 1879582 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cc117ec86af37f76b0b385085e4a21611a5fa32 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a1e48b67291311ea4e6df21658e068f9e6202eb8b7845cadd4f8a7c85ee133c +size 2435365 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95b6e14514e74a4a5317636081274bebdca59461 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d55d0c847963cce2b7055e43b63b8b95d5013f9bb6606842d6e29b6a9e1754 +size 2985864 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..55482e085ac47a0c297e776767cb33869af9ab9d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15cf65a95c0c3a165ad84cb4eff6397631c29f230870785c6d61db8337beae5b +size 3534461 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4a0b58dd83f07ca6d3449d0493b831f9e208c5a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e76646db88645742d08f4c20fd81061ea2ce1016264516735eff331d63cfbe6 +size 4082938 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31f58d7c01ee52dee3f4abe8c58a94d0e6c9a847 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3bbcada90635ad44c0a32581c577b8bed4d2f3b618dc5aed1d11af073ace840 +size 4629967 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..0c5146aeca1ba4be9be04f23cebcd6e7cc20cc1a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab73bcb376ef9d4e91f9451bec868281cbb2be39d307b851d48ba5402129d678 +size 2371418 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc419c48c2867293de6d8f5b0df7ed2bb910fc7e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076f80979dc46fc9de3420753cff3fc5592b818187034cab504c97ef7b15f284 +size 3210570 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d63de8f05b0e54f5602eb117fa5791ef370de19f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd1a9d32e2e41c65f2aa7d8d81bdb79025288a24d2a35e87183391cc4c59f8a7 +size 4041361 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ede52bb12519d5229fa08b3ccf2900d4036ce4c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6950180e9b5a94eaa77ed30169c57624ac43bf0be767b4a49d53f8ab3c94d83 +size 4871551 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..179b6474713d342a77b2957e7275e8395b7fede9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37d3eeef677b6f79f056e9608aa2aee077109fd11966d02fab90a6d394be95bc +size 5701804 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9703a409b72b7f156335ea0d78b79572ec8ff471 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:566fc78c197e2aa7d039984e8649249bed7c2a6ef5fe1011a507c089796f2549 +size 6530965 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38e528a1cd43c12f5d19d3e4f96bcd37148e1372 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:37838d5a57efd3f692e4f1040a1ec4ad5db1b06b9a5d74b4401a3ba97511a363 +size 2344154 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06c0ae820321c9a170e1536afddc44eccb0a93f6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d4a1db71436935de337b8dcf13e26cded3f23fa9f024fbe8d552c11b18af4e +size 3158204 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f68c37140f307cdcced88b518fee22e7c273168 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72a9c1005ec57409b29c433986347c327f21dbf331c17b7b3ea42af439b21101 +size 3962824 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f9eade8cc86e6fc5cd0830acea81e430925809d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5527ae8c59380d593da8d26d9dc0d2024b6892c2b7b76ad8e6def2bd09098c03 +size 4766387 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cd579b7ec9ce670d7e574565f65ad53393e0e0b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6effd7caf41c21e7a61ca5c6cbccd3f56973a1a298c38448c292ef1d654e832c +size 5570434 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e963f2098f93309a16ceafa91a1916bb6b65131e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd9d006bf1eaa7ea65a25a365f370159a01d437a879411698d19170169724cb +size 6373367 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90379f2a9910e91d41f19762c47b04561343b055 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b9836e3dfa435e2e12d211955bb9be04c7d4f20fed988b40205fe5dfc6ef62 +size 250587 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8cb8207c7c2f65f905b3be7274467d417a1d527 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed1377650795e238c768971fe86c90463afe946ac7cef580810dff46075523c0 +size 351379 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..110bac3cdf8c41d06d1d8f24f87f58fe6c751e0c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8567cbd95ab35b39eda895dd99bbaf9e8af5175cd13d93ae2540b80ecbe84ca +size 449710 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d77266f53dd452cc128d4ab13c309739d3bc52c --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a92ec3b87d9eba992cb2ba093bc2b8502d741b77ee3af0f106a309ed7fa710 +size 553092 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b79d4085aac4a67db06e305d223346f1a49e5b9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:977cc99d275f12f5058ba56a6f2c7f356073471ec80f6584a9907fa778b03895 +size 651705 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d72cd4cc28fea374d6a43601ea7a4175eff1f66 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17bb98c04f900c73fe077b07ba2a1989c69ae8326985eec49545326fee7845b8 +size 747619 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..210de6fc8134b858e5e0b48a11a6135286799f4d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4feba47e4bbdee1c3e07fbb522e27c19bdb185d04cbc9ce74115002adb63333e +size 293092 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9274476fb59b5cf441ee0905f2e0c3546991f5d0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ede7937a4b1065bbdfb0bf6680cd09d250e153b89decaf421b649045e05f22 +size 535255 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd7754a171bbc3bf022a0db9c6f9744971a8f6b9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d1ce930e52a12c15f8bbe4cb24824151016e662aae1bc827b6d41b4f8c42a21 +size 660229 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1147fa5b1670731cf44fb94298ac1bdc6333613 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9e68f7c9d8c053f14962e1bed0494b7a301e81addc3304f1542b73255397ea +size 780431 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3573495bef90d6b61d9bbb54685c2c8ced6b636 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bab4c7966da8aa02c79b24eccfe47217fcf289f00b5e9ce47a1a0e4f3ab25a7b +size 897925 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..258f4cf2c5a887aceecbac8fd87cef730cc208dc --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36804b62878d6f3f35f5d452015266d65e413df71128db19cd77edfca755bf6a +size 258377 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c9f548940c7a7a0af8f5ad9943193a2b0063136 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36922d0b250a4164811816d2dabbd6d6ef09b5cdb7349c564dbd17f334c6083d +size 363043 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7055d38cc0cf764f82fa73f3cd0ec444cab1fe4f --- 
/dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:960bdd3b353e03ae46522b4d91610535f0284af88f66d92300d0167af3fa2161 +size 465213 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50ed134b0353864977c8f2a86a77fececc94ce66 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52740f4d755205f194171ea933713f5bed449873df6b4fec11d3efd9cd3919e2 +size 572448 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd3a1145200d82b6ac0de1145711389e37bc2983 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43348f368a925d834bc51233fe52172124ec698f004524b23b34e6b50155733 +size 674924 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9c17701780fceb605d1dc0c1113951696045702 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:998f782f7f872d5f3d60389097efa1486bd0396efb74ef73a0d15422b918192c +size 774698 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c7b45d2a8388dc8ee9736fc24286f99af9c107d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464a7200e3e44ecf0a971a88d303d9d2eab599b7d25263d9fcce0ef20b4c59b1 +size 261257 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3da1703a1b608854241f2b1662e1ee930d88a1e0 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8658ce7f5afcb940bc3cfb428387922e61e4a5396951e70c3a0279bdb24f41d +size 471910 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_3.jsonl 
b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba4d3717d16293eff1c9a97e0c62c438aaf406a5 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb3e5735195ba1813e1b941f989a775471ee7a854d82c1bdd068da50a7623a9 +size 581078 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e23a07f30d52abc48153150e5be9e4db93b4c5ac --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e502046f5c2a2c7ea3c3261019de778ac6813cac0d9f06a0d18da54a96cf6902 +size 685496 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c93bbbfa556383a546d1fc6854468b08d260783 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:644c62418c4f54a707b036d53af3fb3c621bf4d6cc13549df27f0ffb45304251 +size 787203 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6687aeca726361f9e0a88e077eb83e90e21a4e9d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90d53d86ed85eaf1d6d47f121ba765c0cf2ae9fe23107763e401880c69b78746 +size 262255 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32c48926e142cf9aedbc5bd9aa6d016d6afccc1d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd1797d01200ce09e681e3da77a33fa557493e705e3bc8488db6da1e77decb62 +size 474665 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19872c6d12ae45cd08bd135f60fcaa95a6863708 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:591ec530655670e7577bced3749f419282e0b5d6931d5379068fe90a8118f1b1 +size 584675 diff 
--git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29111ce7a402855b4f58f88392cf42c1e427941d --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7398b92f7297d79483531c0e71112d241fb9ae8254b26934337cfe30f8165a12 +size 689934 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..368a67bcdfd305cceff26f14d4c59b79dde918dc --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a05e24863be993f4c622702ba76ceab1ec8dd1f5ae1be654dcebb9566b74b1c +size 792469 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcfe588154bea60af25b5ba647eecc224499feb9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99ffc3414c57dfc96961a2833cd7743ac85e72600c67dc3399187cb4bd47bcbb +size 1039216 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aeef5f4c35c4f9fd0490fb3372a353ce412ce748 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2cddb9018d85fe2078331b4d63be36077cb6b2950609ec19d7eb05c0c932735 +size 1300223 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00e5bc7d1b55767169ea343a8c27b40d0269b01b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b10f73f15a87e335923916a62fbdc9a6173b720314407b5c39d4134bda218463 +size 1561300 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8f931f4da26c0e01bec3907c3d7b2ffead41dbf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27a714dd091c80d987ce9f911b85847c722fa012ccafea7a8ef1d6a4b53ee70d +size 1822774 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d80c4207f81b033f935537a9e22869f7d26db4e --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ddc542fb7d588f2c68d8bb77f51b7f59f17c890a9aecb0efc935ec20cea9ea +size 2083087 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..161c66876adef8b71c740c2c009913c7f5adf6c8 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d86b003221aaadd287c2e22df58737d147dabedfeae4d249a8fcf51ba1d03e97 +size 2343803 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e648ab4a79fe8a9c9ffedfec19f0a890c96b3a9 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb2cedf1dd278ae8004f3da955963307a735ab74f71bf4b907fefc976079b131 +size 948112 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..781fa32575c0f2e2876e6b8268d3abb4bf1a02c6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:487f1dcf239386167a400d98004ce9a7646bde999bf22771de9280a6882f9beb +size 1180417 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4893c4f2af1b4d1e3a2d3ed2c705064abadd2cbf --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30ef7d720948784b47759c08faeeb3151069838a2a02f450b67265e65e8f73d +size 1413083 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a98794ff7c27fa851c1830b9865e0d9f30ffa209 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca72d6ba4ebb3b34dd7adcb7e272dcb2fc47a940515a9a1a70b543d4921a18a7 +size 1646074 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cfb8626500c6ba094be4480014f4120e49a880ec --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa707b3db0189c0a9f4db2662a1cdc54adf2b32f52ff33af6983ebac2f2c1347 +size 1878114 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d5cda954ae8a42cab5cfc4b435154f70e3c8710 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05d66000f1b30ba5be1562d9ee252917e9f203e1a64b6a277dce7746be17f12d +size 2110231 diff --git 
a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12d22cc4041c8cd7ca9967881de138139483d4f7 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de27e61d55c776054833d2b2ca815b0bf5b30da0a8198882afe47e9336cafaa +size 1010014 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd5656b634c9745626e75a4a9b130d86b218299b --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b6055429803251ecfed0d787bb9809b95ca13bd3bd64100e4268a2baf91868 +size 1243188 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..315dcac77bad05a837cb9ea1c8375bedb641061a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add79fcd4cf9ad52e6dcdea716269bdd65a7239f1db5a52461baa47f2d013908 +size 1476447 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9017840f10b1c7e2ccea2aee56723cdfef0ade40 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5013f48a8dfae52ad751ebd97de1f17dce519922bf087991e49e86024d5ea3e +size 1709925 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0f62a3e8e4b2677299fac3d5ed89ca940a59437 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf68eca89c123b9ac5cb10dbff80e23b18f4c8bd92a380f27e0003dc92dee8 +size 1942354 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38751f0b2a06aff61dd2feefb13d7637a673bd06 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ecee7d9fa474d5c523f8b7b51842d84eac4b690ef45c4ae545bdd6226722d9d +size 2175160 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..293083293e6f6fcb21b62184157f98ee2b31bbd4 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b034197e7d337d48d0089c54663eeff0ff82377cf64a2918c072ebea0004bd39 +size 969466 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40f244f84c21fea0e99da974e11bad86705eb3f6 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5f80e97dd0b5c49197bce40daa0e2cdb63594e93b9d00bb8c9cf6aa68c852a +size 1205106 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa77238ac416d9d00810b236751d034542098ded --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6eac192b71779e0e3055c9918a8a350fd1ffe9f73e33e4edc053534eb81dd9 +size 1440918 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abb739bb911547ca12ef0b48b45014d3cd593c33 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2031ad9a1b3c649a9665e301ee283c8cbdadbfccfa9dc6ff876ae1dc063fe541 +size 1677042 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da7af779a68c8c80e01dc5ba6e5c77a030aa4ef4 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a00cd73673613f5b813c2abaa41153e54cb399367538cec78eecb8bd754d6ac3 +size 1911966 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93f7120c0adf7beb397303666ad0ef80ac5bcc21 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6aa204d80217f26459ecb19b69de89cb9fa95ea45dc6b7540f8db9546bec5d +size 2147413 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_0.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b748ba053e66cfff4d53302f41cab024d25a5200 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6292e089482b88593f2217bd1321a86a8c5d9877030dfceae6be6619ce14efe8 +size 1016419 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_1.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4bb1403c693ea7d9d427d893cb94bdadf4df838 --- /dev/null +++ 
b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b060001362c9e978d30e1617e3529f799a2ff187660c07418c87c3ee523e7cfd +size 1257107 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_2.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b07f6c63f23a63bb8f6b5bce1e7a967302478b2f --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ccb2ede868c8d847ab948ebd91d99d5444d9ed3ea4836d683acb3f34ee4fa1 +size 1498035 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_3.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb8ccb21f3d30c880cebc28955d7fd6bfc39f928 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a301ad502b936604149ba2d611eefd12b42bd5dd2b63ef85b33a73df961e1b59 +size 1739153 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_4.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..604bc37e982fec81c7465c6cfd0b331d5d76b73a --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3325687ed700aefd23457ecfc3b32372b3fe33835992f980e6fd5039ea4e1e28 +size 1979196 diff --git a/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_5.jsonl b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e93258b2c637d9e974d01e8634badb672ff21eb3 --- /dev/null +++ b/4b284b12bc4/eval/examples.4b284b12bc4_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c941c46fa923fbc6fd146c7d2cffb5857e27b131454e7d64ecee380a3a34c8b1 +size 2219562 diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6267ad1e49b3823b2f7291762855b52190024d9c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4070835356827751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03514958095848397 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0758536616906455, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015747064380670645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3264375465319237, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004888854445231445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11509298027342854, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002040147114373331 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03493638633069714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009342574915112234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15766160622381195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033114573324024405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0532813862747049, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012579627803205211 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07257604824526195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014483785678009685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31637706878833355, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004769735504597033 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1105412242108245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019072286738988954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0714774644843108, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014699104009543759 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.307939556685913, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004520814685280998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10843545057843905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019083088150967664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3538b79158a1c8eb4e80039636223d384f2e1c27 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.41914858834195134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.030279335876129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07536633674836868, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001620641410096321 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3290768382699901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00481767508183653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11424698089656772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001973221738343803 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03540467062379218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001074817084017668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16089821041540717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033011630774406127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05368591058094131, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012551880063213156 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07231503158237214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015163361416883465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3189205930522712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004694857387684187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10991123942051419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018557651460448018 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07148579673935408, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015357817111525064 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3110112645350247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00441643475943137 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1082043807305256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018480349337665876 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6cdf55cc91041391482ee0ed01f3925f2acc9262 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4241874936612034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03699728854949305 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07469786641233617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015771153206732972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32891693541469197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004751520151482175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11375522621692136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019642936162507533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03462695918297652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009079391487918842 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16210166248343671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003411098262587952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05344291957030947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001233885317072834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07161852924412807, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014761860684115284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31797917629392425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004598849198704314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10934081987838024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018415550723111455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0711214967812572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00149377360051449 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3130870814045286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004421211065212564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10823991385374933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018380668821100924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f803c2904d39a540f9b847de6409dfc746709451 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3916994292697065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02655023153261868 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07713695315738618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018521617901133295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32641437991318506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004583689746653368 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11443103117633296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019845366723218495 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.036319480745632425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012079439649413412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15985213856119682, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003223582265695079 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05368996382308088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012403567348119643 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07333561421491072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017170202297610163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3129899723170166, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004403504671395443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10920281437234497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018472429946517301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07332584684616669, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017553031622823821 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31070372160179327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004314602758278112 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10882089385505, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018674885337082484 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b643df2e31f5455e6f16882cd1453fe9b04b2241 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.37875018794247045, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.024296780304434905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07231813177556075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015118246585830762 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31870699434574523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046463072458484975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1102139471634063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019620155943445507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.033695317164630666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009163247572691914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15554105747469235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003236397171744527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0515680827205002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001213141047008391 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06901574750871842, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013947803448898716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30638147232578594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004472401624140904 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1054595308766645, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018290764447497754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06885871876103705, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "rougeLsum_precision_stderr": 0.001420315845279487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30338100654345734, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0043561963135752306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10492767242713451, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001836832016012516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c007e9ee44de7d4b4c85788fd2e7bd1f4ce5b0a6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3689406693649318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01833284872989782 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0725733890131515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016541722828599028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31647681542290346, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004649887369574888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10942321553706275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001960578009336271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0340511038621137, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001109810266658632 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1539259113242296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003295678214536681 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05107734688924233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00122669666906548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06920155517233274, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015510532870385023 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30372126675172995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.004472388579309833 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10453686352175766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018279045748061345 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0688242602910231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015744817424633028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3000075619025587, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004328528641012373 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1036182486745027, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018321897232056268 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..182b692540369af571a243974b1f99a3f4d12369 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.2320345566104918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002028663485069812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.15806405453574032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018950997131969357 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.17529221125446331, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014916739627182348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.01677596652301549, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000700778712609751 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.012168726274182532, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006308549540794899 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.012985384177633208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005793970465797221 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.1871404213284981, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001604901624172935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.12845338233190606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001541707987416157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.14159539007628982, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011656632831698861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.2002817812856504, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018638576492756192 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.13376513099281687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015142202748567432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.14922590343653938, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011984205257227288 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.041197050082785285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.001108996485999182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..55d4e39d4ef4f2ffaf87bd796c6a79584ebe695a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.1368123990805212, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003859738443155984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.21915722251491454, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005434745495444146 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.1478464813817612, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038320333818157508 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rouge2_precision": 0.03979808957820905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017989632072343414 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.07332816379948168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0030403554888855404 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.04640292562526275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0020116942927143034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.10824512629470052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003088739020891906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.18075899176594987, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044836417615243115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.1170703502765758, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0029311547998135696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.12042587524943701, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003511890506396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.19133333683300907, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004812564968367625 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.1291123762138599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0034020744059904736 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 1.8996196002528378, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.19349493429212591 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f434f341e98ed3527b160dd120333492a02ccfb7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.17608939105537802, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004253817230115363 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3431140264090531, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00534101919038467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.19633951658374446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003958953046041264 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.06908776827206395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002528705334523847 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.13897905823799864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037630298921426447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.07757017845101091, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002400954415731867 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.14034375265741814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033583359725794662 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.29506101479424657, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004712859932989569 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.1586443874692295, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003057777683650459 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.15250105487675392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037920002252605505 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.294858148838543, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004765884466280505 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.1689535186370901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0034887482192121964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 1.640138834989728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06878839244992584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4af138cd6844aabbe36ea40186f283408b559588 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.1480732637246692, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004206315352922732 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3705823844540417, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005255405787076392 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.16851267425125402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00370915891341574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.05851115547615061, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023290778302875307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.15887512559363004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038897602570469245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.06808437331115559, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0021900035041626056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.12082840824613766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033519196264855337 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.33255889008853323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004852038022387119 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.14033970117873198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0028896013836388623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.12767131894850914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037644778529259577 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3162971688824102, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004747043761926457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.14398542256516328, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033027404123591544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 1.4187050896119742, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05234246731012316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f1a9a6b657627c9f1d67e5b2efb483e518565a2a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.11219617600997056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003512511634359904 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.38024888704829407, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005180394391296343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.13586161882742837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003197161076846914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.042085938124191064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018230760751544277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.1645327089113456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003997411490344945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.052457179235399276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017515669453131311 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.09344570812882523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027200533279541257 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.35242896432602544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0049265850395284186 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.11671197771344118, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 
0.0024641462705413553 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.09539660575352989, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0030856580577620744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.32391745612838724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004709017013998289 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.11463457129527055, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002803475929481998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 1.245079321161806, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02778718321398768 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..667bb019ad35a566d57edec99c201b8e2d86d92c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.09393710373652846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003199269737822915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3745567023525177, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005019704359364239 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.11729721313967732, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027320054268922134 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.03517936482889984, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016983516245859785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.1605210440862932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038664434520114016 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.045075512409701604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0015508059027240881 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.08008998231575896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00256681324622759 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.35287444686063746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004875421748566501 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.10332377556822082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0021608967117124093 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.08023373310406072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00288649450260082 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3198596952956621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004543895911760099 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.09887761108872525, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002397235033540902 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 1.1514814811735028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.041945649669857865 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eea0ae3e3cce1e210ceb30497152f43cee0b1370 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.020794810389244665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.008286245115526584 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.022922154005776907, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00035921812588188365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.21352555655696376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0022208454109318375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 
0.04006653282921872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005481740262477606 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.0010806676437478898, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 5.572788223887115e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.013144891498511075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006988181195494956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.0019179536475281184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 8.819595205548223e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.02287171287886853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00035207258394197473 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.21329473631474377, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022102531793857646 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.03999564658771847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005417492107414437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.014803873787478817, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00023955003605054956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.14458682760689323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015738069244451358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.0259194629518608, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0003466200097817442 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b57919013cb6c952dc8b71b3f2da4c8404fea9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.4861728821422337, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03584212201959426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.05062962614453291, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013437525777807101 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.3417100308532278, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00435704773787356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.08279157012164147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018413217717905348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.015902434132343327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007863360597732104 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.10098499560418767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033802047604672408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.025426464984268635, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011153275941210136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.046198311688407115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010451508171373679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.3277232393217278, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0040337705566764045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.07649332751100299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014727630258473052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.04203232751325336, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012323483368875934 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.2835715586404924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004168417537419584 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.06849359236303296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016968890488419251 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a02a0ce9babf26c3b322a63be7a3d691a308736e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.7628040722794971, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02211715710461336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.05503857963517623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012530235158013246 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4237722308516944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004678442445263298 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.09191514123583507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015945801148090116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.019614084687761036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006958667373462588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.1680913399724686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003936935332174803 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.032591510976486694, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009225376298957598 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.051275890765769695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001008153954165397 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.40628782480488934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044353577159538015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.08631782016595303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001327845800016554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.04580878529585366, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001122992125462262 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.3581492753310056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_recall_stderr": 0.004381030885989483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.07633989045692885, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014222030587289858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5d4113a9a8071d38fab476664dca563a48f57965 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.8888663782890923, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.031568607067727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.05690926034885604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011475096014499507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.45426869216384025, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004516571311698728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.09625702326801779, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015524610950718182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.021645434712929023, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006521179846718135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.19508795473756288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0040357210440640995 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.036709719893509046, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009246274500291746 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.05330602414074714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000960960782770985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4341135138594583, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004317367652444747 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.09064023633752837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013324331225587393 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.04733129735724454, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0010260711095464196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.3853940847006117, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004214948856669242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.08003764881477311, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013775813454877939 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..543287200215e633b8edfa11f06507f0a54afabd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.945854666461263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0402112381571833 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.05604585253845832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011427724679781285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4538828578884873, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0044575960643494774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.09460944755353914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014786963748578273 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.021574156553869385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000666980759824044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.19771338204023536, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004119461394324271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + 
"rouge2_fmeasure": 0.036402813498665906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009043444520209133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.05270992641742948, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000977850644744796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4347653663874296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004302103689446801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.08940502094865169, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012882532753422126 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.046717493551301156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001010133723082569 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.38717142823870215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004286251024406202 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.07885538264707831, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012911242439370375 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0354f186f97f4e1f21040d7907fb36e73b4a4471 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.9354058693381648, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02810525494577349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.054228941021206276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001096252527267709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.44052473635855544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004434086426797913 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.09189669042348231, + "dataset_path": "GEM/web_nlg", 
+ "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014444352152093109 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.020624829786179254, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006718344531212511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.18657022488728872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003970246553779722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.03482971179628577, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009022373095447741 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.05120088679459415, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009633103342396731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4212363318994364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004211846253373217 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.0870670682221663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012838750789689254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.04543807296984024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0010157838089103063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.37673581998691646, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0042662278147285095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.07691182427135196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013060342828319171 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9ebf00a4809e7fbfc4bb3e068ee942aca24b1244 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.023111606817375823, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rouge1_precision_stderr": 0.00034433259453187245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.199898162382514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0022216437870279087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.04023278195426544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005477579043898359 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.002348515359030586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 7.292209026413432e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.02366089115265813, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008284819548104561 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.004150191718708099, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00012739617704077432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.0229905917432032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00034282566300562767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.1990997429912886, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022213443978533944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.04002572630665756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005453486432541972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.01905255519601842, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0002660747474795132 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.16903333590221456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0018194040988673658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.033238272140701595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00042068255632681374 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.011674876863475004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0021009000372254386 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..954a120daa6ec99593bb2691bcad28f4376103e9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.08967550123251418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002937976752458282 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.4773024647388192, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005382160167374159 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.12976799696821192, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027894949987997152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.03875399838383589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017591539560822005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.21057258380768887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004063079045737644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.054485621293343514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017012540588488038 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.0790538192004279, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023876721978287347 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.45016078797153286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005136353137741063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.11667620039186845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002240166240092561 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.0758745124201561, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002591817404665531 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.40795420296814416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004919957414755 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.1093056100326863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002463062122648188 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "non-explicit-description", + "bleu": 0.9921366651933168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06271540865538455 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..269acefdcbe82044f8ed1096e0bc43e21f0f78f5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.09279613971509679, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002133744119404951 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5489640435072753, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004743143133672053 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.14521489817363703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002414506324594877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.03909773513534565, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011574465963972958 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2581685386804721, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004193436933392039 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.061645677874947354, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013858346723081972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.08058580758928455, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001676284644699535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5047433145790685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004508210716996378 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1276872860164771, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018563839053403013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.07756422907254948, + "dataset_path": "GEM/web_nlg", 
+ "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018428480505735482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.46809438833435923, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004457544768356877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.12134285926676484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020676862675326735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 1.2825526231821482, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.043492144074173136 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9153d0049e591f3f5dd2b5c9d8f33856c35e531e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.09618625233754133, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002582113721152913 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5457125057626717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0045898974012682685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.14643243275737228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026100965175248907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.04417002172231704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017784684644853198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.26721249890626364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004205423420143083 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.06571024213935271, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016742580356473582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.08292794661598844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020960952284175775 + }, + 
{ + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.49763673601262126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0043726175282994455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.12785451973740822, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002033342221993515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.08127713958483572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002294208086828085 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4695938897289019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004353276805355962 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.1235235922343905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022715172117069057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 1.408739047525639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037077565091005064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4944d2f7f90dd9ddc4600cd896d4d2690882c31e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.09479449002481802, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002655577330384621 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5432614689390652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0045982556155288136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.14141133839063644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024329396682929907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.04363505527515881, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001672653936113288 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 
0.27112334525352516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00422467374714849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.06374220282296517, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014814715876127712 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.08141857432754715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021601024343818197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4913866663151205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004409556454898837 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1229318925720696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019091259065232668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.08007354294909853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023461776768967193 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.46756254108881956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004390247412461436 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.11894159333044178, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020882148965706148 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 1.3615130495425574, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03941780905887599 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3097990bb45ec7e5678083578fc70316348af9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.09830029665183404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0032438344323854184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5307155773373864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rouge1_recall_stderr": 0.004576617366471351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.13901031487174892, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027114384521542325 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.04702658162841678, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021412388855118754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2655552533910532, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004201121728916801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.0639493149144395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017580675198847523 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.08605550281508571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002809323179439482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4835198358351258, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004434793924159799 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.12271761011397488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0022754941590082543 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.08393751091304703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029130881830766323 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4589032783950323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004394339374968956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.11803135941981421, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024007564296031226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 1.343037351813773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.036140936708617705 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..e0d6ac903bc835924b8d870210e91d80983589b6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.019449928935063972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0002996123768501138 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.1629409626311148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014178344731677658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.033848667122813925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004785862714798914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 3.7290701550733404e-05, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 9.057889706362649e-06 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.000247591268503305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 6.608651546632237e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 6.345797512857661e-05, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 1.5382459284932663e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.019449928935063972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0002996123768501138 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.1629409626311148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014178344731677658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.033848667122813925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004785862714798914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.013350740331407294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0001939630721901288 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.11827840682573859, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010744315351548378 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.02335480003154784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00031241488270193515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.0024661624004243305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00022679513212848655 + } + ], + "config": 
{ + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0f6414cbaf6519d159deca28f5670c00854acf1b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08707890601958627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024828597057122404 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.4696545657848939, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0050361060172909415 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.1299893322446928, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025804647383953576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.03503697621648224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014028562973121522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.20358535015420007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004005463978193232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.051815361827752766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001524719270466829 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.0774736609326366, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019875056608878654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4430174064260006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004789275467133504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.11759668044891276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020751461605538037 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.07410190870582686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002187569905436837 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + 
"rougeLsum_recall": 0.40495877524310353, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004615318859024589 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.11034796536631672, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022796848420741974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.9141145179175494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06536296915571908 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..61db20e8759e8fa04e7c546b89b43934885102d8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08092828074865113, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001906490261574064 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.508339922983901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004811650108707499 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.1274257819886352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002067479681389862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.03434375738172082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011440228205307629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.23872647813314588, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004162934850661181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0538074768484528, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012562150972743444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.07273561966493279, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001561121717880125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.47591647381990865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.0046462730133157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.11571836456900784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016908336646337485 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.06920233060909893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017246238008069412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.44001472970441646, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004502443832581523 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.10863800930732682, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018630619874616906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.2160721235507446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04337158036151172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..875cdfa395c3ab1514d116ace197be006c466f86 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08332513771516135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019868347121842745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5227234573351196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004548834912859631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.1318424172773736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002194221638712534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.0368091501321238, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012212135534082967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.25680330637418675, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004245573310206607 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0579898457998029, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013623575809253766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.07491478363930333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016259914368083285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4884403183279412, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044306286785567766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.1196985480814696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001792364415355123 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.07137645686823287, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017762733333957057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4552935323318038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004331432435421035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.11277885328608343, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019446200008606618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.2968562517794118, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03981886990665034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..547678a2b7d1b3f2b7864cca0f40b2b80c5e5ad1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08350295350572431, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001964565788491711 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5269669069519911, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004464761280028578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 
0.13136185565213176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002061912807178742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.036905855206776556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001145818807419939 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.26098146346727347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004211890254662535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.05793797811823835, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012534985992537161 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.07557252948320996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016449289985662312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.49370616371308285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004327952531421013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.12003948650766051, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017167875473952968 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.07149122309163901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017785526726022418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4585698429370505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004258474803913088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.11220846020003944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018460781766260106 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.3609245244267043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.033697352727562094 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..173f5cf355abcff1ad3bfb59c4233920affbc7a0 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08171511695579445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018633798688914489 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5205014192182988, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00444667928382001 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.12912740120963084, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00200364840452344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.03547040912515195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010877669331809023 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2544026510929024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004229382794688773 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.05600461944766409, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012334741587230486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.07419114114811591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015341927152941845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4903982438780839, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004319214854158963 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.11838679216919511, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016466394081195558 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.07010518510850142, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001733506016922793 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4514049055251742, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00422208781632271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.11026059508958445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018150927429308838 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.3127171878921704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04798969717597349 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fa3dbbc6750971b0aa54df4ee8ae5acb6656cac7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.08086960514171361, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014525200909349487 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.14073285537633912, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021365692846394106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.09516029675770697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014796463176875243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.008229815685974943, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003995476871805908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.014689368147720344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008211286470989261 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.009594517812957653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00045393089210895286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.07093712041741049, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011411235398140263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.12604423823582933, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017629787569386458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.08409008430092453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011584868305005939 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.07781962570247349, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013794354759233953 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 
0.13531803934376344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0020069265089291905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.09152903732984705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013950176929762666 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.4446304965171368, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02101564136831389 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5c6654743798709dfeedb95087720edfdc6f821 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.12036832068446836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001698822978107527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.21036263585148327, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002320909333489002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.14245264302938512, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016832867172118543 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.015115117361918744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000577145021228364 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.02662558452098901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010503502526558268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.017846850141455827, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006531290340899643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.09229239279906276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011398229641967105 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.16800779726654005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.0017662341943095999 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1106783481573006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00113630622144049 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.11299875946868077, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015782836399768998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.19806432139905789, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002160572168760569 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.13385718956690607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015602587391599571 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.8653788220477945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0447073485750703 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bfbd9836610db82f04c01699326a2dda254123aa --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.12987828446896582, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017668539132634244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.22449329045901115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002446761016968778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.15317578509745602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017377869775542976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.019205230811756614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006259651415811169 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.034096443155707416, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001170242516537603 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.022535640055881916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00069644020248688 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.09796065026425818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001176944853197895 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.1766738402223476, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019113957567420726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.11716221211305587, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011775173863940828 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.12148938565288571, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016327095456720253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.21077828674873728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0022836578166374076 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1434435746637532, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016022296713075052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.0323682964004037, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05728647371845087 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a9187351d52c89673a527eaef34c24d37ee6abb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.11777387943190545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020113312278699025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.19469078652969996, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027977487029445204 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + 
"rouge1_fmeasure": 0.1336180051181457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001957815366131097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.018842490102852782, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000680153964280099 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.03249985364518616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011921354860818573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.021648290209856712, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007245383572720933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.08919191590884211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014541958760042968 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.153037543777827, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002210331899287786 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.10208979953114408, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001397295719317153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.10958122883328779, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018686579393142751 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.1819882090629157, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026135117445184627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.12445447248036409, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018122356410507367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.1833686716294687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07615919722768849 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8d0fe08b6e827bea0ff76b1edb33b3cc79804486 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.03995701958166195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016095372311748183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.06674356961372938, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002448300622281182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.04366405939122397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015831202808765945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.006552251076514466, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00045575183717078306 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.012072979341302284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008431856424377719 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.007536808369783641, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004892115817681631 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.03139542886892282, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012706755231911566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.054115916065237676, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002006663515515118 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.03438650141175432, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012227168700738242 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.03739170639098872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015159196007358158 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.062356306725636794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00228581975031369 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.040753188180590226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014756445728831532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.28624026182603163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.029349438130954253 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aea40a8045a45a5e3a5894e58537ce35a316c4f4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.007186612272605283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009791397918263772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.009626580155215095, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009843679259005512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.006297564938231645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0006539758063676466 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.0009639306536740955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00016209042335021826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0018204921865520307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003123750046265983 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0011309269927620334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0001883904593636651 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.005946725242200601, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008795298956796886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.007926123528146993, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008039901837259593 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.005007392411738194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004993418818895954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.006742477351676616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009493223215183244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 
0.008831833968476688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009029356485515189 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.005783114895049256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000599975916196901 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.5436993670367395e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.3789544446448377e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30fbcb37d7e3cc7aaad30487123aefc93cd162b9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.06660092740337692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010310436131496232 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.11598354826382899, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014500523897201271 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.07882898461240369, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001040524409407945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0028614323156141855, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000180186892026584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.004436869555896244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002993853890744701 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.003243321779952968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00020438291184088177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06084499114194029, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008567327435936374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.10872582938119582, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001328386237011712 + }, + { 
+ "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07272695606243273, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008850843973306463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.06025198497698202, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009166556048550033 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.10601353160224014, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00132199783950748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.07152252322090602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009232099420132897 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.09735038671178359, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01912399145191819 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c95ed34abe1da4262a3ef42e0722e71194882adc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.08855732672619102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015064472119031337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.14544347329432597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002411071227170801 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.10178108004797332, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015749327774327397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.008560544006655627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004413067196946586 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.01616782557058256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000930039738517821 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.010181112623842817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", 
+ "subset": null, + "rouge2_fmeasure_stderr": 0.0005119910300898817 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06881259230018068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010402326401547975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1164247420147934, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018577911811822583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07977466300345593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001103140871324476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.08290521041610925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013971705074040997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.13668730020343445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0022409703680440853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.09536226516262464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014532751500060744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.6029781033915351, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.028174541781078276 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..708bf1f247aa39d50573208eef08af7e610041e2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.10757274244241427, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001866638618407923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1768593713292845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028614155741575843 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.12386118481423918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019396226221506604 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 
0.01811267415569796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006924773828138044 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.03241684505960385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001268207503708144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.02117387153026309, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007530823627085244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.08462993923914566, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00135467976486583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.14336450100833226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002264556242154818 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.09831165574662265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014131960658590383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.09907287995201415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017198819213882605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.16367866039452192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026624060636886954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.11426827856742555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017893139149704248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.168914893531667, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05316184069367659 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1cce9c94d8674e20caec9d64a069eee9da3f25cd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.10720171803251193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002082996644638251 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.16612796673148053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002954548192082436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.11809803100929343, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020464135073259295 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.02009963124741285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008011300876613331 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.03248360016391421, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012256498842740874 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.022197025590925616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007547906063236964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.08482141911566571, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015735036854485902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1351988789967234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002389596247655131 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.09416415981113054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015409418875020468 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.09848261188127233, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019260433293872472 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.15267739552040577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027171840590306546 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.10837367169734866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001874380599850097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.550517196878354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06993149956571287 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.json 
b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ddbe77ba047fbf68ba9bf95037e5bc6d981ca49 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.03740904754421892, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001610680414155072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.057441900471446455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023292227400479332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.03946196457385396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001572740226370851 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.007405435782553709, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004938171762244464 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.012822151487257908, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000886391837086689 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.008171833643724272, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005061088990249447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.0302784538841258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001265349446879213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.047977766809024824, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001937859235680591 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.03213695990330082, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012354431038129371 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.034287485555940835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014785361480277362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.05268920626830129, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021423352455123288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.036104092106741856, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014394317299840206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.2906818896887448, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.032241949265799034 + } + ], + "config": { 
+ "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d6989a3d9ead7baf34421524f26ab2a699061e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.006416762662669792, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008067399599762491 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.008596118967962398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009759171996390612 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.005997866632095373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0006446489437867567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0012325600300179128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00023262166513716688 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0019877130664642207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003765849024360566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0012168332924537228, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00019190718073544674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.005180574608491138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006638732889323238 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.00723418284441452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000838041683119617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.004877643128103321, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005141597079202806 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.00601329806116974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007735773924533153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.007945295399230258, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009128968590461367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.005540816115674005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005975088748642658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.3695690005957795e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 5.162992091823081e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac30508553645ed0a4dacfc36108bfda16a19719 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.08473975013844745, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012652837122476013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.1329672875133205, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014302651043992514 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.09629596799033903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011603461268498176 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.003906179417613626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00019031811304692613 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0055735013712897765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00029292378757282304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0042667329498244436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00020313195756774255 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.07735309410625876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001069934210773379 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.1244809192327779, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001333807275878821 + }, 
+ { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08883637966635893, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010087038286843665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.08109267338305781, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012068363440650514 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.12723188277215594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001357597198069678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.09211538989861727, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010998765766529236 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.02719689123536131, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0044363498052874435 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b0ca48207f3e0ebf3bae804dac11d3ff3a0ae70 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07936181927044636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011762568246866275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.1308334139584999, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014823829252056944 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.0918764247127699, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011260140101502995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.00393290828598269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002040513153487916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.005507726691318861, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00030125280804493035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"summarize_above_en", + "rouge2_fmeasure": 0.00423567615381497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00021018422233903274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.07417084074707798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001018585184337875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.12499113633870057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0013966793071261054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08666723799936435, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000999850410378775 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.07610364515575635, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011064264710643904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.12615810768661415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001412447688376772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.08825782099755484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010565687929762516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.10510472666663716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02342215944592973 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..81a59144dd43ad36c943b141ec4b3f7d5dca5db2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07947361253543833, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001193005591889888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.1327421000432318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0015486342539799732 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.09256018146781847, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011555403499640406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.004199424248706065, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002403958914677049 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.006627430748129385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00047327454697217267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.004697153886380661, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00027487360812121926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.07476922060726728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001036726053715039 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.127478433032578, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014601336227070283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08788495344807927, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001034393917882335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.07575953247297404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011219051960697426 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.12703786617602278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001464179232269078 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.08832933606271764, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010830718460803723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.19993488449475255, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04329139775558018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..14c2648bec5ff492ed038c42b8ea8c719f4f7168 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_3.json 
@@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07037274511161584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014120231404269865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.11052779587284725, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0017290812995824128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.07737568279361932, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001265246486643532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.00392300118223772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002557581635323497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.005589835600869042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000423176072989138 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0040651974203171634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00025930635468911277 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.06512620770805046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00121951222912035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.10533676686618007, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016204045193858512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.07258759044318532, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001122423619227686 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.06672709213349835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013263129049748312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.10524011607734472, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0016343300862625232 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.07343581428013704, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011835280613330765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.24585325834528793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03702715577574997 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f48073c55a2e72b1b52a8a215c354c226a764903 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.02381329967362824, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011028927436461343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.0375355725294583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014975837973007148 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.025475522197387277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010381041342986902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0019534338228814068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00025424799653585257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.002574143785393473, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00033042674187292473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0018254913452193152, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002100606076864114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.021656975285866457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009662420131766899 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.035213960582413724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0013920460584087664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.023474073198495527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009278481009221991 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.022475087207169998, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0010391682891249803 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 
0.03568293304710748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001419767775872114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.0240648893225021, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009742444146235487 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.0725273475872256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01184587819436072 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7362741da876ab6b5cfc13882026c802ed3d3daf --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.003599842790127908, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0004357453635530341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.005450956260566027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0006157647454877155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.003773178583642327, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004233214040303885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.00024100397622410201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 6.428724333491122e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0003662081649364038, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00010026927803212084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.00026468573039586365, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 6.948326335135335e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.003261903403975752, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0003814441870483113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.005138441375666529, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rougeL_recall_stderr": 0.0005722130168539618 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.0034927853884506757, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0003810695069644319 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.003394903398841389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00041292623146590925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.005123453693921271, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.000573270656957646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.00354234805200471, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00039437507462346326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 3.4432997292695364e-10, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 5.054319054078497e-10 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2096046730b45b46234ea0ade8397c39c4fe1533 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0505257980847339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009316535384306269 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07944574030730557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013436416711421135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05739499438745971, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000959404112224303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0025522430280765624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00019188137819214202 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003889530091650386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003502492712853139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rouge2_fmeasure": 0.002874313185982406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00022791640812011725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04641940622808299, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007892141876172595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.07403750267734954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011941764542527037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.053008425140295905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008201296289562219 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.04816018092226108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008696938078640268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07599978162074517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012632037879506343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05476968826480647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008953065390907387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.14859459498800928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019538924284114197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..25fc3227de82f89d44e1e60d40ab4d4b2b3ed8ad --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13584035826185337, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001883003830967405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2300655738827389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002771815067524841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.15836293018887368, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001891640433790272 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge2_precision": 0.02400583408187123, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007285343955308211 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04289680599794199, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001353605133009988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.028190707681194575, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008118715649523407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10298984208878467, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012746961770117027 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18009351587124983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002104050870387258 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12130392597137107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012906736654645788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12668865175777289, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017429813042225584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21558088869876474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026000978058887433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.14788907195330203, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017520956258023405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.346869321685321, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06607443264134674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f7c8e08291dba4cc08d19b8b3ca567deb99b0fe7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1708173947979992, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002081500885534108 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2857546859749413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002766073063501205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1979943409707515, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019701979235594003 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03867526801746755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009249837176721441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06488659439579164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015151297174771778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04456119604899187, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009779372383836055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12436000788073821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013936495773448447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21512580022359384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021392819398317084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14561448339286645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013308339769298708 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.15896932268065497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019322450604166179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2668532631896298, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00261075911429361 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18443888049277404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018323545176278458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.247794388992107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09928029909737168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json 
b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..74d7f6ff747d175dddce0162bba7676f72b3a76b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.15395480397824707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002402542953049042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24557521515724243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032199609208037362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17165885454358776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002266523062316873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0345849314932535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000925086814362582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05685215254790676, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014741611375388177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03887583188926559, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009318869486006191 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11262382269704937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001735467609168603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1853949845751863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002495030067359919 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12611107179368627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015753873989903184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1426484686620457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022377962576445335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22876101619014125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030299996388814796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15920866146470747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021025779963518188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.212268753332442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09749124513916169 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5df963c364f8d10e2859eb9f36a408618ec687a5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.05323367871163294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002019764759552398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08477832633294664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029256767843087337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.057194516095400834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019532780600384314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.011892613439611567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006525291351364055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.021014952711012305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011420293722952714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.013407675922368708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006687476239191229 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.03994947969161068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015323136750776513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0654114763710059, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022890377474177525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0429641799328658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014362645716637073 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.049203521589357486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018745212624263795 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07828616324493042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00270998239625019 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.052823275351703086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018085153746161442 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.5159665881377578, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0355848354666848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4508a5a68cbd336c36d7d336e9a676232b49ea58 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.008397509125114434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008676037544483993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.013182384245950918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012488579545023588 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.00885537771521418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008474145806800235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0018519121687661717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002652232008037664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003201546871291623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00042951400653203656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0020845828252393957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002776449859686965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.006312806712827651, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006339300731280119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010466170637582555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010119772330203227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006795329450745382, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006430172034977336 + 
}, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0076894646083944945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.000786099271409283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012227793486074013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011695312324965957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008112137370163754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007719985459243486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.133528491740168e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.288876136024227e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..76a3772ae3a7ba1403062d5c935ab9a0d087b5bc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.050536252348243965, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010303713135263982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.07249280873981946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012874203666130126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.05331824068511711, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009108204142889356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0011620656110374618, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00014360139445475996 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.001522213922602407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0001808244327036453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0011035986294212138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00011286020352486199 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 
0.046800024712754074, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009037799228350519 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.06839046112658723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011982762108255986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.049896775119845964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008243222785585304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.04562287089619549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009343218195610894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.06572279893423466, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011613172719751475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.04804101501692195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008043684532992358 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.025005452627405448, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.004798052609004983 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a375326ea4a326989830e1a72ef0a66f0c2b208 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.10336569841892168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015572986318203581 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.15980468261934022, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021843290096728234 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11453232328159935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015165645681982799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.00937353738620273, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_precision_stderr": 0.0004621640258235172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.01583249449894538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008242203780143205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.010646298254836605, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005004715634605305 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0790900720241622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010673462835914323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.126113983071195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00164608010815976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08826342080759857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010184729602732176 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09781712366399713, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014684084322258415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.15149051165585709, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0020607697766305863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10835203045408379, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014203571658610793 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.6148382566867375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.020840940915060138 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ec8bc5a4b2dc6f4740f90cd01da603be7aab2ee1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.1239472716246903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018733837629968672 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.19751962957109306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027503417388775164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1389508648632086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018875843940261779 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.018494424650071603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006803416768500626 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0314952015813472, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011962370296272414 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.02129586388647884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007438044749129246 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09361354942631474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012858909938593117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.15349834743094354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020743788462299056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10552665111055606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012791910056744688 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11595972550990691, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001753308531131635 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.18500426729144692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025796753919204852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.12992721021691855, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017593130424382794 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.157589240412585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06749886174670726 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1dcedd2308864bd7c068e3867611566ecd87b685 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11061508225201387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00219210163323952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.17229542877999413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030954808441172386 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.12031046206784149, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002096721222333191 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.019285223842382425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007386242173115931 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.032665716658571785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012591431714370847 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.021617853652187335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000749070842689932 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08351753016640841, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015962438735235313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.13414611484067868, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024039219745857114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09132576653371158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014923093239882957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.10363733813080353, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020595302264073945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.16163856120472586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029085585556816312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.11255926347759326, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019516313006938469 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"bleu": 1.4447905460689463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0650728846043769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b1e1a58b6ada031c5e5d066a2ea9a32264a0f720 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.029920984782127447, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014566610845788402 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.04785896357976774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002210898388888131 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.032690132376649535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014923647623479581 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.005659374256624486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00043051326798354176 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.009804204059300893, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000793438963373113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.00641884861944969, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004672678132495265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.022749908515385872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010830329089833192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.0373074119688992, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017190468611345826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.024933006612978887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011060087069036857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.027846049257508616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_precision_stderr": 0.0013603362197923356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.04432125424579462, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002040921786045664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.030339892293146535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013855806303329002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.10806636279782142, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.008181526680029229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3b4b5e89b2c5f68ff9cb08a7dc46fb7f867a5ce6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.0029392502676391357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0004853225161067237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.005172582525022545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.000765814668280875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.003428639083443863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005233153160450194 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0005834403624431923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0001396349568301966 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0009316463701771211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00020955827176984525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0006554801175224404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00015037462344468343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.002100593784823454, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0003210510857538418 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.00393696053174691, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000586165963887078 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.002504342041321112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0003638861281311792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0027237658351042765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0004415537544986123 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.004866254903763763, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0007242312593154887 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0031942471606000973, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000483410224652734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.464939987980638e-14, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 4.151354860776468e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec53a1a5649b7fe516922d88490ef0e69afb51b5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014794927843348635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..309024be7e233ff5a63bf0993868143f6398d6b9 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732961 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..84d82a9a58fbc2cb2714e45622d15b32c0a02ddd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0150806639915631 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01500870618212173 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a2fbc2c16f9fd4b48937cbc0d8c4932f238822be --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456736 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fbd87d3db3c4521d5392bad58fc03f6b7490be02 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01460648312734276 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..179b737861cabde88f6b041b77cf30a371bccbd7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014770821817934656 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014758652303574881 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d4306fc11de6c4e9cc3a901f09321457d369fc72 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015090650341444235 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f00c394beeef99926ce3eb5d334bb72d62f1988c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..842f931ea0d2db5f5604d7e1dc4d3475cd36d9a5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015195720118175108 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.36, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015186527932040117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7592880a7f04e748db09c7824e1dd440fb925948 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.014933117490932577 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229873 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d2ba43cdbbfdb38b4e1d400d14641d47a8c23b75 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0148995972428115 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015008706182121731 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7fa127a48360a6fde9886086d87cff5fcfb2b546 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356951 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01489959724281149 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d70c7f8d367ef788c850c53aa547a5f45231aa08 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795023 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb09adc2b35c04a8216c3d80560432b7cfded882 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01486539538592837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bc5a7a7f5889e936be88ebd0d8b496285baae50f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.352, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015110404505648663 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01489959724281149 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bba736a99635000a73964b1bef64341702a023ab --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055233 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..885dd283b8de6f871baaaae55a96e7adae04259e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014842213153411242 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229868 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..863351876bb8687f5cd6f3a37c3f0978f598c6c5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0146966319607925 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014782913600996678 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..496d4c2362d07521530b34dd616d52b6e23badde --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014794927843348632 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c22897940cad81b07fe07f4df8c3c14d38c9029 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456734 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6111cef351a9118e128dc5dd2074c4d04a42e789 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014794927843348637 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a2287c6b1a06a535818bba280c8141691c1dfec6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01475865230357487 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014658474370509003 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf251deb5934ebde88c29578466965ac287c4b0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014619600977206491 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014553205687950438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8bc4cdfad0e217c05741cf5ad0c937f7c55b995e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456734 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.0147340793093119 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c4145ccdbaafc8f13ca99c6f82b8b7948b722932 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928354 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6a955457c41b0ef96d6bb40e344d5cd39b828d1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01484221315341124 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014782913600996676 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c1726599b8a31ef84b0a4257a530c4fa75bf362f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055237 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01487687202745673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..11905944fe803c4fd8a93f9b627908879998b827 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015080663991563102 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..01b084ac49c997cc63b3c4e62d94ba5668dc073d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.01491084616422987 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229873 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e821201778c6c71f885bce31c94150bbf1934fdc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229868 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8895e9fda155bda23108181c04d00f1b5dbb6d85 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01494414023379502 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..73d190ee68f23ab1ab65df1b1eaa574da62db965 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014566646394664378 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01460648312734276 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0cdf0e3ce3c41dd4034fcfaec0ad8bde731f27e4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014566646394664382 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014965960710224475 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0971af08b77bf863455f6566963c4ce1d11685cb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880215 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 
1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9a1174a6dbc8cb0f70f080f489ba499fdeaf6232 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014671272822977886 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014645596385722692 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..47977fd117d504535b628b9189f9bd20947c3e19 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738863 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370508998 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1713714f86e494437f217a27ab38785f2fa7b069 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456725 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d21331bf586c6f745a6dd67333aac2db5bb89b4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4d7fd6d8cfb56d6bdc4cba588f525273c94ebc31 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014842213153411237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6eabc54d500abcc5e0de93db5f36d28ac08d25c1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 
0.014645596385722695 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01468399195108795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..90be270da9e3d4b76e5ee734bb3b668efab83f78 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348635 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b0570197054d11899671fef4136d78a4767d70 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01496596071022448 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014955087918653603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4399a669075cef657102b23b963decb575a44fd9 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795027 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3d74589f968d13dad1e6a475157528a6b1e12411 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014658474370509012 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977883 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..15f68d2c381fcbb27f18abd9472bd7652515415c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811495 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70b368dc9daaf4df683d8477f2c1b2a1b6013b21 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014955087918653603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0e5005dddcbe2d29b8542733f8234fabac5a1a0a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880219 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014922019523732963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a29fcbc459f54ade2a637d4b352707df1b1b0104 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014842213153411242 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..231cc4466b5660896c1a6e73a4a154a418761ec7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422985 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..413e6ccc510561a4fa9d53b445be034c9ec51582 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014671272822977885 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f0024f92f7967a022f42ac178300f0afcdf2dec3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014853842487270333 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014830507204541035 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..da92bbdcea335561560147e575c64b43b4b4cde5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932573 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..073979477fb4c20c14eff4e2a3f8e4ce6b4d7ae6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014987482264363935 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014955087918653602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ccbbdfb9200a7060ccf6ab8fdad666b806c5e125 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014955087918653602 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014899597242811478 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..af667c0bf6d07b2c0e43a4cdfa3f3d3fa04b8923 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473484 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8469c875e156c64c21ad55316b81874dcda864ef --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087974 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.307, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014593284892852628 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..96b27a64b1c98e2db1f70a8b471e3ddf770848af --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01493311749093258 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7e7015f3a4b63407943307028c775767a5caf18f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014976758771620345 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4797250f2a8258bb9247cbb491a9f68327a9cd53 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.319, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473484 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..44806e154633847b85b43f54fe71e0bcd591c10f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.0146966319607925 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014910846164229868 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bf5318e27dcf7efef04928f9cd667e6fbbee1fa3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618268 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01368049572576779 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..c5df1b073b224d20740df240b0e6a9bde4476788 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710526 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..836f124b8bae209be026379ed58895e4f51c4278 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013517438120881629 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881643 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..41d5b1378561bb11afa83260de778264d0d247bd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013688600793296936 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3458333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013736245342311014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..60def3a53525fc007c212a56ecd1e1ef7d20f1a5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013596836729485168 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013664144006618268 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1fe1d72f98d7e358039b1fcfc5eb3f111d244f9c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013490095282989521 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013452948996996296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f6fec6c0e627a479845b921de799b6e2227bee8c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406389 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e567ca98b1b296992cc15492d43cac4e6578b2e6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932889 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932889 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2bfc38b0407d9163c49461d109b7e6cd72bcdd2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013471620929769142 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013415009084004862 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce218c90b0970030b6a524f50d76482c65da09f6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, 
+ "acc_stderr": 0.013774667009018552 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3525, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013797164918918357 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9b4fcfddd5dee3a656d2d9619c2c27a00dc64da2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251944 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01364760294240639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4a1edb203b6f4dcd217ba2adb468007c05e49962 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01347162092976913 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881622 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..93c81595a8c9553d8e06641dabb13a9ae2fb6bc5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013672343491681815 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b38ff72a7dc9f6268e760aecbbc40224ae991dc8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013688600793296934 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbb6952aa8a08f18143000b6163c4994a1401038 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013376268790982108 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.30083333333333334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013244749345624925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76b520cb94cb11e3fc4d83a112a2ca01ee995fb9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225603 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7aba12b93bb08f85c7681b67d43481a89dae83bc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618266 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013613950010225603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..645424d8320864f368985fc6b11eee6b72200964 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406398 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463662 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2e9a11fe118b3c4b1f00da8c00003421291eeef7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.2991666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013223742523347383 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013579531277800918 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..057fe1b1c13ef38fb2dbc5f9a2cdc59d676a4bcf --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01364760294240639 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0085f586cc3a8c91c74fd9a09af5f0bb07c891c9 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013296358936471108 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3016666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013255174729956493 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7400efde56d497d6eb1cdf38302ea4cf6e577b8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01346230971200513 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013443538681348054 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..24efca6c4bd9a1c412e7e9d329fdd853f83ee6b0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01365589718546366 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01356203291952902 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..236c1c90de00a777130d028adca3c64d629015d2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31083333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013366457845965433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a0d648bd4c9f0fbd46d8745c2c9ed77a66dd8f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3433333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013712633830465858 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7b3b958a7248e842f42305859d9909f01954196f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01367234349168182 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f12fa1e48b3e57607b6dba198f62ee64e9c100d4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.30416666666666664, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013286140243317446 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103244 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cdb1abcd20a732632d3bb5a312b9f28f27282361 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3441666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013720551062295756 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013664144006618268 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e58478d6b2405448a83653c5f9c2a734b76413bc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01363926119093288 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..104851178b3bfc2977e4ddd6ec880dae18a7681c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01351743812088163 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821479 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..acefb5d44d55274e546bb34f70e3bcb7dec8a25f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23890784982935154, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012461071376316628 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23890784982935154, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012461071376316628 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..45d727a82b8a9ed25ca73310cdaf09ed3bd94aa9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948854 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dfe97d2c964afe0ea1cbe44ffc65a467f57facb8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012551447627856253 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012551447627856253 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dc3548affefaa83638cba50fb79e8f403d05ed47 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { 
+ "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587085 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012536554144587085 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e8935b8b551230d8d90f1ee4b5dfab396f883b68 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012445770028026201 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012445770028026201 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..de6013a8c3e328134b0c9afc371dcd80fa7f5f4f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2226962457337884, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012158314774829919 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2226962457337884, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012158314774829919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f551d6fbc0c8b47863372c04eaf5b4de1c0baae8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2645051194539249, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01288927294931337 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01332975029338232 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e5bce1f29e902fae3a5667e440674777b5a3dc5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012862523175351335 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2935153583617747, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013307250444941129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..618471e810a610adb187e7f23013e48ceef846ad --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01291577478152323 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.295221843003413, + "dataset_path": "ai2_arc", 
+ "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013329750293382321 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8570fee248f1f1d4eeb27cb541313468d8ad926a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01294203019513642 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29266211604095566, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01329591610361941 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6ea7d3f433cd5903084c238ebaabe31dbbcee8d4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2687713310580205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012955065963710696 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29180887372013653, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013284525292403504 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..f1d9109cabc66be44e05baeee18ad4a1e0b68f8e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26706484641638223, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012928933196496349 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013094469919538792 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c92b86cbe257625ec1c2ab6e72ea2a05e8a864f5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012383873560768668 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012780770562768403 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd641a4a6772c886a1b26db9363845e7de80073 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01266819862131543 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26706484641638223, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012928933196496345 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e49d9ded1201fe99d7cbdeff1a120c4bb4d00d21 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012352507042617405 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2645051194539249, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012889272949313364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f72c26c4a15484d970fb4cb6929ab492d875dcac --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587089 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2721843003412969, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013006600406423707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..18d33a18323db49cd4bc91b77d0e59014074dd7f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012304928418747611 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012794553754288675 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4f4564b5b3b492dcfb2bb3cd8f7133307c6e34f1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004741 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01286252317535133 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..398a9772ffc1e3e907a8b6ef98dc9160bc53a415 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012272853582540807 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012272853582540807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a38232e9d790d2cd14de0409d61e2af69cc3fc7d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01232085883477228 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01232085883477228 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e714a7eff13f7546d099931d45b71bb9d8a507c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004753 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012399451855004753 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1ecd6eda2059937b2deb17eed56d0bd914971072 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24061433447098976, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012491468532390578 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012491468532390578 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..97157a84e9348b457aa63cb1b5c52083d3fe3e7a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012240491536132877 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012240491536132877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1049cd47899c6cebb0279d58785749892511c3c8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01236822537850715 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01236822537850715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..709b21a608578dd017b949222a4fcc06a3efc21d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012696728980207702 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29180887372013653, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013284525292403501 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..055bd34fd728662093f517d2d41f2767f91d4d30 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25426621160409557, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012724999945157729 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29266211604095566, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013295916103619413 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1a39d44a631c534a95e3962b5f7d637fc0309376 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2645051194539249, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012889272949313371 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29948805460750855, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013385021637313562 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dcf1394da07f7fcec5334d10c694fcc9a5c63662 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.27986348122866894, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013119040897725927 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29436860068259385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013318528460539426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6d884777796f0cda3a782c63963ebbf577a06be1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012942030195136432 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29692832764505117, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013352025976725225 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7bb847a930c31304b470366c20f4e8390c821467 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012849054826858114 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2790102389078498, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013106784883601352 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..025183d545e1571837c9fd42445633f3d648d191 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2474747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008855114414834707 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2474747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008855114414834707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd7b75212e31af5b2042fdb17434eb896bdd1cf0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2398989898989899, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008762298774190583 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2398989898989899, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008762298774190583 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92d32216f79ef0ee4a57e851ea2aa0658852f884 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24494949494949494, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008824588611219073 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24494949494949494, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008824588611219073 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..87bb7d82f803fe94d4d8041443555bad075633b3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25336700336700335, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008924765424529257 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25336700336700335, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008924765424529257 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e810e9c86f72d86aa1dd3d47ce34d3673c65a14d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24326599326599327, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008804009846865534 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24326599326599327, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008804009846865534 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..33bd63e9ca5e7ade90f35559fc2be3aa57fc3eb4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008865199020660963 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008865199020660963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1bbf4bc10f806ac65433a1c1e50f3838fc85c518 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3042929292929293, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009441202922359185 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2840909090909091, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009253921261885763 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4fee54b81f5318b645492f007884104028c9487a --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3063973063973064, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009459453573398327 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2857744107744108, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00927038060698121 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9802295c7beaca2636d865ffe35bb18fd7e97331 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2946127946127946, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009354224395837097 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2849326599326599, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00926217069559066 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6b06ff5356ea1db1ae5c0591266bed2cda56d90c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30134680134680136, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009415259879351616 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2916666666666667, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009326752065621165 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e6d4cc93d4403b984629a4587850f3827e878ce --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.29713804713804715, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009377397867796849 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29335016835016836, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009342508331708561 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..19e56d5f3902a253f2e3eaf2ac048ca34dadaf13 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30303030303030304, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009430140669278955 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28703703703703703, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009282621598983076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f8baee1fc7ab85fdb15b08db031578dbdc1bd84d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.25715488215488214, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00896839476897199 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26346801346801346, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009039157374497706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7abd7771b3c55054d053b085757e3228beab6eeb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2836700336700337, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00924978169114074 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2878787878787879, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00929073316167016 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c66d456b3e880e0d4105c92c50a0e67b6dde5f4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.30723905723905726, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009466688832475376 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.30092592592592593, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009411516193787195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1c6ff2b7bb8bed0ee24a609b892ba49cf4f5c82a --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2849326599326599, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009262170695590655 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.29797979797979796, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00938504606669487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a833baa666c413ae29c27ff4c590a2f13b0ecabe --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2908249158249158, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009318815921176657 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2908249158249158, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009318815921176662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7a215d665c5b90e9561fd496ec5a5e3e44ef77a7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2967171717171717, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009373559492986846 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2925084175084175, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009334649503078416 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..11f98f86be0ed767a25607e1b491b1d95f5c4564 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008850055161459236 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008850055161459236 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8dc155cd34fbbd4717d53e6a9770b5ff66a9689d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008757032594354027 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008757032594354027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b9154c49d4f885f135fc0e8a0975a5620d5c7b8e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.242003367003367, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + 
"acc_stderr": 0.008788455043255558 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.242003367003367, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008788455043255558 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd9dfa73be2fa66ed33bb9f89e0b6c7ebbbd1834 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24579124579124578, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008834809366391494 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24579124579124578, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008834809366391494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7a924a231bd6c46bfaa7156daa072d71d58d02fb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2361111111111111, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00871448049171129 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2361111111111111, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00871448049171129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline 
at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1515ef24efb584db006c30c2a0a28571b0f25129 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24957912457912457, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008880241465504344 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24957912457912457, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008880241465504344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..776cace1b810ee1fa84b1271a78fd0c519248411 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30134680134680136, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009415259879351615 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2845117845117845, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009258050925618825 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ba69292909343732ed3be7a6f48af6cc2f18bc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30976430976430974, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009488172851903717 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29503367003367004, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009358110551087427 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..02c5297c45c146b528dff29e7f4cce222ccbd249 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31523569023569026, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009533589368505863 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2946127946127946, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009354224395837102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..917f2e9c7606f7c9de195599e18a2335031a7844 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31734006734006737, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009550648343947771 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29208754208754206, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009330705616569073 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0e04cacf91fcabd78d7106fbc517755500b5ecc6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30808080808080807, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00947388707582633 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2937710437710438, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009346423298166723 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..00273c4a71daf5cb561ee5c46ded3d0ba6daf774 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3101851851851852, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009491721291998515 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.289983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00931084097076903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c08b6a9350a36ad4952278876bf3aff8a91a6048 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6163333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008879665985151403 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.624, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008845002997512754 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_1.json new file mode 
100644 index 0000000000000000000000000000000000000000..2085a8f69f06ec367d9b1cfa393aab3e380ad158 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.596, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.0089603624944537 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.633, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008801296548822387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..32a40aeeea130d51b7b7a26f9ebbe33d1b6a1603 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5923333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008973202213879655 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.617, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008876744835033232 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7451890416d3c64dd4a3c4ab7f9e8f5c2e535ade --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6083333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008913348354532972 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6213333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008857326053368308 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, 
+ "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6cf63715e16bf5646dd5dd7f8cf25de0e9a31795 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6136666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008891174310695492 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6226666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00885120015653439 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f577aa16a473c3d5cf20aba9b7c6930595f683f0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.609, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00891063782727302 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6203333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008861873799148993 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f7dbd44d6b49064b6b83b5b5e6fd4dcfd69bdd00 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.622, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008854272003440052 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009064255084676055 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ee12d479234a644b3e41e63f66fc6eccd68071c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.546, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009091509877386519 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.543, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00909640486825282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..94aac210f9c7cb66d6fa94288bda953dbcfe9766 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5926666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008972056373066369 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5723333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009034185176145654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a2f46edc3cc059f96c3aaaaef137201c75f954de --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + 
"acc_stderr": 0.009012606487132148 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5613333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009061278956794627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..101621507063152abf55ef5cbb07fefa93956558 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5633333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009056690207178123 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5403333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00910047692710895 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bb69a0245a37df4c64ca06247ad77f968a344180 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5546666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009075496684215473 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5166666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009125157363376123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..4ed938aa8505ecc817ef4e5f56f6ce65ebb22ac5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6183333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008870849530787626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0646a66781175fd2f28ecb1adcea150f2fd57973 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5566666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009071405243621038 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5473333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009089227499483241 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a81521738b1751a46ad3a314530457a89715154d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.576, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00902414234419792 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.539, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009102414587191052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..674053da7683c1bd34304da1d7b53092420d187c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5796666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00901359097963683 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5516666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00908135501204554 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7255b716c41e490abca006c51f80631a55084e63 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.593, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008970906255948515 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5486666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009086879312708494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..daf85b300cfec3c11f7379f311d9169e37c965f1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5896666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008982215188519146 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5536666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009077486613450288 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0ab43b62546f13dc60fe867b898fb6f265cdbc1f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.565, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009052751926300883 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.4096666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008979987547601863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..36130a9467b7db028900a46b6fa2666855df0439 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5693333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00904202497793108 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5513333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009081985306932099 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1dd9ce96ee47ffcb36bb10e553ccde3995a36af3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5973333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": 
null, + "acc_stderr": 0.008955564831687456 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.576, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009024142344197916 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..300c0af9c536c409238a8ac0146cd1848a064a2a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5966666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008957972256087366 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5606666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009062775319073721 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9becd69616c6d047902e5aede4a3c051cd5c979c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5913333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008976614094836195 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.556, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009072785596468859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..e5a5e3786716b7b4be6ba3de7fa3a7b0321191e2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.583, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009003556038613138 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.545, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009093178503605503 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..882406485a092fe216875feb612e5c5fcdfb7672 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5426666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009096928229880426 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..172e1b2bf8303b651104569ed15ce0522c2a9d38 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5436666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009095345834327868 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.554, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009076827433934427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e4fe46d9d371d5307302555f46322193e7e61434 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.562, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009059765989615446 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.573, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009032396953831092 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..553b4007530327d2e91b65a5ff603843713254b2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5646666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009053547904033165 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5933333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008969751860881003 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4a9cc373d366cda80b716259431201008be7090a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5516666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009081355012045529 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5833333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 
0.009002529294393654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0a346c13e0ced78e1abba1c2f2e40047b9c717b1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5483333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009087472531749432 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.576, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009024142344197917 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f6db67fe5f7763200acd6220f8d3adc74377727 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e183b5063e5a005b467af8fa3491338bedf52ffa --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + 
"subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2824214792299899, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe2916ed8b63bacdbb613999da6ec8691d23d9f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.28708133971291866, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e1fd247cc6b96bfae9bb035cd3e86e144c9e662a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2485426603073662, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..94deec47601f59ca1801175d822e5555cb2e2a35 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + 
"prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.25564695129912524, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..62fc77951078b8916545f0ecd3d73e8c8d04a83e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.325, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..984c67947d81bed18c3d557313d6cfa3072da558 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5017e2404add11db6b12c0e7331d8ca4599d70b7 --- 
/dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..007e7dee459bf1fbb2ad98f2732237e35e39fa4c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.30977982590885816, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..325f147957f56a563903e1002c117144fda77b27 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.20370370370370372, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac5f334743b80ba568ffc611d926b9e7f3abfbe --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.22305244223052442, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..651ddadafe6f87c5f6753a54c9587ead27fc617f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2295932295932296, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f3b6816d328d627f8a49f1b46106228afaec894 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731726 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.24789746965043147, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5036430769f626f2611df1100fbf9e22cc2efa5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2794380587484036, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..113d14d4e96688b162207b6fe0a4cac711a9561e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3141821946169772, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e31c23725fd1dff801dcc3726401bfeb359c8ad6 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.32702915681639083, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5ee3a6884ee1b57be553b4552f6e21f53a6139d7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.298989898989899, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..228721a1f701673bbc1d7932b0b363bc73d5f8ab --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.26798881261123825, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..378ffae5c2c3ac02b3b934d453efa3a4c6a941ce --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + 
"prompt_name": "guaranteed/possible/impossible", + "f1": 0.20779220779220778, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8fff36f524227c06aefe4087d97b2120fdd91be1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2fa8e1e42a3d93f227c5a131fdce2c3ceb77d79a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.32608695652173914, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..2ef15aef130b0d8bafc8ab966c04f2fca47585f2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.27226982184142523, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ad466e68aed91b1128e7c0a3496c6a617e94d042 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.37449908925318764, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..62d9334fa6bedc095036a11136d46b8f49b9cc35 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.270516717325228, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": 
"", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..78789482308438f009136d6e62433f4ea908e067 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.19642857142857142, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05357142857142859 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.14814814814814814, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d86c2ebc495ee7c73d2fcbdda979b6cca249af4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3172825681224338, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e829d793198c942c41246061def64e54a4d2f66e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.310790273556231, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..328b74b7348071c62e5ec13190a50df38ddc9a85 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.27441920164292133, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..704f5ab9b662bb46a74fbd2007fdfb0fca1c2789 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3224993701184178, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7881cfbae75edba7ed9a4b6b16901e9286de838d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06585388898066351 + }, + { + 
"task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.28703703703703703, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1bf42d693ab914490e4a30833734fd5cf4b9ee8b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8f93ee85b8b170ece388b26d1b0157eaed3a743e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..afffd0366036f27291ab383a3936249ee797beb8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "copa", + "prompt_name": "best_option", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956911 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050241839379569095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b98ae801f5d6d437a36e4e441ae45f441b0a27e5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050161355804659205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..50371c7a1c35c078018b5329bedc1792723168a9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..b262268c7992e22b6781969371a739d654650d07 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050161355804659205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7a0b1a27834d42a3f984efeb3193d29544971a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..115e5c24307df12534bb7e843d5bc1c0a1b4f79e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04975698519562428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e62e514854d43b7ae14b6f3edb922c3bc77249a3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049999999999999996 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bb656e7ffec600ae151fc73d0e127e485fffe6a0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6edb06311162f1a992a10eb3fe1cc7c09e6e4a37 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05021167315686779 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb56d1b3118aad18c218c7995a30d0618e17caae --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956911 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050241839379569095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4fe9d15d21eae765ce10df412a8846a1f7c9fd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05021167315686779 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..248bfc110de1f4cec81811983e4dff7e0e206938 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92f8be19841e0cea9f4fd7677c255dded0d195f1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..186ab15a49c2413d146cac32396ffa6b6f72324e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aa55e82f82959a45fdfe3c03876d8b1d0e62c8cb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", 
+ "subset": null, + "acc_stderr": 0.05024183937956911 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..551de067ffc3e087b76cd68046013dd30c3b87f0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..871f82b7ae9ab7ba87a09271fcbd0a2547b65be9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1173cf53c248b42a44695620d39dfbfcde428bd3 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049999999999999996 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d227eeafb64372bb72892720586829c56d7bf01a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5dc68dc4ea91c6559effd218dcaa96e9899923b2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end 
of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6e13fec5a939962c3e5997173a8396628a23c2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956911 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..85232b32bf276eaa8b887f18d2bada473312e9cd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956911 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7093d73ae3377c89397be9df6f8cb1ff5a8bb04c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0f5089b7b1fdfdca565bbbaa8a55093de0745a07 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..96f5d65c89f727017e34ec3567dfd83aec3da610 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5751bb653a4ef5cb3c77f6bb66352ee14570874 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.49, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956913 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5fafd037646e063146390e0205f3c059055abe3f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..17ba5196d712df4755c9c9c021b0762677215fe1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.json 
b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b7268d3dd24fbab924e1bd9c08defede596f9a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 3.6185138767655642, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.027545555030891664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3580184040254419, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002769550276517225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.44573433671378293, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029443445157491567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.37659938602738596, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0025625344135822327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14943282342597658, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001681547997613015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.18664925231515925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017720627212135355 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.15645061177192066, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014050173197206021 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2605556278541228, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0020429532483913155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.32788024129204274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002331773243025902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2727904799998107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001753190259230686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2944910613704785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0024027482769953855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.36501687453094783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002476795135078731 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.30749802061359216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020994896088046516 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..73654e56bdc50c62180b409f81133ba01cab546c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 5.940650821215176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07251202191064558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3562088549882284, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020586504966668153 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5809146861845771, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002622892280494673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.43127560616047034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001978521697967289 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.1532140575666167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012898457667003178 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25724312942661537, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002124196698748588 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1870937559813721, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014484391370237738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.25186821456916564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0013627204573516862 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.41938362031941356, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023184382496243294 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3071708177800624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014249199674985732 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2932254446894908, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018317984365110538 + }, 
+ { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.47920715808647235, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025056807862094345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3552092130581474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018353427248767173 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..37a6e789bdee1117b1c877fc4649d8e54491c8ee --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.33821093669954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09379960768199452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3436726890451921, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002099670338530863 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5695885225308102, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002705227771510273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4183289686770281, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020625774095387943 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.15145968082746675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013480346131076102 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25931282860324156, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002266505933044581 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18600518275150685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001526977649422954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.24856833227974034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001426163937446131 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.42055124580774705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_recall_stderr": 0.0024091709808996154 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.30465189918657554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001501929754244832 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.28486827225361294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018810468253166184 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.47345290666526635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025997531141343233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.34695172233964533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019140373403072556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56addd2e41b46e0993e4c9dabc974b131c6cac4a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.311617860948694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08548221958370308 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3349961502490069, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021680799575113964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5565544757647279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028291574048521497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4075743912737001, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002146090481914194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.1491383011087303, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001393658080239258 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25579275901300663, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023006833356505204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18307097946148873, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015714553612069671 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.24541633137395016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014787103833905858 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.4155102693950093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024152269725431605 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3005272202343599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015463466057729666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2783595624546861, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001923194297919911 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.46345985055371536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026579629870791487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3388120379840606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019607864106101058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0150dc6676e86d37aeb2f90d3e445bdd552a862f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.3919491491112925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08289306046133078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.34763102577482935, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002149233305061666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5691735901016647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027018434473941233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4212144473373565, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020829348398401265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + 
"rouge2_precision": 0.15665647152252904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014198314983417174 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2641675222249262, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00230189707841787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.19134136835621748, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001586785118204661 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.25481057551592495, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014950883188841327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.42503194148437995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002378950785673738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3107651494371883, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015448622097670513 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2874633204241909, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019236933160864085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.47133290982559206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025524403420989594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3484043781862422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019286965322109623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..18051ff165dd293420e7a0405cf3fb74416eae31 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.2572003772902205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08895652378880192 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.345895095360714, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021030157057793835 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5666156627306035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002673571521803025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.419255158305618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020235703061870854 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.15435273004708677, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001384871199965745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2608233042022455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022656300390166877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18872128486346074, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015510519443660666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.25300940768488106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001425167078952542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.42247328220972996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002353052068466571 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3087981367592903, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001478900728238061 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.284821827767446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018887063780582047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.46645604559897036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002510391137764271 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3451904558791544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018918033978421436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c182f7c6e129c66d01949a96f98e008ec833400e --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 3.112918446128787, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.053810399237995354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.15456658592030545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018143739370322604 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.3025102581811532, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0033588873082462944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.20084482886106741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002261750612159959 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.04828817820081409, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011102317342711607 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.09771928488920142, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022721293090265083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.06347842363431547, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014480298422468413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.12503267788087088, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001400501149320938 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.24757730148054255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00273114149938929 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.16303102771266634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017676167469174691 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.13774800739240092, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016042003429823542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.2701867863891116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00300144186310966 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.1791143226266472, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020064066499178943 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e0d945f70de010034a2afd02eb5285833e60854 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.053334944424697, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06721594685515214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30801660806432923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016273405349466305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5619641447190051, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025318813867833377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.38936702412346486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016662898170015916 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.12945275179345114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011074882101870791 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2439184132930377, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021503693219860074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.16511209673657395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013352227993337418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2092448958861131, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011482208527024708 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.388326334625175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023092628247571192 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2659105891113473, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013002731121155686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25794640278330544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, 
+ "rougeLsum_precision_stderr": 0.0014986373993336347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4707609586779557, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024378163211520084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32608244869018355, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016028641869031382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee45c73de9172e345b6c5ef8e6ab02350b178e0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.561233872713581, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09108477364016405 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30557997424529676, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00163203899887396 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5616556592909835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002469578127127456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.38740329284198877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016694245894016385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13344266402318036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011422005728536733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2537552926960219, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002233457163496332 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17074360575215342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001379403429380591 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21330386118745845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011724736772492886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rougeL_recall": 0.3990960192237072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023163293106240614 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.27191789630264573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013215420536361491 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25891502247240206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015027118915720513 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.47690020829771507, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024407095587869695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32843508910149455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016104077951613867 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..73138ff2344fc18c46b01b2189e3e1863a6c492b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.776788277555548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08908888038445728 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30214712366118474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016165598811510716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5592881145815344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002519531092257658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3840971278102562, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001668160634088681 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13422079490420966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011667771588757267 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2566073804641227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_recall_stderr": 0.0022681846275686155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17213478001357976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001408023581847778 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21374794181163875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011830966338559523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4017997775289984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023191692353490933 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.27303357092908037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013329054733735002 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25778209737895724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015185694019430708 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.47756511166340027, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00247847378438377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3277571464283619, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016344606337249433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..37a9f15ced81bb08f78d91d838c89cc68bc3f5bd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.799544352473561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08238532707877078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.2983364923590482, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016137840485591932 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5543295832415513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025227410815235234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rouge1_fmeasure": 0.37986106085998556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016771413118236993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13254250642240248, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011814668653123278 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2537164843195013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022749137857616354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17010384910521295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014281699846839927 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21162550086306248, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001199938107743782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.398387439159313, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002291054746321359 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2705591293748383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001350504706167112 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25543730355908567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015294759522292728 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.474621306847035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002506933541584412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3252326795969852, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016631041936946884 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c7617012e4a56712083e93cd71e0f51c5ada872a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.697212269931585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "bleu_stderr": 0.08297111861263176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.2957939018771136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016095004602289511 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5510537049287292, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024827548140078834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3769172098402743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016610029156639574 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1310898884871186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011695549044830754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2514989870527316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022100542132105377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1683711858028947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014048286913860427 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.20985604615376746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012015087997438354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3967540619875405, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022977607550473756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26868909350069764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013557031410614696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2527358286967553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015401236971888642 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4706633716608987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024801531809459493 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3220148127414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016679134262009298 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 
10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..43c0d2a21926550a7a55d8f50c03da893b30176e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.728040337370279e-139, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 5.383410053108398e-114 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.01413888888888889, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020691161490258166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.0008690455993586293, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0001433594447135255 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.0015568503799152772, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00024935374592081596 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0006693121693121694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0003915966570899454 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 6.965488215488215e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 3.5859820215685834e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.00012067093428409366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 6.188322773568673e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.013305555555555555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0019973445104400247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.0007596001007290696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00012365940229831348 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.00136520023174339, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00021513459627798125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.01375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0020331190505576643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rougeLsum_recall": 0.0008096001007290697, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00013118909454719753 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.001454791799595827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00022889798881739265 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..43993216ac2644539e33a178998bffe8cc3d1599 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.20327154719905477, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.034768468483106676 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.11777046155460656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004787552105977667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.0755797825871043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003197252899893255 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.06799149288514703, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0027079375902244007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.025207113777389382, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001302556335227448 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.029482308327976877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014690441197124206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.025195913355673966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012304784432672295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.09889828492871724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004441805820458746 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.0535759232563342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022760149883595216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.048776083503696964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0019127958949913073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.10804282075733199, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004604581172532683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.06371930488469042, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002700349737694304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.057898257679435405, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002300168304929039 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a278ba52d4846db8021f527ca883c3d94c9cb0b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.5836328567888602, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11778913760335698 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.16142810684796663, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004922963050007353 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.12928310552643027, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038524381489268497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.11549730510595205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003274787299760805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.04496956013107435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "rouge2_precision_stderr": 0.0016446318482885244 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.051485578046763486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017825906801569658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.04447784117945149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015076383673862696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.12870133893762628, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004434980395614415 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.09129308646670774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0027323522213050914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.08223474440316005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023266800965344265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.1434849944924965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004641427545467661 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.1085877089549346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032584779260767995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.0974072227519707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002774866384923383 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..201b25764843c2b1b1711ddd4749dea1efd5ab2d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.2251973658832964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06840209821605239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rouge1_precision": 0.15582501118061967, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0050057405436943385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.11693432609433732, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0036830042145809486 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.10350129867546688, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030845729308722892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.037407232865964785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014744828664782648 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.045276097550827864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001668077617467975 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.038284747118588126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001383633589370129 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.12825581895473886, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0046484472676466704 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.08317479654292953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002625824446891428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.07447530948404624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002203151007472917 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.14099199993249983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0047989121526212306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.09896314553373009, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003133393959775989 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.08797882063149828, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026214171075166757 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cb657847a3763698b3cc56b00489c03c4711f9e5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.151782007884036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10917316383418346 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.153541975103279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005017944717364684 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.11318510222695441, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0036272890057710877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.10057563196631636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030484749132734744 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.03679979821425693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014791535592911546 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.04434331636886536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016832005869182524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.037516989850184534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013843301579119466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.12610380196461451, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004649593887639469 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.08015880317670764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002581090415922211 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.07193779825133709, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021569276149122152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.13842028827055886, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004798912895625669 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeLsum_recall": 0.0951361419739092, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003071944475077794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.0849355679057312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002576383143938577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dc66d36b781ce0103079c83f6652e9a93a60cd46 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.2641292736786365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.097056472591506 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.15290909151602036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0049595466535822955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.11499927704768495, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035939559105208365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.10178181807506882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030246507145682004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.03683534296753499, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014422357682860986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.045310106634696305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016718535230670977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.038242180726931196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013857098786122237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.12582895602125432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 
0.004611397478517857 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.0820526177378216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00257106175276383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.07332298026105327, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002162653775276333 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.1378091723714546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00475897658619402 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.09649871874135414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003040230110947172 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.0858643495813317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0025590711901703174 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6aca0ee21835e3099fc3b6ca43b3c5ff42ea282a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.02601889547824242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.008845990174481217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.013188289488289925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0003020110162685144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.02331912229418711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0005487407492538043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.016342850382717815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00034912236129064063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.00018882696164487857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 4.0143613745290394e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.0003925072247489244, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 9.925690772584578e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.00024104025657346095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 5.3010616942847675e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.013188289488289925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0003020110162685144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.02331912229418711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0005487407492538043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.016342850382717815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00034912236129064063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.012772945572946016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.000289959796992291 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.022577540598345845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0005237847970514886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.015820908294670494, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00033227991604236137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..67ba8777dfe9425951449777e51900ec53b6f2af --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.11194167971178, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11325409958385195 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.44662580344767955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 
0.0029145470980427935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.403325138761631, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029812540605776657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.396708212066539, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023271634805327288 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.19849995481108837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0023301034372897886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.17476709268624893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019834376005625296 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1714205638298909, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017391694364972787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3259862976774872, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0025491424613272398 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.28928330619087117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002329063084463768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28469304359904984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018043411811584805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.37806486217321694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002787004218176944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3367501905387613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002641205675752919 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3323030796627126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002111384569072981 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json 
b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c9f15194d1954169a1891835c4d140817882f416 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.765851233592166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1147637687545087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5029109698404333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032326251122724503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.399072749631299, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002770754484292409 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4194853660757534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002222220598248002 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.23608052397188775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002567411684370418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18324767332759007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001989894281113984 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19259169221915515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018423181170468268 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3660346353864222, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028533265797324394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.28763635547225785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002249079446710024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3027130154416743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001900798692043736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4152150772992765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030631524445173153 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.32804484742896467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002520109021821381 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.34512069878778673, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00213278119755698 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1e1c42fa37c491627756dbbb5af93bc35b0e76a8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.504414399066166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14144404460789148 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.509958754752616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003154730928607417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.3954899202585752, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002608601302312822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.42179412730593185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0021527795098671134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24242098353513022, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002484157206893866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18360253637850166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001904297402323581 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19636018570824587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018014064871279597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.36777401786978037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002648283289980144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.28409076327843025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002115488452238855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.30275824983777616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_fmeasure_stderr": 0.0018031245552577217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4185549498251625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0029283489596877298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.32342169868683124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023588979812351725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3452111361624014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020448977561221436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..df2e72f6883f27c759821c2c478651e60cd0509f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.453461006006084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20323399299325623 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5152975398825912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032876121566522126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.3875757012647283, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002563136912882847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4184745538314975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002152424221911221 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24767323400172267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002578983235298861 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1802765312255684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018268424519338505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19590832872090894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017695553874619732 + 
}, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.37129359217610464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027272181980460375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2780682094337028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020775510109816452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.30013711995116676, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001799774395616833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4223690533102043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003005111083307187 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3169024351381635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023241001320442878 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.34206938848652774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020120604230570572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..738a292d845d4076189f24181902fff96dce1fba --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.336987597938899, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20513507856533955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5143545157562858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003337934175013788 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.3857868235592206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024923995035469678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4170315564856164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002124259039598598 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24631251817723201, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002657555991369566 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.17787346956272435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001792217462473695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19402158147865167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017876994973534497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.37299125515052484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002815154394732632 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.27850203471081203, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002038628599689064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3009842944910936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018055725861154817 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.42379837225389067, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031034115053880863 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.31679500554979756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002259000761426462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3425854571657328, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020124639191903327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d848db89f87d680593463b24e3b907c9835a43ed --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 3.026054687943056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05078028430795098 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.22893008098167747, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge1_precision_stderr": 0.0023230957141361385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.3777465242623155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028593004842754027 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.27711271390608916, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0024014333114825595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.09022137807900418, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012595960763224963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.14929791491865801, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001792572395692284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.10910465326076894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013890979482078735 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.16625817127303685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015302425277458882 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.2807767212855276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021268958542967034 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.20293981930119875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001618501844111904 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.18691135016064478, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019403851673907941 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3092909346574639, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002464400666534724 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.22644058640190284, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002021283150451286 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f90d0819e44efd636305a7aa89e11f9baf763bd4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 
+1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.1092098281513625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09127976996292099 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.38095564749467725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019709425802198264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5860861386660807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002426219681779596 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4510092881121515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017948907933983346 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.1695045775140401, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013385932546957934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.26780052036509366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020869517300861743 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20219167803744306, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014562059203023273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.26311648134376764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014102722543714375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.412622808220604, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023307564982918876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3135694240329392, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014545694851811255 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3132751166335577, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018346757269402277 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4820736892660319, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023982212141781824 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.37081917705524653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017754950366532115 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f064eb942d2cac2270a0ed85c60c807f7675360d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.46596372404059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10222019609135197 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3651355772456116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020414630989659278 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.576357756634015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002576181875133805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.43663869577593034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019277436539416571 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16361081730624932, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013840748749673782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.26604599291958914, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002256079964522369 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19722529213201134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015363235060857502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2602719118470187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014547965301518198 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.41863229087687953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024024801780310424 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.31324149645192695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015109155091120891 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3042042275101151, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018792439139020645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.48113522705415146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025570057038425613 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.36392101007128824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018682328209929703 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6d3b08057f2eb7ae038d3b2589ca9078f50caec --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.660859618805935, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1172038453417659 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3587661880944985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002029335942530353 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5745232266800461, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026290745515900977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.43141055859111643, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019365140286472272 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16212842829928545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014186202913670232 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2670249903635504, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023104076885023827 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.1964954395976402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015922666319257745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2554771790178779, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014430344926745968 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.41621281205529004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002379747308022119 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3090193571542357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014996989848492066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.29982900805480683, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018862992882486876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4809216581836253, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002619984298616209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3606858214661092, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019006216836532906 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ac7b9b5ebf3a89b75a0f4fc2e6180f7a6eb0607 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.669912151694831, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08998324955735416 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.35581234182428756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002046080023554725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5730282795070204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002647698915295993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4288209301236101, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019594115330927517 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16088582394762457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014414061079427547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.26635743558551805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00235484941364112 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19536984000862256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016184104384078227 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25423407055295205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001471223089557672 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4166450671427576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024110898290359713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3082343858102462, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_fmeasure_stderr": 0.0015342038958960327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2978062975462721, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019203204157120337 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4794333800307403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026149511981629395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.35880379298869464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019316365194543348 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c66abddbd51711844499465a744c78690f1f7406 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.620027445143791, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09375792130378542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.34835218897272274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020272801988729847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5642232234850835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026596537106805445 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4206980580109448, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019612026897740434 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15719137843889044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014304675402805396 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.26134576822206673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002331449346276716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19119099944111612, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016112209219818218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2505811728785357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001468887250836978 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4121293895675098, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023983911022547304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3042241306518907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015396594533973271 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2925850043674371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001886060348908665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4736084330903114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025599032117973853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.35323293802974065, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018984796157355765 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..51477104d0adccb9cbc927a0a8b1895221873d7a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.07843610918389116, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016304546568532293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.19781222980051757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004008447460929873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.11088128154225567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002247020389730472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.009871857582884632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0005640034860620715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.026368970171206544, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015701889057261075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.014155568509608755, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008063057663091102 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.06563283750134696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012425342106169393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.16622607422512797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031008472748429656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.09287467603067165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017134113448546992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0633625318403226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001306257704460161 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.1611657589875393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0033067458850410307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.08976009975925459, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018118631890276285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.5037467911105132, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07923156278606476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..69c7a54d9c34f1b81b6e9ed7158bb0aabf50cdbf --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.12349083765294645, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0020272360845469154 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.24222254243586877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0044424282559846874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15514646838340018, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 
0.002450478109255688 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.015138644522028519, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008835351646857368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.034641410753845345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0019099475070161016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.020262527556005907, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011049454084872373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.09008731584721652, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014860613149287796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.17489515829679106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003107522060120389 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11242576970395425, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017048075155665426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09704873972752216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016068769402850923 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.19197702048862725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036060489138808275 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12222801015667094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001959356143537563 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.8932555367691859, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08607396085989544 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98f298fff92a2425bb122b9390e0e14b5ac2776e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.12268381637109076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018383239534197776 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2846467249093633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00416866407812642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1672594497220971, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023821447431805398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.020208437649204568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008690875508991531 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.04992195461086151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0021341735292668943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.02824595859604695, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011938469906633742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0902464384634346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012744889966160597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.21079924235160477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029910133249198756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12312979455198288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016331251581310881 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09655299851334746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001434721313601341 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22560901395675043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0033495485137766405 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1318735139607911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018667917007005078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.1080989461946913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09146307320028801 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", 
+ "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..81c7abcd316da3bf637584ec84198f46f370ec24 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11891262113570582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002001687206629081 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.27367870029835756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004475568220518732 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16149874282108523, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002582011824364168 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.020146490797565537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008642878920700437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.048883434139436875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0021010331707940486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.02800561543388405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011878939240017126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08847153927567653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014199450476196438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.205201991606198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003311790274919139 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12036250889852927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001844426823256787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09480824337780196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016157369586947718 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.21941969701456296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036949095354463033 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1289105526954833, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020915120619956393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.0644281139673455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05948687397505724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..379e9a21ece3a890659406e1d9647a705da969e8 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0354039096330574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024221017075653785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.06348261494972335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003911537053406857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.040556158583151695, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002414780159795799 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.005222818308406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006123821346103362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.011105721412738316, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001297198171401599 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.00666835063292078, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0007560756618639585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.02786040640367174, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001905350850283381 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.04948373109395915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003054509322886882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03166574075281406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018736700983275614 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.02824398060982596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0020175634460573052 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.050498368281708256, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031803074224818545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.032061891009607074, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019225657173298427 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.5155503095519125, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12577156331553185 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d675fbf6327f07f745f6c07b105314168e8b921f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.002572898799313894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0014841881904327362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.0001111004565947103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 6.627016158645234e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.0002124229718569341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00012635147814425192 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.002572898799313894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeL_precision_stderr": 0.0014841881904327362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0001111004565947103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 6.627016158645234e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.0002124229718569341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00012635147814425192 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.002572898799313894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014841881904327362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.0001111004565947103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 6.627016158645234e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.0002124229718569341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00012635147814425192 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb9588d950f06d9bd49da5fd97ce99559013fba7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.05503216443294283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0013208178441723422 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.12247694126975207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0025016532931361178 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.07476921388091318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.001653767538430365 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0044345537535997155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0004672948221132119 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.009160898916001237, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0008442058248417855 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.005848067139995684, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0005763476196672857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.04930420474602141, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0010839279734555082 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.11045765316293539, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002067309095667932 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.06711829058828381, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0013521377427549437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.04612657642022773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0010535188227501662 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.1036251864997193, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0020318246258696476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.06282572428962137, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0013169186728595223 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.261262554550647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05001884278243581 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..103b3dc5d0023fa87f05b8202060a6f9e0eb539c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.07325117374129199, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001655255942277255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.1712083046666863, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036789822383315267 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.10028369295745565, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0021540097009297766 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.008743555836383905, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 
0.0006295019423085548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.021042178818604606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015442126736674505 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.012072025290438592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008659837789447237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.06221015702227314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001239278126124422 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.1469982161156331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028600463911305847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.08551310402778783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001624161272115942 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.05955098102853854, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012927157425887115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.14057793619440717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029763151410751565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.08177586301155866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0016972607545623106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.5215024206550787, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03819786214468247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e6d16ea137e260a177cecbb4641b71a27d156584 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.10254403634811139, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00205888122861127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.24531036433761338, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004615810458504819 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.14215569897506622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_fmeasure_stderr": 0.002716440843767019 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.019657431817826233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000956296408061839 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.048701420734218445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023261449730351148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.02751335673945438, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013158018001875164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0829632148581498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001546655224020721 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.1999850784660946, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003529529330015122 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1152398964081201, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020348120567768926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.08247426765333944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016467303434733415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.1991641708522746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003807983414613027 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.11464083134883725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021942880329705087 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.127011706781588, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08800278654546138 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3de445a24189679ebce594f3fefdae70dc7fceee --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.1271432311032113, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0023074163775109917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.29372316623222805, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge1_recall_stderr": 0.0048620902876557745 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.1724090676689418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028418088279796444 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.029429750673113986, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011851736679672697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.07034873459931881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002719393016178684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0402095932041227, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015367221854734703 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.10042286402298024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017760282981670063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.23390892681593342, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003893287643962479 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1365099491658134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022085318962482355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.10075735870532844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018958298832118272 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2342919714250497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0041467607949485135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1368078332815587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002368380322788649 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.722308165092786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11202608038530598 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57d9b7f472bd6fd377103fa0392629a453eca846 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.039100730854070335, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024339409571454023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.07309487784533099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004271503462792883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.04736351863009491, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002774562583687205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.008641952844867724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008209970490511706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.017044130872016457, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015311516528441142 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.010845224152235416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009811807847928044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.031181187159859413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001963515304654915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.05798493820733094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033712132693060827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.037416996326406736, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002165687085012485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.031826221071430495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002021894152923463 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.05922841807653992, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034905888872791228 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.03817823692527911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022299777911451387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.7851146582603888, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14248563320706192 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..918f625c9b7f078db3512c64000d312974da0b48 
--- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0024406380399721135, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006673287893985838 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.002125069578891824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006354178152671431 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0022081044921040066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006238927634391291 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0003296901951174981, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00018804946463358912 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0003008969679774049, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00018767733482335345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0003107051777238192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0001860944713122803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0019949852219552553, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005366470253705943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0017177557215678593, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00049942930432405 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0017991228311771584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004996511492676782 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.002116903629112286, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005752535478594655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.001816496355313458, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005356117222057922 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0019064317416731434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005368859371026533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.5125237582746943e-42, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.0979696174822401e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + 
} +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..63f99f27f9707e2a6cb1d631aa86b02f97a78c9c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.08057623396604993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018427190672612415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.1903611222785915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004119319495363413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.11144536072130062, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002449613618907836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.012439638488265747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007674450472725088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.029923546174765742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018325311907880977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01730052045113504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010550283614850532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06836163772266587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001418824283099982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.16290273051352785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032456160924942976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.09482469984719238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001891115330305303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.06720724706425169, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001458820899362241 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.16056752379802697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003380030301216589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.09329416183941738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019565438808381327 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.711911214189282, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.062271095680873176 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2743fb5f4b5f4501f6601349b688bd9b78ffadd5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.08658830544513134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018612611645494558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.21204874097915127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004437751832600217 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.12151658666281231, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025542911124592704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.013510304634606875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007312649740318255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.03415295300257043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0019425669982816587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.019132118327200527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010395559026960023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0702987343925042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013468612933160927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.17295883718094748, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033149082764143117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.09875044840716671, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018572603815356456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.07169769773353042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001454220219767976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.17642520944692217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035711244979865823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeLsum_fmeasure": 0.10073542075586238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002008424522536892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.7485653629026496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10161315249240252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..48364336d190f4cbeccb986baeb23c391639d9a5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.10919972863971208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018326070695569962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27085077984418787, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004340987074007443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.15372555310846955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024917036123975646 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.018660104842681158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008521301906071727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.048121178106264206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002236763054096929 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.026545543337132424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001202505861016018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.08598679476507894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001347479595633704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.21491972476150967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033319697703370205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.12124448179130719, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018406701231741705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.08936454412712036, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00147707143314301 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.22313932236999878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003637762226705496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.12600811759241443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002023272430841333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0047358326681721, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07696741647689843 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef1771ffa2a9a05c597916e0834a8d9d334179c0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.12156176985300436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019916743006548566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2919486745230574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004594202930152972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.16788870363340208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026015869149755492 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.023390561110934703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009482124699139898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05931283277905219, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00253961685376344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03291830334125208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013348595001679636 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.09410425031338661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001505615984367887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22729829891507616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeL_recall_stderr": 0.0035793671647219765 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.12998891345659275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00194502365003499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.09794367331761664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016077634502078913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23679450093245694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003856459984346274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.13536430596241408, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020963457328233175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.3876528749760366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09352517366139018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b262435c67fac50562dcc425ee208b5f1e999d5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04061391536633297, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025885117246031656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0774019664424454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004501264779451914 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.04817132624367883, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002710318861619294 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.007571424737600509, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000774569805719568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.017306355810365114, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017078169253319931 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.010104068388385765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009810253225945517 + }, + 
{ + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03226926320041072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021853447839680425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0606544963878778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035695265285015203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.03758941214391744, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002107269506833207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03361123521194327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002255713167385482 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.063038545340123, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003730201727070476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.03925168824333697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022220799806552142 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.8091606018729823, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13365493953263705 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a2be15bcd28d3ecc262bc7fc106ded5180d3f4b1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0025292500918997793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006843948420078455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0020532297175197265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005556296741534733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002218415772902317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005977307451849004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004376650603065697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00024169751059179606 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rouge2_recall": 0.00040004436924525715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0002447488200268485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00041371259854665804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0002415623180229552 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0021231909904583543, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005742362353931501 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0017609197535564964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004851127145778472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0018828340816919485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005121954193145257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002170837264519722, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005838940116621144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0017952250708806817, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004910386484249093 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0019227239855572795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005197852399034371 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.9417748605436574e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.644365126953672e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c154e314ca783a7a885dc326d234bb4207bfe9c2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1290022798344525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002385695780968086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.281977709942812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004346065008819319 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.16881791798821513, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025746455343428143 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.02404571804293375, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013659743034430886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.052449257583153115, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002298683613553151 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.031013676801335422, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013438213374447965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.0991345122176881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001904974022092299 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.21710334307827317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032407820193998097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1293936279197813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001905181791064322 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.10327457231643225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001973038870062356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2277601179019926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035824212720264757 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.13530856271788863, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002063773976743065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.2025114531043062, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10655129877623105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fb682c978742ee6dd0b56c6fc7fcd35be8105172 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14417853642749137, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018611534066647365 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3537153376225196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042546837674507666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20187019098994882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00244120776332977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.030641973891149483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011156075040613734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07800524098991497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029228930462181354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04334620232538617, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015695593858469068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10744022317808914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013648772722477944 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26585677142562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033591403270810637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1507291074053435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018184345906770076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11450326924869379, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001534569154151368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2828354708969096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003689314769244396 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16062701136729154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020501392930297666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.7762050506045142, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09926295387249034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a3458fc6afd64439b8ba62ab93dce8bf3c35ae43 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1415629115103802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017838032550705376 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.35091090779004147, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0041843712980091045 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19932155852386563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002392922806012122 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.030702156454896785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010693945263455453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07873922100816551, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002781171908981175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04362070001507444, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015090115501029922 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10636784579398613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013043402766199064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2663335312559493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032882907730531676 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15012616545978386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017748529391718238 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11293750566369144, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014708469401565362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2820820207716993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036056577642723567 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15932750811184002, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00199892928658887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.6727033378910847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09495317261707087 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 
10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..59ddf4f8911d40e3a3cf9b7f491704aaad92b9a4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1366930941690243, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019671239516403935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.32595135506801376, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004477539106095352 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.1884376588525654, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025642932469730627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.028528787233145586, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010703763072091093 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07151102797558972, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027297730367125982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0400453211123096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014862606183809399 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1040074617013233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0015105673758063752 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2486131025320305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035056965604027203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14317246926611424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019379403187274123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11019224321987507, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016599707159991781 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2636677319513756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038844849580001823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15181862614650385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021601749082742227 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.742293158445559, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12475047173558831 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ceb86c981fea9cac73283afac0cabe9787fd9ab --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.04410024857440917, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002722944168692114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.07974432748836109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004470156036803333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0510345184963839, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002788963063519097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.008214763901131556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008388994526412292 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.017137458752784145, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016302525508184434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.010522073701869125, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009826679169524012 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.03366378122590164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0020732284962604146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.06128142643464567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003508572574499959 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.039029603911429886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021514536166429943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.03523949695288221, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021552641230410848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0645342916738449, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003702351211849086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0410270223606163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": 
"", + "rougeLsum_fmeasure_stderr": 0.002268750341839602 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.8218145273207486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13259402458841246 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3a4be1a0a306769ccda5067155a756817bb158a1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0030006953279340686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008260104673619008 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.002448875117757725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006686883097147919 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0026601966494432523, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007275199319288471 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0005102355407571833, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00024289107214722606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0004277053874723131, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00019999235661580537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.00046275158053195667, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00021785531794482434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.002398796026490401, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000661743079139523 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0020040874122394833, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005526887153193406 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.002154351416462934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005935536504835335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.00239564296423634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006561431092058282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", 
+ "rougeLsum_recall": 0.0019841067166653605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005417182019917762 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.002139637125943983, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005838831183619806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 4.273566994720392e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.8291126911657586e-32 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..57cf8518867571d8e5666470e5286f8c55db8eb1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.14203799200266776, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002270644021652014 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.31732904475268425, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004576952267589019 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.1900087776056167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026590712025527704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.030454613592282354, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001210755597735052 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07004943298066989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027142097244891622 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.040900489822348056, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015444912268875982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.1084493048877211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017320556174949715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.24390681149682772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035304448560530047 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14526081266341737, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020046277742893984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.11212112563171714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018151655419957806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.25327587259397216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038632056834617744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.15054379734755471, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021753294084953185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.6188434485166334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1109564339936777 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8505a8ba0ddd0325c17ab368a9301f1bee5b61f4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1310768758781413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017542917965564532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.324216765100875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004101611205461987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18445302591356055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002358252600127643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.02727413002222952, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010077697140453462 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06992767890558657, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026846374203769387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.038774277981477374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014338449153380509 + }, + { 
+ "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10150380205667145, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001323267745441165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2537245165326843, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003368126070672136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14321716258429631, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001812335103355609 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10624902236675653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014584079648567682 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.26509353636861205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003626121575001854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14987577117769335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001995803271934798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.5775644045947161, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10747802490972798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..23a7b8d1cf6cf3e5f07f34c27b54f08320e4d5ab --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.12929139605322415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017250930103395053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.32127297765523716, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0039219092067371624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18223568358253522, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023053601425562413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.02577874922325936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_precision_stderr": 0.0009382634954700045 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06615069752192394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024396021514475836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.03664914264570665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013235856869957899 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.09993852104012416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001292895388479902 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2507415234860173, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00317071813365378 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1411906750067624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017542193090228139 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10524886017120061, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014473708528824547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2635160093607098, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034669204083299546 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14863975004627103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001961623937053356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.4631699049107234, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0818224941441452 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3bee974b46ad817f6bef9b6b48db0fe84f8a6464 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1270012394870458, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002091745941086545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.30424480950743027, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004523774432666432 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.17573577441569346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026465445466935253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.026605921898331948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011008712275775547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06560029901447437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002618665050517068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.03701973106444136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014575936915393465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.09771059146617143, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016171559842817049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2357515770528734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035751298993674996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.135384466992445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020400308616851347 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10278375996682501, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001730936323308653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.24848166333986804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038955439908868214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1425837105636293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022180825213510558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.6024997408404598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08124844005963922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..451763f402fe7472ad6c23fd265506594492e483 
--- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.03513411483539431, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002220905791930144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.06807789519735524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004165774223715686 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.04277432877434612, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025508570289566007 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.007057485487635765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009007492112467273 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.014720343410307037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015318015560624846 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.008786196844590121, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009097928996719041 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.02768868248284088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018118702472072936 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.05338300755867666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033015217413937302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.03335136680279893, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019990065744291984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.029185508102300205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00188906175149616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.05628446465745351, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034924975937847573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0352809944078966, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002126337470382871 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.7057325975464012, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1666309026142423 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..180f4718118c34f522d84b150df0122eba536cce --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.003716409376786735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0012903356644372312 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0004836451099249516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00016932836035127237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0008504247165137163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00029663122917791953 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0004288164665523156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00042881646655231734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 3.430531732418525e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 3.430531732418596e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 6.352836541515787e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 6.352836541515829e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011921960276713975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0004359988358635832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00014796403981705004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0007687453895513705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000260905903643579 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0011921960276713975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0004359988358635832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_recall_stderr": 0.00014796403981705004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0007687453895513705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.000260905903643579 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f628e9f9a7a72e66e673860047810370167ed26 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 4.4726418073213345, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16753723606834903 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.09990759568043395, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003962742821805288 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.542668484521527, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.008163857356083524 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.13738094874969342, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0040877314692411865 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.07078028109191174, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0034244977285782584 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.3897797823742886, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008526350760099893 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.09706102035374112, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.003653466658553787 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.09777072760562562, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0039192045347804325 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.532792582986373, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.008177843897950418 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.13447315238265775, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.004050384508000915 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.09637631448880932, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003951999111601633 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.5181791996400547, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.008308241749147611 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.13173120941865174, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004084755897159187 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b61e4759c9b446f896b8b29d21b92054de0eb3aa --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.388997984440001, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.22030624130684912 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.1899985239367049, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005524798334171625 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6197096800293467, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007218975234253726 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.23339725988760335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.005577890928986144 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.13527730400642915, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00492492204254613 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.45578433847724403, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008317705400555226 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.16839814753926893, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.005178370714369835 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.18193802149181904, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005417257807881834 + }, + { + 
"task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6024769760148035, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007410768687993907 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2248230330602144, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.005533593484296289 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.18398509430200422, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0054807247743772465 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.5984789556418103, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007439193497312767 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2260074903815481, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005576688947640353 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f04cdf89d9b1a2c737cf8697c38b7f37bf4f9c92 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 7.1654235804393975, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.23664442530069865 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.2632982490541137, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0072621786329556 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6328871785091988, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007011851144328955 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.28818203262436376, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006789855255273055 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.19631851485753235, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.006522555304684875 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.47001882425100405, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008218114595065874 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.21700191007059494, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.006362729074282588 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.2526725103593678, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007156069139226169 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.611025032648354, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00729448166920107 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.27789781089889687, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006777757204322281 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.2555628679056802, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.007187190219846719 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6156889934973935, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007241268875757391 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.28052899465966125, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00677957486590257 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5879bea3dd915b95d4d6d38148d8cfbae61de6b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 7.130010006945033, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.36614351972675196 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.27187069677018444, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.007724489648815434 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6409983231082317, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0071549123595310085 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.28765257778776016, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007122033948855405 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.2074993154694547, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.006958791199014906 + }, + { + "task_name": "piqa", + 
"prompt_name": "Correct the solution", + "rouge2_recall": 0.48909595674061107, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008272314038685013 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.2220313726729203, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.006635540925947997 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.2622707060445547, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007591102476600222 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6217153240581279, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007407513056831873 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.27918830024031255, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007101399798328581 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.2651656938002711, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.007644202201817951 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.626997130842089, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007343317727382677 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2814597792431154, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007102157344044307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e9454efe144536481a5d093ef4af62421587b7f2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.664095500884982, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.34838973765349207 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.2587985355155675, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.007619203655004035 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6552543491315841, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007002192904681263 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.2771124149282886, + "dataset_path": "piqa", + "dataset_name": null, + 
"subset": null, + "rouge1_fmeasure_stderr": 0.007080361587320621 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.19927034297156443, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0068409334865776284 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5044582337397387, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008192238508307557 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.21583669822052345, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.006557201662967979 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.25108837349831326, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007533159919158992 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6368030343144712, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007249065063061643 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2697811327830668, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007067126106733718 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.253246129669511, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0075494739014644666 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6429352137151435, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007173543257338698 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2717897500548224, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007057987573730667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfa47a2bf95e23d091953f4dd39f3994c35b761d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.446672886311619, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.27464231120207433 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.2528718402860418, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.007719845208329688 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the 
solution", + "rouge1_recall": 0.654561878141819, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00698855116841065 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.26736214869813785, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0070988275428792045 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.193812416956008, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.006844516632300937 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5042421877334644, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008241858798974178 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.20868674330105244, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.006522205599689379 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.24481322454808105, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007592775252281026 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6363050209515578, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0072450011340579445 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2599202410891578, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007057224366690571 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.24690879630896684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.007607566463992895 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6431100118567753, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007155463175240526 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2621481175778254, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007056258826347817 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0cd62b63de6f8dc87cd9fb4c7e9c7ee916afdc11 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dda6337879834c1d6d5e182feb30fee66ad0393 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664055982032837 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664055982032837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9c287836e251be530567fd6efbd135ffa7a03d8e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5223068552774756, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011654208652596476 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5223068552774756, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011654208652596476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bf19f137d43bdb8145527b5272f0b18b275ad7a4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663828032649183 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663828032649183 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6a24c0afd61ee37900e441b0fc492caf190e89c9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5282916213275299, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011647134172749322 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5282916213275299, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011647134172749322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..91b702441a140ffd40437df3a39a9ac6f90c1a86 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5114254624591947, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662778026451659 + }, + { + "task_name": 
"piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5114254624591947, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662778026451659 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..361927288a39a8e94da94bb8f24e87c4e9036585 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.19326617690121023, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.00752510845276588 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.021769865204723387, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005817240957449695 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.23632094446041846, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004083513329156124 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03752514813814976, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008846335161188192 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003405753331635085, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00016739880607996655 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03960148154546242, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019531504378148563 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005928136888518339, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00027560748473478435 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.019577427737286695, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00047793881065340873 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.21917768067787397, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037979740764717976 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.0339518071889011, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007449293744359491 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01809711784394688, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rougeLsum_precision_stderr": 0.0004916087758295677 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.20397788862519056, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003704689919592339 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.031229364613330524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007424902988435048 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0c448d126debd1f2f57e50c38218729fdc36e7a0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.17941583254637414, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.008838972109532342 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.02104978239320399, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006182804474426174 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.22374374255453272, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004184853646640156 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.036115788468476816, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008787800610938908 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003222523734541128, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00015290781225805715 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.04029314519755627, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020801960108103577 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005682715949708656, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00026165286218016116 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01937672404519423, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0004938315065636547 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.21137283250925631, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003919038757731567 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.033515515939669824, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007571377468473982 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01691374585639976, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0004598604234910933 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.18918805907421266, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037299738776066143 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02922662157472046, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006947111027433623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6849a0bfe72b3e41055acd7da9dcb46febb1cf73 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.17745296114678968, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.008624930392648838 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.02112776839925051, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.000877980468137592 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.218206359468982, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004101332930154146 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03499069784081937, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008446359024973105 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0032849283759100656, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00018537681002876464 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03820330073739487, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020212152218120854 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005621916396892083, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002732592116855481 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01981043631160736, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0007993400917579435 + }, + { + "task_name": "piqa", + "prompt_name": 
"no prompt needed", + "rougeL_recall": 0.2071458409919012, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003867415965532175 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.032907905075395594, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007674706272733912 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.017213819703019412, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0007926417088864289 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1841964822860343, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003656569397088453 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.028356416929466385, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006881727791999624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..904d8dd18bb240529ba5ed28af3626897315ab65 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.18436487057820314, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010658373126425667 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.02057704871830096, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007494006749779765 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.21357292385095153, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004127038723114807 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.034174719119795666, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008567973159331906 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0033630628167333936, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00028514960802026207 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03759233299677987, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020872224529434385 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005486989401606149, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 
0.000276704602860595 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.019175227864817394, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0006222647397159332 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.2029780421179166, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003897679155093511 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.0321009603843197, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000772867121470867 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016726612502688427, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0006097435643352546 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.18052347991125617, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003650693342311041 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02777350418405329, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007032861760892736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..11803bf282f7bb9e34f284a6d70e1f23d89600ed --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1662496800572863, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.009003816484519792 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019771160862529565, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005967888743923156 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.21013503593016225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004080380870732406 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.033492958359078145, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008122527510855113 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002967361151580668, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014505452013749797 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03712110581081379, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020199022271893104 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005250361302057742, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.000250744194605345 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018431702367967623, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005086664104353652 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.20000526433171545, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038644792190701566 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03142149026212096, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007237635478632389 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016047371965998675, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00051159998028551 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1770149433781237, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035827141690138965 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02710335704553083, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006613328324069825 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c2a250909a5be00db58bbab5497762846a60762 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.17909655705334981, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.01001948544264451 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019621328243135686, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007677978709294303 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.20744825708534467, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004062728081792751 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.0329209856904888, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008425698682080111 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + 
"rouge2_precision": 0.003330941009610586, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0002884450625712378 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03734761717066073, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019901336224895593 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005515135528910162, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.000282385997180886 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018273289410454367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005985872356689969 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19766569572595405, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038632884293854372 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.030971756359625915, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007421166076215399 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01596955228121598, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0006686144046345224 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1762042254326998, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036326407496276346 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02675062591564841, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006935141958106178 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28f6ea1600ee3937b6f29a2c5d487b0b8d07de5a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1221e5d3942ab88b46f00920a112e985c022b80 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5076169749727966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664470424044976 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5076169749727966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664470424044976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbf9ad4d5f50577231b9b8c880391eadcabd73fa --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665824165343952 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665824165343952 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..50d12e0db9280a1047fa6dbcec894e1152f44421 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": 
"pick_correct_choice_index", + "acc": 0.515778019586507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011660014400426185 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.515778019586507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011660014400426185 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c3dbe20c0c2d9c9542d75795186278d69abbe553 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5228509249183896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01165363483240117 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5228509249183896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01165363483240117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7d535afaee2c5de62de8a67353562cf77af90606 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc0551d88a3ea3738b340a0b66f0d50bbc958448 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5565832426550599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011590883373666863 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5418933623503809, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011624803747232126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..099219e3f1f8effe30ba270f12a9fd68c6908129 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5685527747551686, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01155565729886461 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5674646354733406, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011559142916063143 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..74df21ab5a6c05bee4d8aeb92e13299fa3d3814c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5718171926006529, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011544859155318844 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5723612622415669, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01154300962328283 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc85a21157a565ed7748fa7daa87e602cb1ec1b3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5663764961915125, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011562571737707342 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5723612622415669, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011543009623282828 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc82d0ad65a392cedf555ec7b6e4b482b562022 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5865070729053319, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011489895831821135 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5745375408052231, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01153546884082453 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..137ffc28ae4bba89f69a7bfd12f9bf51438ae8f4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_piqa_what_is_the_correct_ending_5.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5848748639825898, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011496520442659124 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5761697497279652, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011529663270276293 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..283bc0e6d24b0d4913335d3558a3e458e2a8f9db --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.613, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015410011955493933 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.543, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01576069159013639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..784ef4c2e1e2a0f15b840cca8d9a73c8d1daa9ba --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.663, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014955087918653591 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.622, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015341165254026642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..807383f021a6a5cfb7fa0154eb742f1bf8aa5082 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.673, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01484221315341124 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.637, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015213890444671287 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b48e049c30b37415c7e39be4253ba8e4fb27560 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.662, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014965960710224472 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.65, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015090650341444236 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eecc43aae03bca094061e3ce25df8161c8049dbc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.671, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014865395385928369 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.66, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.014987482264363935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0255b47b8cac9fc70ce4d35cfb42c2672d5696fb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.682, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.678, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014782913600996683 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0de636c508cc3689a4c2aa37b76f39ddf0ae68e3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.83, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011884495834541665 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.741, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013860415257527911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f393dd8b18f9639abb29d25c0224c740d127b17f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct 
Question", + "acc": 0.846, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011419913065098698 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.794, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012795613612786525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..56b953190916fb11efaac1b56d7a8db6c72f112c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.853, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011203415395160335 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.805, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012535235623319325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..561baae249ede0d493d74291ff59b263048a2f40 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.856, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011107987548939149 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.804, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012559527926707345 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..4d6f5e0d00a1175ac1a8d53a5edecf5e12d05c23 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.849, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01132816522334168 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.81, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012411851354816324 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a713efce67c6eead19c48b23d86bd03f59170129 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.849, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011328165223341678 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.816, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012259457340938598 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..771a75c1780ef21ae6b7bb44eebfe979136b32c2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.287, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014312087053809965 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.315, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0146966319607925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, 
+ "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b44ee476e48ee44183f1925cd6fadcb39f3182c0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.378, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015341165254026649 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.358, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015167928865407559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98a08ff58140d0f7f36d32417951994492c8f2b4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.372, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015292149942040577 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.37, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015275252316519362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5384939034dc3732c3af388bd2324599959c59d5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.349, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015080663991563102 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.364, + "dataset_path": "sciq", + "dataset_name": 
null, + "subset": null, + "acc_norm_stderr": 0.015222868840522024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..db1d71cd187734281622a3fee8c29499d0c90b8a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014933117490932575 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.358, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015167928865407559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfadf2e3a3b8b002567cb68b3a749f876166740a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.362, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015204840912919498 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.356, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015149042659306625 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..069870e5508dbee2f05e976748f895385c5f0a72 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.349, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015080663991563104 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.339, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014976758771620349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dfeed9cfce160aa3f6a968024d60447bd6819c7f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.392, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01544585946377129 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.38, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015356947477797579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..03e6be74064ef36eef1c7a25ff1f3397dc4e6594 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.363, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015213890444671278 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.386, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015402637476784376 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fdb626bd5514de4576257e8de0040a7ee78f2be0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.363, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015213890444671285 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.363, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015213890444671283 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..77f6da47f8d536733635d59ac52351f00f217cfe --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.319, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014746404865473487 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.345, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015039986742055237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c07a32b791431402e06731cba65bb5136334df21 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.333, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014910846164229852 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 
0.346, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01505026612756445 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2872a089fe05029eace712f2ebc6485d67412acc --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.342, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015008706182121728 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.346, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015050266127564445 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f64757fb7ab58f04f06ac78f55a6b78287928010 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.378, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015341165254026644 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.371, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01528373621182319 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..85a1073f9c0b4ae0142e7c778d62b4becd67bdc5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Multiple Choice", + "acc": 0.344, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015029633724408948 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.323, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014794927843348635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ae4c9a708b7077ffbb11a181fde9373c6d3c7095 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.329, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014865395385928369 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.362, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015204840912919501 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aeafa6b5e662794b9e66932a0a896a62a393e078 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014933117490932577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_5.json new file mode 100644 
index 0000000000000000000000000000000000000000..334d28ecdbc67403b1f4862437f34f8ebca81de1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.327, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014842213153411249 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.333, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7bf06dc350592534814a027c75005f8b529a2769 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4719401389631213, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011544210396951672 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5050774986638161, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561836054238783 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..36c1b9f1f0d3b83adfb955c45d7f281c234c1c71 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4521646178514164, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011509395748220104 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4767504008551577, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011549925483927456 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6d965664410627b93cd0be2b2ffa2fdfea74e944 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4510956707642972, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011506993144185188 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011530479981182628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6d152d4d934b398502b145be9559a81c2bf1d909 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298178 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011535764881641411 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8053d7bc429dd77f52bd1bb5b38691fdd926c391 --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.45537145911277394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011516282203726655 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011530479981182626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b16e71ab660fe2950d8dd4b6777f53887491a258 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298173 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47033671833244256, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011542066509767012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..855f8bf1ac2913bd52092592c728fba810f01970 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4906467129877071, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011560409019420364 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.532870122928915, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011537420054210306 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e08aa94de442a996f69b5f6bc4c85174ce8c6016 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011524715486240657 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4917156600748263, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560845076525713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a5213f399eec8687c9577a8c106f9a103b7e7b0 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4623196151790486, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011529552555884571 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.47995724211651525, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011553138977961008 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b46e1828b70b12bbbe11cb8e0fb6bf64fc59e89c --- /dev/null +++ 
b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.45269909139497594, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011510576955232206 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4756814537680385, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548748301487319 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2003da31337e9fc9b8c5fa1f899f3571baff5037 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011527657726586461 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.467129877071085, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011537420054210297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8b9672dbadc6d5ad3f7fd2ae7e82c315ab7f4a9c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.467129877071085, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011537420054210303 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011528611805439891 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..08c0ad788eb91d5a0719af5bde9196da1c715d43 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ec0f6fe0ac92c8f737f720303734ead33dd38da --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..24f408fad8295e2e7159b3ef52693789e9765a81 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..31f21a634dbcb79053e80f249f64a6a154682bd3 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 
+1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d50d0c06e1d0e13fb8d3cae0b1a1bbcc954470a4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..afc2681c8e1fe8d7b125335c5195ecd2dd4de122 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1fedee8eb88de14968db0898e80d85e1d1d2a579 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4831640833778728, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555875693960773 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.51309460181721, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558466383367178 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d67ec7e785628ae3991b331de917be492f24b90 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4494922501336184, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011503288699799176 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.47033671833244256, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01154206650976701 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5df8807116740978e4da4f3d7a51ca2905a96cf1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4478888295029396, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011499463505491369 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.45056119722073756, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01150577173876986 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..618b8f9754f9c650ecec13d5e2aff91a8ef5e90f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4494922501336184, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011503288699799176 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4569748797434527, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01151954486592806 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c7c71dd8c4d13e5e5b94db96ec4554f9f5945 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.44200962052378406, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011484402719452577 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.45323356493853556, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011511744771088355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6d6cb751f04a59dadcede421a7a148e7a474b70 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.45056119722073756, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01150577173876986 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.45430251202565475, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011514040245583501 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3319a31788dfdfc6c6db467b0aeb037f46e4f980 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.49706039551042225, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156223242154194 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5259219668626403, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546883081384903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a09e4e1454c7ad9c42dfb25c8e532271909bc344 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153229486915312 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.49438802779262425, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561703928784337 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58a2145137f90936c6f8d5c132eba9ae4a75e266 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.45911277391769106, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011523708060182082 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4746125066809193, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011547518083754583 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..20ab6a19695b10b75e610975f8186290a4dd1f6c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4521646178514164, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011509395748220108 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..456c55ed5d40403183413b6a65d8dc7c9ef63761 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4510956707642972, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011506993144185188 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4633885622661678, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011531394084549621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..89060411687cca4bfef2fb92925f9b680ca1d2ee --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298173 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546234813777397 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae0de848a3afa8fb78d7501d8f4dd67263a1abf --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d706ee18d3e9000adb13d8726de6347ce88ffe59 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143713 + }, + { + "task_name": 
"superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4657039711191336, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..883f1f82ed311e5affd5a824e0e1e189165e14c2 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..54d9f24e5852651c3ef7e96b5729da7085395fdd --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4657039711191336, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..a39cecd11ad69a7c3364def0cf9dfddd9618e293 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.4620938628158845, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60d8403ee675170c5d06a1b733d93c235d634518 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.4548736462093863, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029973636495415252 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4620938628158845, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eb10b4ff09c127278426c39781bbbd751523d308 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1328a544db0c419ea902fd89d27960ce73056a4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7628ac12c967461afbb8a455f4e13414d6c692c7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb879982cc03ce573521d579671b7cd628b2e94f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + 
{ + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..411271093502a97bd00253732e0d9bdc3f3918eb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9b93d8a9da79cdcf0135708707dfbd0ee7445c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4693140794223827, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..71ea0142d0cfaa24b21f1e4319f66125c0874617 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5523687fb39eda17d3aa1c40a9397e7b00d61a5b --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1afbac519195c87781f0ef6835620337a06ebc88 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + 
"acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9d5f8a9aceb57eb3c074ecddde7c23d0d515c1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a4969c1a8d85c93ced0b7555c2f1a7cbf01c9232 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300727231673172 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_5.json 
b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d727b7da9c3acef8ae54492256d1710c1bcc068c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b59582ff5b428e428749b5f4c08d60d365cf42b5 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03009469812323996 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4714b52299630730d93ff55673aebda497c32ea9 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7e301dac107ee49209521b28f9b2eb8d74566627 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300523034631437 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f89c9db0f03055bba4e9d2abcd1b38a612fdaae1 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..919a4bf511cf5a15a4781a664fac0fbe8a8bacf4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6a017f6c30a47c3ffa15df21a1ee3798b73e3216 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.4693140794223827, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb6831f2f9dbd3dbecf68e91d84add69dc90ecda --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373324 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7308dda551c7d47495a8ef7d4e1e584612a5da7 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..055362acd6416992419a3073185fed4aa70d243e --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5ded777d4011424d0152795291947a6e1bec648c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + 
"acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..11e62e51024fe6b8ee415e87c2c4a90117711108 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300727231673172 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..736fdcfb78a36a0c8387014a0609ef3cc64a285d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..71520a2ad1489800c46ef191900f9ee48db1abee --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_0.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd8bdf724b12e799b3815d7d3de3a41f52471395 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404880419985932 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ec05859dc57a468fcb75e26474e6767618035701 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..37ad57641dbb8ab8afa18b62f3029045bf15ee89 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367592 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c40e8202ced390138beba91ac03acaa6760d2ed --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405090552122858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2eed86e217d5b1dd461e55d61cde7c67d9cbf28a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076887 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e645a1b185f30355c7ffaeec4b8e7fa60db8bb91 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..152c56f2c3a1773deb8819498205c8ca5ed09460 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051745961790516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e427a4f2920c286e67434c146e5554a93a1f4d2f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"winogrande", + "prompt_name": "True or False", + "acc": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330349 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051745961790513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..72d8d1cd8ad3b4d4e5088945b7f0c2e3f1d7e243 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529024 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014042813708888378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e420fbf075307975563e0555b243b9171b98f9a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367589 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5169692186266772, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014044390401612967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9c22eec3b23566dc0d28bbcca31552c97b86e4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225636 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014046492383275842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4d2ece5a847e7a531c913900f6d33f4cd2901901 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915867 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4877663772691397, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048278820405612 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e751f86daf86356a227d4ee70804eb37ec2fcd0f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330349 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + 
"acc_norm": 0.47908445146014206, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014040185494212943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2cfbdb88d6485ae1ecb8a5c242e858b06857c31 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824197 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047122916440419 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..635bda62b7c0b660f16d1ec1a96c82840b764b92 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048804199859316 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9a4dd2832bbcb8dadb4a339f18e277ff11d3a5ef --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4877663772691397, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405616 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48303078137332284, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404439040161297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f6b52fb7b5cb141c97e49d5381722c401a6668 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228584 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367585 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7107faa0532a34e892bfa210b70b1342dc2dd233 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367592 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, 
+ "acc_norm_stderr": 0.014049749833367585 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..62d6514c84c0d08a8e3bba5fae63e2bc2214a631 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497704 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140492945362904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d752cbbed86f67bf93b34ad40b36cb6d7c316397 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330346 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405174596179051 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..041c92a6e099b235dd517e5b4187fc4daa51beca --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7145a2ea217df529f480e530614201c78acac40f --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225632 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bada3dc71b0e9e49e70f25e9a0f11f209ecbc5cb --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290403 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.48066298342541436, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014041972733712976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_0.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e44ffb5da86b6c25e452a0944cd749bcb0b48fe4 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014043619596174964 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405213114691586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_1.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..21477dd7f02dfdbb1e3593d0090480fe2fda3443 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915873 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_2.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9861903265385b8a5b0a744abcfd8839b93d879c --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225629 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5067087608524072, + 
"dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_3.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..448f19d9f28e956d1d890aa731a6adbb7523809d --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5209155485398579, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014040185494212947 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_4.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6d79dd3b5b945ed69c21c4b5c02d130a32d93c18 --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5193370165745856, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014041972733712965 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_5.json b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_5.json new file 
mode 100644 index 0000000000000000000000000000000000000000..91d301e52d60d82429b45cf76837377d7cdef18a --- /dev/null +++ b/4b284b12bc4/eval/slim.4b284b12bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497707 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529009 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06eeb8ad7e628c64068242173299c1b866bf270c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb6e830b385482747eec04ad8c9cc6b4e2634ce3fd1a4dc017a789243ee3e43 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23a19ebfa23406b5e8b05c4e16c193cd30b199a5 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a68cf11a5d9aaad48c7ca6e760ee56fb6106f0ac6f894093b4a1444a8df222 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aab34721abb3f250e6d2959aafdf4326576e7cf4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6486bb5b10d3081beca274827131e3e538f8a6e828b29645bcea2281af801cac +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..100744c97b6ee6865c8d1fb9a0549ee6c5413c0a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6c007fd9aff82a56f6a9bf861faa138d651b98b34e6f6eb5c3d22c9d39d496 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..54651f2ad19acaae40099ac42ac8b2d3cfa134a5 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f40030ffb0e6c60d5b2bf0030a5b32d77ca20c6a3b63963bd92e80d45204cb +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9885a595b12d6d3a56a55e07ec4c3726a192baaf --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c689bb24ca82811c426ff47bceb3c23c3e16f251405893d0de22e304fe7b45ff +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bab81350031d8a0eb27d68fea55a7450c829b19 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc7bf43a006d1f33ee32b0b9059aebf77b987de5c17f023a66e4252570807c56 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..096397375b49d40f5b16577da542ff384f60e025 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8f2bfbd9365113ae6be1a253ad8a45bb2e95d3659827715cf753fa8c5a5f619 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57ae9cef1073729b697ba69a6542a87f44ea40b7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c783f31d8fc9ccbab947995eaaddc0e638cb0ac8cd8c4c5da79af66b7d5a0b +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea6465776a6cd60b2362f52d49711a3c1f95c78 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0d46e5dc72dd1fc91a88615ae982da7bb1784d82fd6eaa93a49cc0282dd76e4 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24cb2e91a9e910083932cd83ba5949e3302909da --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c465972a22b7df28ac24a3a022e83a3959645a9352b7b28ea8ccc895a70f1468 +size 199058797 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..819d6f49edb21103465793b36f5c4e40c8043ada --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0707fef7dba8aea2183396345130306d4509adc2c3e6711742c5c9eff57906 +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b69d8915ff333dbcf5a99a687492eb6116a8099 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa155b382fa4b3009f113778a6fde1fe1f164c26a53ceccedcd5e113c350091e +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01c66a3f518a5487ce289ebce780a88d21b9215 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8cc64d9d3bbc07b724cf2a23c6966cc44554698628c98562e9e04cc0f72884 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fc829ceadd82af51e395319f67917c7f4391183 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ee66e8a527b770a3455da67fdf570b44abe727d08db271a8f3ae6d54d7d9a0 +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17c2e25bfc01b1b5b5fd3a1a6e196d9d874f1cdf --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45a6f80a8111a97004ee7fa6a62702b3ff58f1cfa0e5868db893a0351972fcd4 +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9609796e7e177f762b69bdff7a41edb812c75723 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0de02003f6b36fdb6f6d72a13b669034d22a8efcd8956aac768354c67ba429ff +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a134a7b64c228c8937c8d9e83275761be8af115 --- /dev/null +++ 
b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d6a9ad9f9451ef1fe94e9d25ad48eb85cb5a0ce6de16aae148188bf22ce637f +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acc12acad38f14d7a1bc88d97f671e34ca660636 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:724307c6faeda373d9634d775a46600570cc4f700c3b98565ed50e3313ec6dea +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffb2d5b9cda4a5c926980a34aea72f2fd4156c76 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f3aff2520e1a2c357f7a91fce0042687c574ac9b514a7c0861f134db05c3fa2 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3248b2b4f023ba6747b534b3705cb99ce02ef519 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb65bddaedf0fc4bd4605d74438977e48268dc0c5c3a9a3999f5f7e8220ced35 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed8076239d32085d1bb08f782c1b7aad55218b82 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42185c85a6f6a2e4a8daa3054a8f5d75bd3507baea811c1920e8f2f4e7577664 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e28a25c6dab2bb8669079d78b9f85f9ac1c02a71 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b58874b7db6696ca4ea917241d52cdfb8b54367b524d913aa8abb7a04dea02 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..340f8c21b2887c067eb7d1937c422936f1af0fbf --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d067efbf719b25de59530a841480e100e3978440ab9837c732e876faa0bab379 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 
b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3114f98f5ffebbddcfa118af53fcc7c0509f848 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff3107a921b8c6b50f3d4d76b8929e9c63dfab67c737ea3234c4d712364948a +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68adc4bab2c3f44d94d7bce52934bfdeed5408db --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a470b2bdcc5bd360996a0be6ed617583bcdc4790413d601131da2ad46ee5ca97 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f16091ce0f659e8df9ea452006649259ffc2672c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de1c13b88c8985da45b8ed4cbcc6620a123dc018da942e7a12690bf33decf395 +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b520ccd40c7c57d73d8750f6d5049a53362ab8a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2ef2a9e8e417e028f5f27406dcb48a3264996e80a3b976030024a70cc1a9ed +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c33562f895776eff4ee57af81033acc4350f9796 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8648ba406c9c657b3be33305e9afd123a9ef78ef8403766a32f8b96a08ff5750 +size 199058605 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f725fc6d44cce3ca29bf0fec0bdb392e5b0034d7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d56dd40e0cf95dcc5918f4f66cde3a49bf8b40ae4138b31eb645ee56bd04a8b9 +size 199058605 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a81e735dafedaab1d9e9ec0c20e86426d5806da8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4a9cc0cf2d547692ed8d93c621d87e3c8ebf02f7d353881af248b0b2d7b49955 +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d748a0715405a6c239d7ab5c900777cfa4a1ffb --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:177a239581e70d76e7ebe4143a986a7e9f8a4764d1dfff91abb066c429ead8f7 +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c024d40ae8056e35c1510c01342033c033459dd3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bad6f9623acf6d5486dc578a51fb6ce6291be543950a621061dec74eba8c9e7 +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2e6d3dfa38cc7f9d899e2dab3b2fa33ee2e2af8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8fbe9768db079f53390b891dd6bbaa6783ede64fbdc9386db12433e864b01a8 +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..887a6df95ff24da43677d4109eebaa37e890d930 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a98de965daa06ced9fc07df4cffcc77f71b0fab37436d0a39e10ac31d86716d3 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..945d56339b6972d52b2886f812ebf2c573caa8c7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b388db2caf3968f2e7b0a825864c1d6129509f4fa676b314b4f33b5c00ff5b27 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..025b0bdce5e6f476561d7a312c6f0232719d4266 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d44c62c825863aed1ed7a46d04cee8f4a488d8e4098e57833700739d9b9a4ee +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..401775d482b120bef922661dcb9088071aab54b7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e75b7a0cb6bb435262a607e1e6b5cf48d4ac3bca648988128e0572d952f31ba +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1618d1c9430b06b95be43db1026427423ae5eb7e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:941a8c0650aedee4c5f5cc282533379a8947cc1ce729f046f2bf77fecc0fb318 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1951c74cc0dae516be8976eacd41346cd3b8fe97 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11e0830100d52f19a2c5011a0d7de099cc07f24e7f2b003c10431bcedd08a102 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..773f15f16d784016adf766220573f1de68f85259 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eeffc7f2652afa62cd2d8068d5b68c44b8c5ca6abce0901a8338e3d036ceded +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d1deee308b17570178658784825ddb9f74c9726 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81fad3a98b66e0f8bb7d50b31fb2c6177b8a61e83d92f7fe6b8cdd49db194d5b +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7be48456a14c516b6a07e70ec4cb066083ac8ea4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:296b7218b757d716f96fc1f7f3490161c71dbe4b59ab0dce1ba80d771a9f843f +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..884130ebf719afd9e0969ba01c6bb2c5b53a0084 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26970d13fc96a24c19ce1222e2b470fa3157d1e7affc0e784c35b3877d24f81c +size 199058669 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4988432e13080afbb4e87609c0499e80afe8b186 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aef5fcc508a62f8bb3dcdcff7a4526842e59c41b6acbb8bed478d3870edd0d0 +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca952fa44f245103d34bbc895e4ec08d88f4b04c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed5eb20bc1339e5a43ded4d315cff6469d3c3fa9f135a2cd6b03cfa687bfa50 +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba40a741aa6919579106c5fadd2a47f7a2f8bdc0 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e18075e7c4259f0ec158e6498246461967588f87c5cf94462ab336adb1595b2c +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a471a2df81a3abdbe80107c104d66a07a5cdc406 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f8d292f7af4047d70676645c155eb3f24c2fd48b5410f1b5d47f56899db1f5 +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e759c052080b62375ec241cccae015d9b3673728 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:871e75a209a7a7e60946d0887705de29a00deeacb61473c7cb2a65cf1ad35669 +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed249a3cbd59e0069bef2a99c8e2a66fc0a269ef --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861ef66ed8b57a01995a4deaf3f67506e10be66dfeb26b380a60722502990935 +size 199058797 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c83deb48a33f32a345819f701bfe6003ed1b07ad --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be091be1e58a51f61a154a4343ea96c85574b40c90653849b4b3f5ce91e41553 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84d423c3a40359b49bea3af0757cb6f237eee54d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bd3df859c8e627607e6f099a3a312828242673097a8df5f57922f114ad89787 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbddf18e5f4e785322f486ab894477915392606c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9408f4386d5cadf778fcf13dd878dbc54857a76dd69dac51aeb47338a838bd7a +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be3e1b03ef02b77525b69d52c93ea320cef41fb8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229ebf9d05062052825fd545ecbfdd174d49173dd391c9e1b64cb091d8fd3a87 +size 199058733 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..874275c22b4f9a0bfd947f05d942fe76aa1139f9 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea03231b478655e579abe62df5050654846c3746c774e35f39bbf21e99579db8 +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6a83339ab5b8eb77165e29ff4cab75047c70ae8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35cd58c8eab05f3c0a4eba0a9ed3ef66fa27773a9cae28bd436f5d809826300b +size 199058669 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9825dede9c34e5ed23832ab1be5923d509ed58ee --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db20f59cf045af04f0cadb7d182371079d4dae241de7181f6bc48e9c5c1c024c +size 199058925 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4ccfaeb559e93158e21c55b2a9a8aeb180471093 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ef76aac4909d232f6f6c91e1dbb50b967b636bdb58558368bfe645816a849b +size 199058925 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a6d1078f55e36924306b2015c9763b2e93a0063 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7545f6c0b980282ead94f0db9de6ac2eff7f8ef358b6423fda255cdcfa518f46 +size 199058605 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..261cbad367f717c1a61f117cbee3fe84371c32f0 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d8a38d7524b48cf212733f4c065741089778cc48eb02d0415c90e399f4fc14 +size 199058605 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd8126ffac728479ec827b5b60a2b619cf2fc9b3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22a2d1d40255b35a4d6a16152ea262c65d726691d69eba29ed7132738c089f63 +size 199058605 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b39772166ac077d09bfa3f4d07d4b9029810e301 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84af53ae7024d2f4c4ce8ce6927552d367d867110f39178364110323dcb12b2 +size 199058605 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..206086dd3f4a860fb85f46cdab20fe53eb5a7bef --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e02bf263527e0e5bfa34fc6778707d0e2b8b701b55414ec6be8f7b39f7f0fb +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d80b000739e860813cdb0bc755bc9519d9582e2f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99c9794f75ee5d18812f95caf1a8cf59e4093d866e265d0963575b108c4f3527 +size 199058722 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33c8455426af0bf29681f47da9e772138cf3da94 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c49dab900894979406ed61499cf8e7ae6837e5eba208f4451c58f7818f8692c4 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52a5b5363fc558edc509dbee53fd3039e7096fa5 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9349e50f02265d4f4676292454bca4aaaa46743f0cf1bc049a775147d102c54 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e760608d7e2ea183f42ea6857eaad8582afcb09 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d175ef37b8d44104d492dbe613d18b90d5ad9b8d92c7e4a6e40a8566f975105 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7a8227750177ee0a1135e1f8d2bcd9c9783d2d7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92334328646308782905acd4497c8e985bce6400c1eb63681223d269b9a90ca5 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51f69242bf76d5ac0bec35890beed89ae9ef5410 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d3256e389071f7a7f713fee2b55374e0bfbabe428b1d1c4f25d8dfa0c500d95 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..953b2d8de1b262fb98b05393f5bc7203f78537dc --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:820c2b99a4c45d2aa43a83d1c6e9b0339a84e64a00bebc3c29c487647d7b5de1 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97008a4cf8a7f7ead97b7229d6335105200e6215 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:1e69a3dd99cfca5f3b52fb68a38dd30b5bee596f7675c6777b856f121a41e7c8 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8df154417b487812d361141b7580d4286333101 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2e8d607b475f7ec92fa0953d7836deefa1d0cd2133a0679e42a55961d864c8 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81c34c24e3fb5b952698fabde516474ce10a62a3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c8e8acd91025c28df570384096f8df03eea4c563648f358f668dd199d49364c +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5a62e3e9619e0897cce66af3d0ea063bfc39d6e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36bd665d8fd2d10b0f115211ef80961f6529ce65154f51df6c8934db9bb2f5dc +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad58e90807b597fc5e27e5ca7b0ed5d30a1a4eb3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:803e759f641cb3d13f406e54427cae949a98d8c5a8c3a8dffec89cae5230a118 +size 199058978 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..229ea49dd66cca53d436ae9da049d9c550204abd --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4180d86320cde1d2ee4b22e35968cc2219cb35d2bf716e47d7699ab36042745 +size 199058978 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59d9fe8da5f224845420fc1ca5f746ceb6b6595e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669f94c74d20d61a6b247c94ad5e896cdb15b5e75e8676e1ea373c3c8613dd51 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d36e5d7508d4da42eaae812f370fc9475dc262b3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af67ac02e3f7fc144e57b73ad9877734d9fd3f3a5b7e09697c0c62e13d251506 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c92ebf1ce9659006e96e1ad0c53d3fa2b8afdb39 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1be4031524ab4798dc24f8ff18419f86b14a2de94a381e501979c97841041c +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26d83f79a2f4a9931536267547ce3f78156b697e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f89d29f8dfe6efd031725a638c671d29042ce6feb61e85811bdc544ccb8a9559 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4256b459d5abc716e44264ad36e338e759f3d23 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f66389bcd1d5d39572c934fc328bb6381eea4a801ae7e28509603682f62f951f +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d994bba2911088290f3d8270477adb020d8da3a5 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13f7c128f831a5aabe76475b042e952fab31fa917c95da31e25d5f4f0317d576 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aac598ce27bef77e0abea28f3f1061620bb670cd --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138e3d93c0a35b0ddc2316c808fc2b08b0f85f52f66c09b99fbcb072df001ff7 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4ec6fefd5c514a90eef15e7c89166aa55f30cb7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77403e33234c3a2cb8d8a7f6464b9ffa079a8064c32fd8495b8e631659b22264 +size 199058722 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ba92d0bd190b4f4b695e14d17e5e74b23770430 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef0d2c82a6e983f9c6c0e425126384a6905117e8945ec198a19be5f4c9019e5 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4a303cbbaced2bbffce7f55effc6da7d51a14c6 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7ca2d928880e06ec4158b98ebd100a59aac54f5170fbe6fd864b164a3ede8e +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcd7d2267aa03e15198bc5c70f43d28541f99faf --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c4666bb04af2cfd6a86430442b2e9d7cceac6205eeb1f92d3d3eed7f055bb9 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f8d057e1ebc1de2424c86ec3ff0415e17ff39e7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f133c66fa27d9d7db9e189a1d906abf04cc4c272c88f096a42e5d1ac585d8628 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1eaa18d5341b3cc9a4831b68618dc477328dfc9 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ada96467d0f5b629ce01b9b7a5e11b114ee861a4b9982665da241aee6cb65bf +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18e266334c86d43dfda06c430247031900c04e96 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eac998c0e82435ee2adba1417ad09bb94bcfa750d51cb6f364178f68bbe55a7 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88549a5e0191072c5ddc7bb9d8fd5f2b630deb7f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:27082e14919a506ec6ad4d0e7f5ca4fe7b7e5d1b58e0e405476475052abd4535 +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10ebd97a2380ba0a60e780a4b64746259961a36c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6401697a753709f048d2c9c18a178ea6b5deb874d795470c6fd142c428888ac +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74fbcd3f98460b125087cd340a290cca8d2c453b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7bab95410e988c1e0b83e7b855ec9adef856c9c313d29ef4741c26f0891bba +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1723881e044a83c83f546b0e4cf1fbbf6df6484e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ecf32892d6fe8cd42f3ccb1c1b4406aca5521f7b4364c571a5cd36eaa5372f +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40893787bc8bdf70da358ff93a1e65b9b0e3f693 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51da43027fc04dda382a984bfc95515fa69164a74724233b469039f0284577e0 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f7030033e689a00b99f443a533f6663fccbae63 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc9435211758b4e899f1d7b8d8f0585d44c9a59d8ea6b908c536bd6b4e77f182 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a9314ccf26c57ffe1359d9d6138f71d447c0219 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e2e9a29a48fb3d99224ed330df1e15b0d08b1b89a17ab9e536bab7b42e30fce +size 199058594 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..60eb777bd45a6f1cf7ee9e6517e35aa9d7cd392e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d0b4088cc95ef17f37958732115f2fd8ff31b7908519b3ec705412fac817cd4 +size 199058594 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07b2fd73689238d2bb616f961493076f913b9df4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a27bfc9b9db92e0a41288d27395c00f998b11732c995b900f8df761f415e6d +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82379e6d708afd2e5a2cdd8d5aed7ebdbf13f9c7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c7ec211685031e4678b3115064c6d3cb1693ae7d5b3607a9d97e27d02f6dc15 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b90e4b1b45483820dbbdc4a54c5361366427c1cb --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b1ec58deabce66a47577e70a9a0d156b59438434f04871eecf93623e827976 +size 199058711 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7ca286950eef5eaa602b902b9377bc4365f6e43 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5254a48a26f4ba3e98e323980063acd9e4563c1dcebf86632054fbfc9410a530 +size 199058711 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01e13f124ad46b419d5d66381fbbf285c5d28906 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:317816e878317715dec343888fd82f669dd8665da55025eb63b7733b82b6f850 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e911c93d141a2edc0cd2e6d6b2a65f2fd4681fa --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb6bc35f5bab7720120b50bc3539d54272d13e0506adb04a1435f1db1791ca04 +size 199058722 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4e56cf34cdf589c448aa29b03df75c21aaec14 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9275c2c62607fecfd3b86cc8acf07121dda801fe3be9d33fbc0a62ee79214f51 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77aaa4a7bb46c2e003a20cc9c72adeacfc4f4655 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d64e6f9e7c41d44b2528812cfe1fb2974fff6b0450419328546f3eb99ab30fb8 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3e22797f74fe978ec5fe5f9d02c2a818e53b94d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bfff34a3ce7b152eabe514bc48d755184fb7d5961679632ce389790a99dbf58 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0102f09bceee2bd90126ce7c39040b02307f5f0 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e255256ed982e914dd0b5853c3cf10cec2cb3fc7a8ac1a4670a113b0e5407c1 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22409031d11f9909cd72883575c115aa4e3ea94d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:542b756a32e7ea7105630cababdb1df9271330ce991c9202750e6e8c8136a9aa +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e9570564e6e165cce2d4dd36f620671f93f261c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ea31de96a6fde9d7ad2c71d2d314ad1061694804c69dcdcd9b0fda7ae7c704 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33a047723cdfa732e28d19253ca9ed1dcda64319 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:802bd9fab4df2419908cda630fa4c3031f559c85f6a58419f406549ee47aea2c +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d94a219db7c5b133d0a600ecdfeb309f47b6fea7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8d820d78ece2454e7bce26e1501dc98ed4aa607bcfb52f3ddbe956a2c39fce +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5aac12f9d359f29088a644c2ff94235c05f70166 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fab4af7c2289c738327b5e6f2015811fb316f03437e30459f5ff8d379f1a618a +size 199058594 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ef4c08325f2a55a892adee8baceb5025b0e7a27 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ece280ede527cf8680850a6d97827e9b35b46dd0fe9e3cc7c7d16e9073119a +size 199058594 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4aaeb370b60a97265fea22b9104105f93d98e7e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da57170bac618778240537edbb2a2dfa495617b6cc14fd6361c30deaf09da24 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c9a04ee6af14ed89dd22bf764e198006cd74185 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:659cca71b642ec68a578f0b79f2dc323ff33fe3cfb26e88564072c141452ec3e +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20c0b686d86af87a0376c23c4a183b7f7df4627b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f13fb17d7583c90fc85526f606b6fa8a16c4e9cd0d46295bea56d0ef11972221 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..18e450287fa60c5d1c037386629142069a9c6552 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9866e4d4ff2a85f8f7cbdcae9dc6dd05d0c49e2df3dd6fb5c526a5bb08957a98 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39352b91f690324bc294917b97d9346c7635e520 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d768a080a655a4f514c958807384f12b0e0770fa00b373d4dff377bf54db698b +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1eb92fae97c1630b759d99c258b94ed16945f6a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e93eb90fad796c9db1b92356a2b3a792e472c42af7b32bd395cf9727430409 +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebcd425f45d978936e0c2a1ef17f15fbc07e0ed3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4310f984b95697a6811cd216b240af66f0b63e3b1321f435a244768e02a4a6 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de753b0b689f044f2eb7ac4952e32fcdb164b25c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08098c9746983688dd5e48e8c1f95993b29ac0832e80bb2503df98486b1473ae +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c429b0c584484289662810b9fdd8c389451bc13 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d718da501428ca98c7171a396e312d792469dc324fa4b4ba5584215783239aa0 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e33b8838b79ccd3a40958339956cef4cfeb9c59 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e9f1007e399f55a45c3136a95bb4284e049fd34ce042b04da4bb8588d4bff42 +size 199058647 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7bc6d0baa0036109e7f592321ec7215c3aa69a9 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff64c6e51e3c9a8f9e313f33de837025aa8a853619e0728095e1d214f8918f8 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ea65f6de1e346d47044f2db3f7b446039777866 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6d476126f003f238b1569cdece295d77374ca5fe595dc2779fea03c650eeac4 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edda65f23f66e28949fd0dabb64e0cc8ed9c78d6 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:515c66726aa519005123798c08b38daffcb26ca68e1d25a2acfbabb312e70916 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1236466953ea3dbf1d5b851bf2ade5ceee8b325c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1631272f91237551169d2a521cbcdce91d44c8704fe9fb29f397dede86e249 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92ba4aa90c016cbf11883830d53c196feccbe0c4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee4f70a5d1fa995e7c8a17ce41ce388d15eebb00a0d0ad0233c2e3f2db08f742 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..739a0c48203fe431acb49c784dbd4341da7fd34b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29258d8626bcfcd351f7a906d126abd88b12fb9dba4d9de440aa3eab4e6e83b3 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa537a703ac798c13550722d6209759ba5a185f2 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:3678b7c4d4c0cefd8ca0e79c6316b2b7ee1f8c5235ae787da2cbadee6c79aa3b +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ddaba7e82248ba96644e45e8b2b5bdb29489804 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dd816fb08d042170c0d9ab661171b501f9a5cc9b1c36218885505713ed0d38e +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91ac36ae550ea01e3fadc4c2383a8bc56f8eb393 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5919c6f0d8c8c747d1112c68c9e64788ae7fe1f8b00cf5a029506d3a37aa5edc +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05b4c20cb6d64fc425f98fa6ffcf29e37e0b7558 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6523716a47630ec6c8deacbd039a05987947e9ff7e26fde341f2774508118d1a +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f24c0e6ba0e4b5e2597929fdcfdde50080005141 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63394c6c52f6a75b066683fe8eb219446b74282b5da86b002e00142614fcb2cb +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80d8d7ba91f39def2ab9c92a376ef8f27ecc2512 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164e953256cd9ca3a5439689eec7e53f5dc7a78591543003036d66d828e71a98 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..766222e820c2cc04d7449c5cfdb257427870f42c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07996abf124a4857d445921bfbcbe0d2b1250149d56ce505479ce6fecbf639b1 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0c9ed7c4d50df432d728e9f6d3ac9000ae3f012d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba29269d8a48d672aa04b4c4ecebbd4ec4bd2863062f318edb11db080f3ed61 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b3c295d7689d3d57eaffb6567b782821c1da7df --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ef521c75c654668f8565262cc520b37c1db87190560adb9f9efb0c31f22de3 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc15514dd928ffb3ed6a0734027ee60b989a57fa --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3374d1f3f7e295fbea87494f132fe95038bbd911532412de16ef0cb781e1399e +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d7acdc34ca2c6a76afdec77260a55ab4f697b28 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5011db5fe09bfb6cd9b4d727ee5ae454c1139ba45f4327eb50bc93a832aa897 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..feba211248d5676f8ee8a427b4b9d481fa7688c4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a326ce511e6cc6e73ff1b3800a2df1aa17c1d173b1fe24ffc63c175aa4459fe7 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90f04470aaf2316445b3a3755f593394616f8e03 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b46d969a66d9058b0be99d6171aef5f31cd0cfcb521de8146e58773c24cdb39f +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3c9c4c94cf51badd3803a6bc8994096eff2f7d3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0bba4f3a343b20c0622cb38b302960ced4d9e37784c7fbaffe78a199ce30f8 +size 199058658 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..991141370f5a4e8e26efcbec8874cfb7b67cdb51 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f51bce5e3eb1b35ec35cacb723bdd67e245789d741a4406bfd44545cf84e3df +size 199058775 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cac7dce9b97e86bd1cb7a5d0dd3ea4c72e749996 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61aa9aee53dc2b4aa5502a2dab92710d84fc61786dbdec36c1d3b95133f2bc54 +size 199058775 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26ba5324b273b2111467bcc9eccdd9bbece4fca1 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c538182fdc0c9eb48d560f4f4ec1c54bfbccaccef7db7484e25dd26422321d +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..578c9c3b1c83733a83d3dd8a023e3c835982895f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9b1ddda22d598f2d6d688d5fc6c827f7049aebb7adb086d972bdcd80dbd9e7 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b62a67ea379c01afc71fd2157699ce8733c3b5c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fc2b5954a33cd88e6b821d60c806783cfa176d89bcde662f03ce6fa5ebc0d7 +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b53a4221d1a08452404c3e4beb467aadaea187ec --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3738d8298f7287d9a17f984f9bc274be24d8a9d4a70c3669e60e29cfd1f5cd2d +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..869101f3024f373aea8874dc05a8003e81488679 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:28f2d1051541a962dccd60c2e7ba96f9d177ad72f4760d5f6955dfbd3866d31d +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e59ff63e6c2aa9f5734285927178f631051fca8d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71b78ce2216efb489ca05aca1f94c217f09b1e8626575511c2133237e2d29087 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bd679d7af1bda4a13977c3ca68e84382db6208c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e99b4e5d7d82e4ff6dcf2ffbae0da93358c7268c9e7282f42f5f2fb9ac8fcb0d +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78f2073d574396599ffbf268ca77f84d67c2b543 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac2cf963c82127709b3d7886cba6e12a5b4daae49709c72afc955521e04fc4c +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74d4a668c4e251be82baf96ee87b3e17ab923f82 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062ecd0a9645c183a78fa1c8c0a3e008cb8df295e6781a4a5ee7eeaeafe09cdd +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c64923c52604ef0bc5d653368a6f5a97f5e29d88 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:468c1acb635e12a975c825e1e86a66cb011b92501253a0a922c24062cd076e87 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d1dcf1ba17c5124d999ccbbc075beaacdf9bc7c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81cab7a605a5a6edab714ccb221f8ce4df7feeda5a6f075397881693e477eb5a +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e475bda1163f5175977815d4eabeb306f4d52c51 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0001bc85df76363eb97b5b0b82294574382db0dbfb778e58c15f8cedf2726aeb +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afe6ea5bfd371621668d39a3b20b8dabdb5182fb --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9eec49ae38b056a8d1fed367fa35e4dae900bb3e62af06f2a974a7593eea94 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9b84751abb21a026083049c58815de487d8113f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0312dcb23c619e20bb2a9a0e9456ecafea403cf20ba4b8e97ce22fef49f61fbe +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3dc7f3498ad9dcfcd4ed7f58e8f929735707e9a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab6ea2f63be47cc41b01b84e3daae758ddf39493e31d005ec4cd48c00f76f02b +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf95c9e89c308e7d027395ea4e3260e756961e44 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5545fd2c9bdf8e222be12d5d168d1db7c21dd5ee9adea08cb0f0ed7e86cf7a94 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0f4e285803eb8c51359be14d1fde7aabf08caee --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc1809a8c80dd83e2b4b4de4e3a815462848fb1f6458f7441fa046c42fa0432 +size 199058914 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b8911c0cafe9b640c361599d2e078185f4bdfe7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a66f492440ca61c459dd5f81163633945b326bde09cc32cc31d148b51bf5a62 +size 199058914 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fb5fdec6c366c9db97327d9d76a9c594900f235 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d08ae7ff5b5442a1ed2278342c5f3dafa70a9b0c498a3c304cd6e6f7afab83 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d059aa5a64d1d37b059c9353223b47b42a0f0f3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:853c82ff44d6f5c437a7a3bba70b7ae1ab8576f6653e43cef4daebf0d2955aa6 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a135a1a1c7ccb31babf43cb134dbea9ca101ad76 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2989da05f16aeb05cb167f5ff8ea6a1a053aa86f5708f391e617c56a306fe917 +size 199058711 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..095002a15d92b281c2041bf32932872dbe5f6479 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95441bb8717086e04c0e6d6d8c6ea3051f588a362840f680bd27e961836ea506 +size 199058711 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ba261e9e034f7a9f49d29372a29626beb19c554 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ce58f7f3f90517b072e620cd0c780232926b1c42c86107783271ecfa1507f7 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fef7267baec77129f18322dc3d830b8a606e8a7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f8ff7c7dfc5097d9a7a4bf29a3d51742840a373d2e5e952c94afff9930a8772 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dc270cdd35f76d6bcf7bfbea3b37ddff36c072a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d3c73f05a43d9f7d03cf68d022d530fa94c0b3ca04eb6ed2853dfde80cbe7db8 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..055f86e2e394e581b27eda74ba15b9f925439ad8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f4f101488fb0f6e0c5fb2e6648f3809125288a650552162f5ac0e74f48119cc +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..631ca4c4e81e27d438f045b64d0ef43f4cbfdc4c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b5e301f9ec3254b82dc4ebc75c4dd071b5c22635c55c3e906f705962f680dc +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32e11d285890de8f8bc42d19ebf9c5b7b2b2bccd --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331150c9b2be9428e60f9c3e1e907fd49e1831a2a200c2f2b7885227920a4b33 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea8f60cfa832e3f44332c3ba65d5c4ad68344bcd --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f6233907fd6a32d387ec8bc0c7ee073f219cfb1b5c0bda30e3d9e1756fe32f +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e91bab6cf65d77a0f30dd6fba3488d3cecebd294 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af085f67c7ae4878969c437a61f72069a426df2878f6ec13bf164692933b56f +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e209b835a6eaa5a01eb69411beaf6035bc4f290b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9cdb19066716d5cc62cb3b6973a66747aca31fdfb558c16d7d6240154069e3e +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8ff55d7aa796575cd979fe2b4874622cc6deedec --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05564547c6d9616277e0582c7b5791bf55d414fcb45f97c3186d0587b418c66 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec6391b80c809de0770600ccbd2c01dceea4bd0f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade306ce9327e4b1015f5e245bd0b35b1e7476b2bc3350eae01588c65e7a384b +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17ab634536d028278231c4123f4a3f483711e49d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e480a97edcfa4a1b50f44d727fbe5d21c5bbf3d0e443d163513647314973db +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90717b77b2734030b1e3c23a688cbfb6f1710098 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edb9fd0dee9cc77f8eeda0b6896fbf1dafa97a3ba0eaa215ab8b7dfbad583090 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91d8c50cf41ff6df2abf3087f5f2d306d1e3d045 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938c8fc1061adae96446fedc0a8635947105cf67c95ecb750f52f5ebed18d24e +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa8a0ceb94d1d7b7cdab8fa6dd2709e769579435 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97f869e9b9f91f61a34579ad2bc0eec9355a2fe20f3d152535b205420ee8330 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa0df94de32daa5f2840410193c4b5afa6418a17 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:491de82df1f98823836b8c0f0cede76bb2f1720a4dd8b9cedd3b061733ad9715 +size 199058786 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1eb430fa3fa6cb521dd0c1cf9dce658a0289c37d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b250914431c7d165fe50cc7ea5b5c3b1b569a057c3f93746d2cd0c8dbdc340 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4ca6308b0ec26023bc146c4f9989b369dec25bf --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c674071e3a011be80462cb4cd3c48bd896fe978c76cfb5e201d3f11a39513091 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..277557bffb9b799634298f87b9a5f7399c51fc16 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8caaaffc07f409bfe4c0029d030875abfb959ebc471ebad73a008308ab025c54 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c109fdedd6871486b0e1a5cdfa6c5002c18b0ab --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f81ef32c13f5b42a05e532c695c04ee1fc518d22560ff52fec41e5e6bee3229c +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17b7264da90999987dbeab93fd2dea65c842dcef --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207953ce0e866ceecbf4ff20c2cb173644aacefcd000b08c0aeadf537ebca4c7 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcc95f8f301b76728b8228b35ea035b4bb9b6303 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c5f2075bd9a89d7e5f7c4302dc3c9ce46cc42a8c56b9c32344fdd11266a72ec +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..198e3b8dbc3f31322453bbf08df3579c9e1d2425 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:aa30d053c629d4d075a7e0e13e7db3a2ea37d414d7d18a57bbe794a6713ce224 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c517270796b8520d7faefaa4ba75a90559762f8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:811ecc18708fe97fff31b21c6b6f7e9ea58b091e3e355550ff8aa00275a63c0a +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24bd401f3ada3f998603bb7adbcd061a43cfbeb4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a17dfd44abe1822dc60d17004f2b292398e8bce61a134bbffd7669a4a46ba0e +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e7734a0d5d8875bb39ee4fd138f186536144915 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54fa55671894e0575b2f9ad25ea7a3db4513087237ee1596d4a677a24ae798bf +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4398804ce31cb9d3afb449cc5ce0402812cd6224 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb3b01b91c2611991d42fc31a16040b7ca47e42bba420d0a6b91d44d8a934c0 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6a6bacaa911e125761aa87272952aac59ccc15c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f549c132caa8df71492f54fea5af8d6b3d8798dfe3de40d72bde338a372c0c5a +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d67dd18f7edd0887cd28f4374a5f4dd111eb1855 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:823dc676b08c141f73188a3cf0ca9159d6412aa5b4bd6bfcbd2ca7a71a4df131 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6cc5b9feac9c41a5e2074f2a99968ab23ba8d461 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1b6e412ceb0b398299dc1fd8a51210c118da9845e38795c76544e46d21385f +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aacb274332b78e3352171d5b33529aea6a0759d0 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d920b3d57dbd6f11f3ad5bf0aea7904f8ced4c1527fdd2ca68a34ed98cf9767 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9135c0a762f4b9d7855e1e1086710a611929caa --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c609d687838281f1c90a9c61e9d19adf011be1ad360add5f054fbe922a2a2d +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8d1fbb68ab4a24ab77a3763cfd064fee10bf5ea --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42ae71707e1bdf299aaa9dc2017bfb1e0b0ac29e30f5f69317531a71170d786d +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5665ab7c62ecab22e7595d341f4299cbd50ed4bd --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b480bd383df93dfe9b9c71c707ebef32cfcbf28056cc09bb91b103255cdcdddc +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..538e17a22a4587a590cc2e17b051d2dfb3519e1a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:051e44b60cd2bc73464e01f51cd7d8b5f38b12f0c933a96b8d8ffe3c88b63bea +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..314af68dd05b02f561459f4e8270261c4039c8fc --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:485711a9b34349aef365d774863dddc091523c31e8a279fec33a1fd4dacceed0 +size 199058786 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4356f20ae489abea25cf7c3c79efeea9202b40d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5589076b086339adda68745561dd1f99bbb8c073c3061296b94818792069d20 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adf9b8ad046c6cc78ef165b91bd5f8166d965cf7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d44bbe6649a0c28ba9940b76752b388689f167bbcaeeb22ec1e78d07ae61013 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aaa586019a786ac2fc6a3e2655e7621214621bb7 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc81573bfa9140f34c661313bb4aafe90fc43df7d792501129f26814181119c +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef3a57254e0dcff4d2eab01586a565b1a01d29c3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1969c7000688fcf4833ac58b3a6071b73e35703b1abfd8fba8b26b450ba578 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22be32803badaa1b95af9c9ff1e022a2a5a3060e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:540aa6b756a1e229799447ee5603d3d1d593ede0651a87020da9b47469aeb7af +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36de9b4c1809fb9736f795878190a55e393df729 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47750aa3aef54dc0894bc4325662b47ac4ebbf6a93910e2b0936aac76e37518 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d41aa99b7fa86b03c4866d5f5aca1f2f3d9057ba --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:a0d760ea1dc234326c4d4ff7f5674ea7fe0ce1b66f1e2c9504302c95d72ce55a +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cac44a9fe742056189b5cfa61139dc68596019c1 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106d55fb703615c907f3d09dd1c1283cb4ae8e73c8fa1cc4ec24e02380c6a7b0 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65d8a88de2341faf8f793efd78d2d127df36af8a --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d5023d02763d632c63c3767171f20069ddf733c015e9460876d1cc31c79b4b +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b970a2069cee8de38dd26c42d55cb30f6a4a50cd --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04c90cde15e4cb06605f6c9333a2183d209f0dae3da2b10323acb3224839ae1 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd625686db7fcb9594cc7102a0fd2ab333e14abb --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b063597eef2f82b139da488dca73c60a25856227d31cc7d49e7aa3aeb9db805 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..904cd97769f745c77cd98c12a73367a7fc01ed4d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fdb8523030d2a6c52e35d5775e729f87d9c61a51414a2d590ed72cae671d4a +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f39cf0e8d37e569bc379a77c0b8ebefc6131530 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ab7c58e9f4443f37c2063b1fd591360cd639389434541884ce3154f52149f8 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e7e220f18ecb8a5cb4ea1af86bad8cb709bd025b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e311f2ee0fe7767cfeca4203fcf3d8de3a8efc0d5761c17401757f4a4510e698 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5aa9c38e222213dd8763374069fd860d631652fb --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01789953144d567c7f8d8374327de199e065ee002bcc2d36dd7f5f61e81d9957 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d535b34b5fc96a65ed0f64b72069d2093ef2843f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b837f847f6a7cd762b02b0d5b3c4ecd8c68acde83d0e7a183b6c658d509eb4 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0ec10f2be152791925c3c51518dfbed3a581541 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8791d319a0fe93512b1c42c5e9204fb9d1f1bf9e601d20f57437d2af6a6b3dfe +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43234ce568ba3d61f3a959be8b08bb1cdd7f3957 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12fc76f058d719174fa060cc8eaa702795bb9c1954dfd47e872da1662eebb777 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6df3118115b6d741b5a2ceaf43dd6552497af92b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bb02890719de9052eaca1433cdf46ff188d7f8956a314a80a54ac218eade52c +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b1f287da5171a5c6302b5613854b99227df1a9f --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b539b1cf50c48a9ddc2af45332ed4397b354f08e018bb26ee97bbe340c06d4f +size 199058786 diff --git 
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbcbcf0e99b9f6b8fb966a24134cb079b2772ff3 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cde15871e8fb25f45c3bd7c4798904d2e10f1c395a9dadb7b6e24aa5be9ec7ce +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cfad01f714397ddce1bdf1662c1b9f40e3d3541 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882f18faa5361c44ac185a0cc1230367a397a1e6bf98de95e592b941e57daae8 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b28477e1bd8609413ccb4da5318f556c478f5160 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e0b7eafeca406577a864b9a97f221d3d096e6d94c7b2e54c0ed7f5fbe1bb4da +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d5421d28a5627387eec5f7051ca223679ed3020 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2296b76860f55fd92e1975d4d784e761b61e4b3e8f5fe4b468cc54a4a21f618e +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33a3e9f6c51fa35fb2512489d5949b69381c437d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8829512149ee1fff66294c37f55ce8f6531ffc4ab7113732db1ae95ee5c3a45 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f00bda6cbd24e919fc55b09d01a736267b96eaec --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b12af7b6f6e4478d7d6cee51d3aa1d3a021f346198831303aa6c1344e48914 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92c53c8af874af55788fc5e123955a8c43ce26d4 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:133b8e23e98d448443aaade8ed350c3bd2eb097c09591e0524dbd5ad04076c91 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2cbbd4b5be916ac9eecda8afca0cc393ac4aa54 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21502fbd385758f9b58e4cdfdb53014de2e4ed9025bf764c51cfa9ee1ae5dca +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2a86b5d1abcffa2ffaddb5afbb5c301b8634345 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a7fbe33efa381d7183cdffb27e140ceb8524b676e34dbad92753a52aaf5129 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..864382190feca120c12d16aed8944c7f4a63f937 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53acf480dcaf9d4897aadff8efebc7c6a3a5152d68df6edc237ab670033c2e75 +size 199058647 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..625386e9a39c51418b7cffa9e3290ce4953fdb4b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4421d95abfa3e0df40d794e6695bfcee5b56c1225f758225927fc72322bea9cd +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfae4ec49c23d2c7585bc40bfe04573d871d5f8c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9b63eff5c663c5a01cc25c778ca4addf95ec5cb5a656f522ea76a9cf8c5d2c6 +size 199058850 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cafa692a0db581a8f707b5e08ce8785e2966b23 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b938e630cf2fbeaa63cf62978b6db915ba413083be25d1141647e266651be56b +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..16e9ee8b046f080ce7f5c05eca1c54c2ce0a1d41 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d9ebd49a660b218d6f9490f79078a3c6dbedbbf1fa288028a283f34912850a +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe736ee3cd7027248fd01ec0afb8d0d1c56cf71b --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cf219c78f850f43ae2c6dc3b21bfbe55572a9a71e41ee6f364ab47028c6b320 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4d7e17197d562cf82bc8f9b0358b51f0fb75731 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c918cae0b3c4cc6766ea286e0c8923971becca7251ee31816adba5ee599a1944 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..000065b16ff04a09b2d1d110e921b379ea15a7ff --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114ab1396fd569bc8430fbf77e515b2dd85b72e0e504e56171aa1a6809c1208c +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04b0f1f854c74e907e88b012017eb5bc2b8cce6e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b53a3b3196c83f0e436fb97ac34fb9c53849d05ea5783106275e61f2571e157 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e56a6388a4a6d35543b7b9abcc60a1b629642ff --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261508ef2995ec82f9e97341858b606178e0fe2f7e1a25beff9ae53f7d6b8748 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26b3ce55fc88218b09f882d0d922d6465ab2f928 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06dd373a76dbc400519ac4f60a036489a5986495a143e5e5fda48215540928e3 +size 199058722 diff --git 
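Within global_step80108 the diff accumulates one bf16_zero_pp_rank_{R}_mp_rank_{M}_optim_states.pt shard per (pp_rank, mp_rank) pair named in the filenames, with mp_rank taking the values 00 and 01 and pp_rank running from single digits up through 99 in this section. The sketch below is a filename-level sanity check only (it is not a DeepSpeed API, and it assumes git-lfs has materialized the files in a local checkout); the expected grid is inferred from whatever shards are present rather than from any configuration file:

    # Sketch: report any (pp_rank, mp_rank) optimizer-state shards missing from a
    # local checkout of 4b284b12bc4/global_step80108/. Filename-level check only;
    # the naming pattern is read off the paths added in this diff.
    import re
    from pathlib import Path

    SHARD_RE = re.compile(r"bf16_zero_pp_rank_(\d+)_mp_rank_(\d+)_optim_states\.pt$")

    def missing_shards(step_dir: str = "4b284b12bc4/global_step80108") -> list[str]:
        found = set()
        for p in Path(step_dir).glob("bf16_zero_pp_rank_*_optim_states.pt"):
            m = SHARD_RE.search(p.name)
            if m:
                found.add((int(m.group(1)), int(m.group(2))))
        if not found:
            return []
        max_pp = max(pp for pp, _ in found)
        mp_ranks = sorted({mp for _, mp in found})
        expected = {(pp, mp) for pp in range(max_pp + 1) for mp in mp_ranks}
        return [f"bf16_zero_pp_rank_{pp}_mp_rank_{mp:02d}_optim_states.pt"
                for pp, mp in sorted(expected - found)]

    print(missing_shards())  # [] once every shard listed in the diff is present
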
a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9172d00931ee96884e540f48a148f903ee504ec --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c454688cab5d4a0938d4849d922748f4d014e505bf358370d2cc79774f223c +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54565995b8e8c5566020bc10c89ef81d7536945c --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c54a0cc6f905f2ac01c593c4d068207787ce2f0b554b3d93ed260b1e8fa6accf +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2a4b8bf4e2c4e37c080421e9d26a3e535fe481e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3df44d70e0af367ec3b19bc42d648682a8e6a752e2582675f0304f8f41c5e015 +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71db2c07a066903ad0d1a5e32ef0084f3d0ef093 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:725ff30b1364ab2b5ae40f9c4d83c57c751bf382a35685d36b63f1cc55f3595b +size 199058722 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb1e5d76b38190619651eae51ca82196af64a323 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4607c9ad486bc611e2cd8c23620b00883e3335bf3cb363970fd4115f649e11b9 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4bbba721a00c1a437cc99af3e0b27b53662879e --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e228db3c000c823fd0106a4f467a5299a96ef7456570300978781b83cef0adf3 +size 199058786 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81014966ac646d48c21b42aeebe451646a07b5eb --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:5ae757b58db54c318d945b260398ebb3233d59a68bcae59758fce368533efe26 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e32554b213f03896f2dfd442a1988e3444c75fa --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713963427af9aedf09f84e6186df11a6d83a646a7e1b5454c50fbacf0d72e3c6 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7859dfa83b147293e437714f1af7a350b572d7c8 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa4dbe19057afab2d7efcd79355201606770957e4d61b3c95b801edb55ccf12a +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43a8b7bae5c1c7b204534096c27a1e08587b9ff1 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:622873bed14376d4d22f5422caf68e3f2b497ca345b3d823a3de7dbb8c4c9b58 +size 199058658 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f706985782aafd0b1c1eeb29a1d673ccb335f1d --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae1d4cd96e121e9394a89e538f13507788a9e02d056e20fb29ced4c4b9bc08e2 +size 199058839 diff --git a/4b284b12bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7511688a7cb72cfe96f53fe3f85021703e1e7075 --- /dev/null +++ b/4b284b12bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e3cda8ef03ac17efd526f5fab4b19803607d5876b55d8edc126e7d1ed88fc8c +size 199058839 diff --git a/4b284b12bc4/global_step80108/layer_01-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a1d8ee8133c0fa9efea6187fff05eab7ad87b37 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b43cf5a9497208fa800a5b92b0367df2cb43f41d19d99b97ae0cb1165d80c22b +size 167511299 diff --git a/4b284b12bc4/global_step80108/layer_01-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ef0cbfd60790bda13bdc0d4806975ce6d7c13c7 --- /dev/null +++ 
b/4b284b12bc4/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05590f5a9fe9ded271787490b2ff7b957c438d5a1182d48ebfd2a19fb00f35ef +size 167511299 diff --git a/4b284b12bc4/global_step80108/layer_03-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..883c85261bd9ec30f630a958bcf9b3d1bd29bab9 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:083ad2a75e0a40811f5982c31a2e893631341bc3f0047f036d6bdd8cb37288c5 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_03-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cac433dab1666c491e9a5e41917506f14e441562 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e71eeddd7e12dadb9daa8b0c3570f8d5338846c64848c5c1ee481e23f0d40cb +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_04-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fc3e917b18c6a1d93fc3cd92fdabf87ce05b2ba --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:526c0e170d3f61953ad2511c46edbe30243c41a3d79035bb016579847aa06445 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_04-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0de208dc7e5334db2317cb6f0532bf94a83fd95 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7a9fe4aba5daba29d73d04077d1b69f192be9822bf663127c56ccd6fdba055 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_05-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73f7f9f6b9b2d14f5ee3bbb7aa7491d403925203 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0144388ffb56fe2ea082479ea5192787a8e6c9d8c221beab8cbc8af72d81af9 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_05-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1587248fc77dfe5d7eb37b4e38217c360e0e1fb4 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d5b44b6674625447ae756307971abec724e261649c67db7ea4fa55311e8f014 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_06-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adccf0b9cc4635e8a8a0674c0399026eae871a57 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:198babf3ee3afd6068e93f2c52a138c06c635a92a18df9db8cc1485856d944c2 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_06-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c054b5451388c3bee006ee633fcd025eb087853b --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ecc78d62eb844278f0ab77105881c28927af4852cc600b03330e95f832b8dba +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_07-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8101fbec8e5de62abe263c71bd8959cf8e63ed8a --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef01b64af52b97ea77c5821e61adce6710a79abc453b336a53be3473aef9f91 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_07-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5f1411193cef4604e945ec11563976b73907606 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b8dcaa18461b53016109e5c2a68884cb0415c3619e388109da817e2c5137d9d +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_08-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..691608acc99a6f3b208dd64c6bdf933e544f3c35 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae87d02561fb456cda1da7ae2d1035e41bd08d3f49952ead968cbaaee216a4c +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_08-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecccf59b616326899301200ba00626160dda5971 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f7df1ab58b48e78cec8e719d63ed95bbf190382cc0aa92d0325c68d701dbc58 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_09-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b00373155c6798dd62e170828159846a9b2a7195 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42abb9754c8a17e8beb65a65d8306e925908528d9eb18cbfa4c9a617a5cdf3e7 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_09-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0ee93b9268a25d11df9992973221d3c5c6d8774 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f633631b66d1a413552e08bafc52d565763d986b14935a1a8c7fa2087b1e12 +size 113308931 diff --git 
a/4b284b12bc4/global_step80108/layer_10-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8aa2c457017262b5f07c684b08eb3900c550d9df --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28b44f83460ef1dcbdcc7c2ec7536646a4a2a88b43a29aa79a7910acfa217ca +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_10-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecbbd31df1c7737ae00bb3b349f04f444b919264 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a5472d6a8a08d6910735c59b26bed66c6a81cbb11509d309fb3e20b682a2b6 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_11-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9d8bff70c59800b6a1632f60a02ce073d2c92be --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad0cf04a8889f9854d62db979eca8baf8c0b4804358a8c79c46558e2cf513ef2 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_11-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c66ff4b761b604c9e570de709fc2b5e697531a2c --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de42532fa0919f91454d034e0bf7f533304dedbc201196add1c1ea7b08d690d +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_12-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e74217e3a4f1866d90c9651348138208b57c9dba --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:068e0d5a43bfda68cdc1d8afdcba909d4cf1a6d03c2bcde146f32b7c6c201f81 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_12-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..230130e009781b640d6540178f0a18be5ee0914c --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ff4af1beaa9131dd460c1f47a0cff4b74b275f1e33bea5bed75aa37209c72b6 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_13-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e2b9025c6ee6bbdc2a461444f024217f69f04a7 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:853b95499c4c6ec9ba53623e6e9dddadfafb23bbf8abbf7ad6a9b7153e418457 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_13-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_13-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..d5959d76cc8d89cb3b7c1621c97dfc98e7d45de3 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997358e71784654d8b07f073a3c324f1c0390a9a74fea69db6332be725490873 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_14-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7965208a4af30c21746f5a7cbd123064c35783c1 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43ef6767ed6cb6e94054b442957e9b4e53051a8d5aa2ea59709e8c1aa928861 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_14-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1a0c66f8528cd800822f5a3961cb57561947fb2 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:081d669dd7f0823ccc2ce0a13ab36fbef85f90ad679de8005bcec7db643c0732 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_15-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93d2fb2347232fce3e7f53ef9924d1fe80afb133 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c6f1ee312c955c69e3ada8bed7745ab6a77f1f9e80953ea3780a59652177c9 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_15-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..340156712fe3d50b0661978d42d750d22364d248 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc36b02e51656890e3fefa1f6917990116766d2ac56fcbc3d045b522bbca4dd +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_16-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21548c0d20e6ef166f81b08d3345b1782307b3ce --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dde6616b52763a069e729e94c6f911239e11d6a24dcbf3a634e9e3f7199936f +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_16-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5c7e4d20463c1bb8fd5c873413d2cf62bc09eaa --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a13e373278186f93a5fe4f6b8bdb0ffbd59bf1bbc5be8726961152e4abae1de +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_17-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..067ff8bd63cb7803bace202569d0a0b24eeac827 --- /dev/null +++ 
b/4b284b12bc4/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebccb0b85dcb62986e2ab17760da37909857e81e2d045149063678bc71a1519b +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_17-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0db94cc47f876c130a61bdd04064df9545618709 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f6ec455442b3b6625fd172fb002a851d4fc07227353eddfe3c1b032527f1bfb +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_18-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dc35668b233e1349891018bf21800b5bc1d0315 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d09ac247e481a7d0e0d028038f7e30e04dd693eed7f4d7e743b696019c33020 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_18-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d84cf420378e1edb78ad2b754b53820549bd76cf --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f72b414b08fd653b48e42816d1c50cd9a3bb96ba0088885041a72217d7dcf4d +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_19-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27db77a9e0ab0d979f5d83def6ccebf3a56e09bc --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9152a1ea878cf029d3633ccb6db86c6dbfa6a608e91aca295d10a6ee0a6eec8 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_19-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b94b94f8c3fc01362c224cdb94303836f658e58 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a04db16f1f2d28860a6c6f4adc3ef1bac4efc8dfe8a41443830ec8308e9691c +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_20-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..056349bca21a23997eef508fa81cde7235c31b49 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d693052cdc24019340496a1e19b85281ea452b7c991d1a1e8186f330ee10ee5 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_20-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f204946208ab9812d1fdb7d1ce6302f42edb211b --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7936ec576955b35778608aae82fd4657816ced2bfa62110fc265784d51395b2d +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_21-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a2d26c908c3ee78622e3b4776d691cec74d3c74 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4def31b6d297e9ae6d0e3860c78d4264e502ed558812650303bcee3dfe721e +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_21-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf9d2767998a46dba90e2d9c7c127e5ac9696a49 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5ad9a264d1a1777bd193f632eb7c1e4b7d2e9881258894e4e09f92305fddfbd +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_22-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f238ff40e5339de9aa2e108448e96456645a94e3 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e9e47c787481dcfe196e265a377e21d7d54a519ce6f0889275c05fde5957e4b +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_22-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e203a994a4cc56fc74568550efae1641036af31b --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6702e38f91c897232fddf1ec34fa3c9ccb7761dc1a601c61dc6066a2c798f0b7 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_23-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbf829f176cd330a6c060c464e06f0e61c4ede23 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d05e9dcb90a19e99c02bb10b615c8aab5024664e47b7c6849c55678de958a9f4 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_23-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8cfe7a0f5f37ba2a5c53977fe53781813877f51 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7569c6041302a486c494e8b0dc7a54226aaca3ca9aa837132a8bc85e1be4732b +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_24-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f7ff01478667efe964faf53a4ec8bcc31597445 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bc926449abd98a04a752965161a6e6154f8240b7a903a29d8969dc6071b9ff9 +size 113308931 diff --git 
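Alongside the optimizer shards, the checkpoint stores per-layer module weights as layer_{LL}-model_{MM}-model_states.pt, with each layer split across the two model partitions model_00 and model_01. A small grouping pass makes that layout visible; this is purely a filename-based illustration over a local checkout, not part of any checkpoint-loading tooling shipped with the repo:

    # Sketch: group the layer_{LL}-model_{MM}-model_states.pt files added in this
    # diff by layer index, showing which partitions each layer is split across.
    import re
    from collections import defaultdict
    from pathlib import Path

    LAYER_RE = re.compile(r"layer_(\d+)-model_(\d+)-model_states\.pt$")

    def layers_by_partition(step_dir: str = "4b284b12bc4/global_step80108"):
        layers: dict[int, list[int]] = defaultdict(list)
        for p in Path(step_dir).glob("layer_*-model_*-model_states.pt"):
            m = LAYER_RE.search(p.name)
            if m:
                layers[int(m.group(1))].append(int(m.group(2)))
        return {layer: sorted(parts) for layer, parts in sorted(layers.items())}

    for layer, parts in layers_by_partition().items():
        print(f"layer {layer:02d}: model partitions {parts}")  # e.g. layer 03: [0, 1]
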
a/4b284b12bc4/global_step80108/layer_24-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2d2598be7cdc672830fe52c2e85760af5e575e4 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aafd06d600b330721a3c7a30d61c9c61668fb905fc5e6978c82dcf86bb270a64 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_25-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97546300629a687d86ae6a8001c9972f28d0d963 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45796af2d6a41a3a56375f45f44ee6cbb5f6046cdd48aa7a739185f66c0d0d52 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_25-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9d7fbd8a51b6723740e6ef992ff12599d791d0b --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547dd41231490780adde603a58e1a1bbc5af435bd3a408f34a1710b5daa85087 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_26-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13e4243656ce257fcc3a33999053f21e719688a1 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e6128d3d64e237fdc92858c84ecb2da92c57c37e41015b979b9e85333c2e9e8 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_26-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53b9ce165b5a3fe1b5db4d40f660e3d409fa4fed --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:282210ae8f45b3e4e25bd817641c2af14338a1eac7c103c6116cbf214d092ffc +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_27-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b2bcdd03da578283b5be1e04cdb78de0dc029ba --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:842ff3d6faf3074e6770077a2fcb9efbf795cdec6cdca16e78250872463eb1cd +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_27-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa8338f69e130443c3b5287f85601f601316866f --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa67d998aef873cc7ed19e97fb856e02c2adab0a3468b73785aa1d3301ac3e96 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_28-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_28-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..56de7de3786eb233768141db8525796471814ca2 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c5abadf6295729147a5e0b447bfa1376a0fae77b0ea8ff8f173182678245428 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_28-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad62960d12290e9e8122e14e92d1e7b6b18a6c68 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c526abd550d868b23df80d9b603aad398757aba334c6cd5f794b32988f8885c +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_29-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..568084f18393afa7c31d8b17926b4d5c402633ef --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c661d4491616720868a5e72ee785eb6284755b1b2f37a197f00ab7624c29475 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_29-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..861bda6c6e554dbdd86078c1557b8ed044e86153 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9c24f54071e8d74bcf7f4365232b261989a3e85db8d07b097562acd91a3a691 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_30-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a83dfca72d612b251e1ce075a6e282ceaed4271f --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f7d0e3d3de80bbeb4f7010b193bde80ff25b23dc8be53cb308c74e387856ffd +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_30-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12e37bf39f2fbda6e3f618439d3c9897b987f95c --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa63ec976ac884d4dd62e7cce065938542d93ca8475664b1678ddd568b1baa1 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_31-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85460494b9a34d6e2c5bc619dce74910a5187fcb --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922c2f54283079404f8e04455090e0b63612ce9f3ffb235941efebcce32afbbf +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_31-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a77c21dd8eacc41d48243ed2a6121766f1e3f16 --- /dev/null +++ 
b/4b284b12bc4/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ce5f6d6c396ec5870a88fe62a544e782f117fd01b9f72cc2070e65e9097dcb1 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_32-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8eac8ba7052362999ed4ff089462c21f94a24462 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5084e6d9628dcdb20246853473c41c9d508bef628f9d2652a9e0f616f993c705 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_32-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ffdcc0a5734a5bd68c3b2b3a2fd604f8526c594 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:903b101ce8c24b06ad01d4f69e220ebf9ac271b61af9f560cb39f874f7432b5c +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_33-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71bb6f1ebff92d9f05601906b60386078f986bc1 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:179ea4048bc133d08abe78c1009df9142781f554523e550f5179534632df7a63 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_33-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98464ade86dfd4f020ae363f892a8683c44fb2c9 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0a743f0f9eb64fdbd6dd52adfc4882eae3e53f9bf05c6af95ecf74a44aaf4a +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_34-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db97c295d2c365d3d5a257d4a54cda4e40b7032c --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28f46d3864a730fb1994e4266fbc30a853414f3f6ba02c8dab5baf788ae9da76 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_34-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3a1573320d9bc85dd893061555b4987114116e9 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab415155dc2b6c8821138fdcf649aaf9572aec41ecf9e9f19b2af249e70d3abb +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_35-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f799617ed523e020ab725896d7d68c6239117260 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7c6b8bf24148d0c15962a0b1a244cb0ae8ccc0e94d475063e468233729e5cb78 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_35-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..879aa2e5da8476ee8ab229734cfe54aa5e073704 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:881ab7c7074220492b93b9e6f5c04312dcadae39808e250987418297250504b8 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_36-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8f754b25d7fc57808b38f503d6dce4bf3a5c4c1 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d988e3e7d8627dc51c11c8cd6f925bb4e5498f993b33e468fcacdbb67651632 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_36-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f326781fef241a86fa7ede87f656ba4e661fb2a6 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4489d0e7bec38a7d3e8634df4a7b3ed92323b35de61b53b3af2a346a05cb2e6a +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_37-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55d58de76a763b3b0ce2cb19f66ebec3a62723e8 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7491b6ab9acbb046e366890b8c7e820f08db420775afe4b4b7e93ef6bb558768 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_37-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a99f00a474d13ace4a70188cfe152695508a4351 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec1de3d3f6810b76763f81831e8eef8d0b031692b49d7dca8b9d91b05955acb6 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_38-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..580335639af8082b407778159266fc06fa9e9092 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc8cf77453d2c4cc3e032b82c8458d6a9a55f133937b139f066ea6f9b5bbfd3 +size 113308931 diff --git a/4b284b12bc4/global_step80108/layer_38-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fecfbe68dbe0b09c166c9229350cd4477ecf1adf --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3315c03df36555ef0e66767bc90434797c117fba6398b1da2fecbdfebe5e81d1 +size 113308931 diff --git 
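A rough consistency check on the sizes above: each transformer layer appears as two 113,308,931-byte partitions, and layer_01 as two 167,511,299-byte partitions. Under the assumption that these .pt shards hold bf16 tensors (2 bytes per parameter) with negligible serialization overhead, and taking the 36 layers from the transformers config added later in this diff, the totals land near 4B parameters, which is consistent with the 4b prefix of the directory name. The byte counts below are copied from the diff; the bytes-per-parameter figure and the treatment of layer_01 as the embedding shard are assumptions:

    # Back-of-envelope parameter count from the shard sizes in this diff, assuming
    # bf16 storage (2 bytes/parameter) and negligible file overhead.
    BYTES_PER_PARAM = 2                      # assumption: bf16 tensors
    LAYER_SHARD_BYTES = 113_308_931          # size of each layer_XX-model_YY shard above
    EMBED_SHARD_BYTES = 167_511_299          # size of each layer_01-model_YY shard above
    PARTITIONS = 2                           # model_00 and model_01
    N_LAYERS = 36                            # n_layer in the transformers config below

    transformer_params = N_LAYERS * PARTITIONS * LAYER_SHARD_BYTES / BYTES_PER_PARAM
    embedding_params = PARTITIONS * EMBED_SHARD_BYTES / BYTES_PER_PARAM
    print(f"~{(transformer_params + embedding_params) / 1e9:.2f}B parameters")  # ~4.25B
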
a/4b284b12bc4/global_step80108/layer_40-model_00-model_states.pt b/4b284b12bc4/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3be0c786e5841ad6bd05a1efe7f147bcbce1104b --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59a3d21e3307e4d248d7ea8972f0295204d48b311ad6feb4ea2ea9dc1a6a6920 +size 13507 diff --git a/4b284b12bc4/global_step80108/layer_40-model_01-model_states.pt b/4b284b12bc4/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b07120c6f365f3d95e0814ea2de2f1b8c480281 --- /dev/null +++ b/4b284b12bc4/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52339716f21730cfeb7ec608699c81a6dd3092e87af155c834b994760f3df4e1 +size 13507 diff --git a/4b284b12bc4/global_step80108/mp_rank_00_model_states.pt b/4b284b12bc4/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b44e2388d254cd1b823485770eae601eab7918ac --- /dev/null +++ b/4b284b12bc4/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a6247c25a2daccfbbbf70d0a8590b31ad83e347ddc4066a66be0384693be6bb +size 51443 diff --git a/4b284b12bc4/global_step80108/mp_rank_01_model_states.pt b/4b284b12bc4/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2dd0043abb5559e26f904522bce0cf7023b4639c --- /dev/null +++ b/4b284b12bc4/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de9dc9cc0ec90347dcae7481329d187ae4fa6784f396caee3b620b8b3200fe2 +size 51443 diff --git a/4b284b12bc4/transformers/config.json b/4b284b12bc4/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b12bc4/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, 
"forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b12bc4/transformers/pytorch_model.bin b/4b284b12bc4/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3897675f44cdda1721f2e759e3193fa89cf22af2 --- /dev/null +++ b/4b284b12bc4/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cee177f8e2d194654c57d77911209e4caf3a7bd25fd5a88b85c20bbc98cd500 +size 8781203669 diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6dfe3720a92ca48747d6d3a0cd746e5adcb121e2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4040857346605273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04358756352339084}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0759904796250538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019478615830651011}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3009878218567671, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0046586299284223885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11153940555452811, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002178375350508395}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03649587266689502, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001284173883599284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14683508450255534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032437032345681857}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05308201459552208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013729761880117914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07180766426825755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001761388989113738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.28987291523705844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045326872175802165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10594741371659425, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019835178597520093}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07245769602542276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00184585307065297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2877191217238231, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004439207690226351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10637513260264994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00204505600505615}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6398f5eb6e0ebd0f98efac7cbfa56caab560cf4f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5179012826475189, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03546328546887922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07590692259956473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015260502670476222}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3587176031003754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005304902318303979}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11715894355967386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019757967913343107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03541306461486918, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009438407351314926}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17490815925289047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003575953294927914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.054620931903283015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001228796960295478}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07063353254188104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013315866040551792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33926359580036936, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004980489876121937}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10948455596662156, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017556314356608658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07145722539365447, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014201666549177136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.338101035904291, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004861723234525834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11027827926137282, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018206662711825689}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c9fb5e302e91c64cc559cf0a19765bc124bbd79 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5353533242406296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03431413900192352}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07083033588676813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012393618247087826}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3713979630102006, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0052635629154557445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11225032910377221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001729127211467464}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.031078186918670876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007042214847925669}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17869319720275487, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037152494180892654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04972618028817665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00102253346010308}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06478794864331923, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010868099908157105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.342386434900962, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004791381155468985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10286097604129013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001521136220014524}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0666711337015857, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011558366976764654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3495506191555596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004867024005105769}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10565101438548488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016072411783332626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d0220cfbeca7d412aa87c9c8328c2de2124e5863 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4473435878442557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.021920475877328035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.060184346315459554, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001087884777571127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3414484174381052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005172668914728109}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.09638137782837077, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014829064557002423}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.02365049876833611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005803565276567791}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14942490983015533, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003513250894148731}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.038282775688304856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008484797038289617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.05398887767175152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009593908114878796}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.306677112968288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004574894720404393}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08646072645548598, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013059843771721818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.05643326067698096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010263660613837862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.31950364177604385, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0047753784913865055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.09025064736984334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013804266464441872}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..efe0665561d9bb6d7637b9c9313ea74966bb0d06 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.42391682641977435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.025185202302157747}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.056211864304467056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010231526331645241}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3243551913604374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005025965693402745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.09021138584698546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014520692052325743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.021734837271928865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000537789346158442}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1381516300501002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003431494708848335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.035274756528572794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008164665662699456}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.050592287065444816, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008807998394512943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.29101119493777294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004368680842712586}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08124379060939273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00126253468227921}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.052801000471684074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009509504143137485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30284581980574177, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004598745476101752}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.08470874404935472, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001361092950551602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f60f988b21b4490ccacf2404a92dea8e03222dd3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4676760272424504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02303026154350977}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.06321141539943463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001223200335826357}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.34518893665726086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005046837181813745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.10005721103740961, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016082944292364137}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.027408388290100973, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007495147001664794}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15813941094305575, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034339025470937284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04337652772461485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009940986270453964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.057943890660243344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011331942696761865}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31252933987652376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004418336624345426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.09146925689558233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014605690131820356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.059802685455499036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011813042691715043}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3231337567814217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004620411779037274}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.09443075155430865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001535767562485951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2436986a9ed5fbe2c258c7b32f395a2e512150fb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.06635037756207988, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00222464644174656}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.14012624989204728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002036835675458236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.0649820340050219, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016774667522408912}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.003020102888519982, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00032547132764282983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.006611308173835243, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009951093670627143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.0031079038623638374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00034130095697030574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.05754089923288528, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018114419198071445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.1332053573384668, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018778409471417506}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.057959376823978205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013701772199315872}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.05219309540753949, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001901332766377738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.09949694242629789, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017781434541041129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.04917830665991281, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014301781368119162}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.1350370966328972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.027107668307824653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a66900f56ad595897c11aeb270d4a547c17839b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.24515040330368046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004054348224566517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.42594170823778976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005787897761386721}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.2770237126205559, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004038789167972144}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.09175996157947801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002601492839334607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.1684955936702656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004064838991261767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.10582834218904164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026426543643199916}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.18555197093693734, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003147342164119049}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.33280053513758845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045573071529510685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.20936546936617006, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00297046089780952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.21610700955371467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036569952801652067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3772047553428585, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005272234747783686}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.24390484665607676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036238123294501058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 2.052485569424793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12102973129799857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..773a1d9835235fad01bf984cf25f696b89813f2d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.3566584079552788, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005545339920311794}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5400091585387286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005358867844689096}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.3738311083147904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004303874024559728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.1806299226100141, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0038767177534562857}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.27810862336642045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004344946134160203}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.18715015079380715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003209239165332672}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.2774912289584729, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00456955894576886}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.43176983982503986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046883692841029135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.291139847215283, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003491774771956361}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.3096504505654974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004863905721617155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4751007623048103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004978268403772034}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3252458107512585, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037896518417690843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 3.5977911950601484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12923912317137534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc9031f2206b7c875bdf61333abc651705f2f0ee --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.3507235841349617, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006063184051971503}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5514350155602691, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005182852642362103}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.362382378974461, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004512618110179221}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.18695632263623327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004276682318235075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2924425597856253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004413282563913562}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.18921046750479484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0033743653265960907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.2760401929409245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005081600899523718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.44615489211207415, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004731816271399687}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.2847450223443878, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037300768332280904}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.3065083273455118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005390003916850601}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4874589762506608, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0048935064804825285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.31660483791521354, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004003089154824215}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 3.364926224666278, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15045294250595048}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a4027c8f726a79fb3af127527d8403d39003f4a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.33093996934970976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005991487675562594}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5449792771957667, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005139692154283905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.347569335324534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004544187677600413}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.1780670412943538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00422080166488958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2914835377835669, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044150558699339055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.1826637134726461, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003379682428755349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.2619538812269815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0050299249467279095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4421088823491754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004705889366270654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.2743549921011115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037902551916263946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.2899719423937745, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00530779434079022}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4827220286166995, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004857508825913098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.30488899444485074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00406398308359489}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 3.3107172893336636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10227637753147577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..63b1e43823ce9ead82c0878df21b2c5818206c91 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.3193273080501676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005974116499113547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5422038885918751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005190435741567349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.33762053843234585, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004523525986261017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.17378010156605053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004277219617652875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.29377091837736397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004471597574308531}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.1794175550575056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003344405579036441}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.2557225993181413, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005171558754142887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4418326332600047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004753839209134025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.2687968014233197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038430798579204405}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.2829548142815575, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005414729183997336}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.48224704322769923, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004901420700466331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.298462025989663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004099301415037842}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 3.3115358177828296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08566639180105731}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9b8bc0dbcdab36f4471fd1dad49e2ae3db913c8f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.12648828841845464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02172247828129552}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.04385963495568408, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016141799634558302}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.20910265806376718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00260407682272209}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.05185506267199114, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010693704857640345}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.003657728710184829, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00040744794041285926}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.017381693068654103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011150256177358017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.004093823034187833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003237805033460212}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.03973081953475384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013217691489049088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.20624804575610606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026094299138881873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.048929691951989984, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008927940829166845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.034850412608671714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014877807289763565}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.149019716230595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002039697594288268}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.03823939846444661, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009439507711198298}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..190f1791a90e6e4ac029be6b081575715b8f2375 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.9717259245306719, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05184305667078613}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.07728525564656695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016345680555964624}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.47912345654951694, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004974585856018974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.1251494842748045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002228010113423068}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.030217086264151687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008968155132881358}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.19718798013757022, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0041103934021723375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.048933388223398, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012848999460203356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.06575830202193088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012184254155840335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.43631558492945605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004551703262770418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.10781056512383318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016643371248226597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.06685866773660962, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00149258308259344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.41789578588003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004754874400535324}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.10816019192402117, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020352482624991708}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c30c9632fd622bf556ed849ad86b6f0e045a02b3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5553302383124015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04732740194035888}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.08832852532934997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017907725492439333}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5699592863851318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004714536643229723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.14300003134712513, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00221162536158628}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.040843244255482615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010916245640850226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29220518033840015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00429630894545052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.0664535256463577, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001390545014519469}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.0752494824229621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013331527745553307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.5158723539376079, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004556028773885754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.12321433839865718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016493743934460554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.07693161532405232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001600850652295148}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5035674346293266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004594034425007832}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.1245303464212252, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020005666890666714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..acf1dfb4a962c4d03f94189332c7a59b601ed2a1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.6393485578889786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05502650807288692}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.09384501026064822, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020410705936949164}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5594718362988982, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004835901700325949}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.14778204636533104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021535294935283475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.04531974070359654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012592530156795225}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2943813172867371, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004307507870651324}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.07150806038871549, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013846757650982076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.07902797606364308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015876424977483495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.49738396026840304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004670214814101292}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.12572390453967686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016646524639872145}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.08205420153622084, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017980621481149767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.49588185251631867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004634355511167163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.12939899168462599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019528328894905256}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..304fd0ebefb352da34d5bbcc5bfd66db8351b914 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.7027172131809374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05464254396711413}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.10010463739366897, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002451958960487023}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5497072816752133, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005022860494341607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.15265161943963146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023054471134340703}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.04962506210717931, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001556539762450491}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29783112596150063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044507361953258365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.0756101747685517, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015266803509634318}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.08430872709932842, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020229841753327373}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4865738026311325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004864018248102004}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.12964486472277573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018553204638499313}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.08814511726089463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021838335215906522}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4895059624812954, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0047832687395796825}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.13454630567628098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021043161354972286}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..383585b835936c03153ea1c67321ef5de18a9fe0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.6824798233982887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04389086131276499}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.10174735745648503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024884464079584444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5382688832041609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004925292282078669}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.15505433986152437, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002379813501347791}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.05132100913201794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016726572160033631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2917616762862299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004373173732071418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.07734137021121697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001591701564073588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.08461663803530442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020159579534064975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4721845695274402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004823954126413361}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.1300997256412491, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001870157247818775}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.08987573937521823, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002214880051358518}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4806380167892775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004741206948484179}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.13698687887451672, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002131522469829751}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6596e8dd78639c27bb67a0b0598de3e638ede5fc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.02706027614634666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007199959223496352}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.21871619349540652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003017122387430669}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.046262195996473646, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010671021290240828}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.005489404852576963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00037168133237378876}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.04039720237090506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018373898311850336}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.0091292497595853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005586233596109211}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.02627865667490291, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006333002013249642}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.21539356134726143, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028718466577210507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.04506442218172945, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000951352027550179}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.023947674613616042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006311347984113742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.19592189387116501, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002541067542265314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.040980292597500946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009311721807975657}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.15649947819833196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0164476915271142}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..693d398691f2abc89e79e3e28d68ad46dc238eab --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.13566963010336247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001901900628554032}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6825350408773228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003845047297379057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.21539745536997254, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023973358203774717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.05837651126421021, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011430214532306153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.32049306904101077, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003907206521752479}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.09345632716516253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001519666475238801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.10694697019167072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014098708843255005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5679246988503801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003975927481974845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1711923400170551, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017590691181499992}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.11553086397885941, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001655229291094829}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5900036748532257, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036908035987317585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.1836183182793589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021016203672280464}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.6599294185467146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06211968556668088}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..381234054a1ceb86250378b2fc60c886d83121ca --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1287133172816397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016808581165882275}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.700127439476741, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037713353423981066}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2076581040011104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022583455874752344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.057317013209503706, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000967554778182446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3482915132800189, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00417921721931668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.09362913479276391, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014080197389295243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.10016020267379452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011971639013113444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5771110494449467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004193435707875633}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.16305870413789514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016446312009551617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.11082374580358195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001491597958210396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6106710174543206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003704970101429629}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.17891433463701623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002014498326601367}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.016001208228411, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06460792855397608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d4b351621358cfacda5950b8078ecf445c7658cf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.11266596272845417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014964276623119837}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6895629319342107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039035968599018256}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.18531216357578198, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020715765879598448}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.0490979915579544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008487279552315122}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.33860846873242856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004229418238586871}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.08163089196343765, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012508375480738728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.08742234681738852, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010302549899947866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.573502299945498, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004398266028999487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1452630444217927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001454000064654399}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.09661481768181748, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013130442610157457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6010245565717771, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003825525802603016}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.15908058820614102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001825274022822222}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.905820614761778, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05980040525962388}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f13745b5e02764590b421883ae79d6f7442bf303 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.11665256703134419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001621644318040768}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6784675206475869, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004050953554566147}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.18934839167613163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00214187386551334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.05132787287975547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000890879748268458}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3387961329017755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004335693857367359}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.0843786630967621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00127055713033487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.08975734228271094, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011007534157180517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5600296137109959, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004483398306149866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.14739292124783474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015062281210773386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.10024251913081043, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014111520702234548}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5919031167415147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038677271600221276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.16288936969969903, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001883263572481922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.9502927707808198, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.052020548700027196}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..14be6a9dc198637bd9d82f928d6d00bf17420ebb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.13927878202099392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020669175591838764}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.685945936711341, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039463810264872105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.21838667940415663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002647140626260128}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.06409430762337304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011913214183825302}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.35060843893599064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004265349732243715}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.10182499744322755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001652327275662303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.10609511200918728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014594305881602467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5591004118426898, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004296337590666952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.16835209567162987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019166110953087441}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.11992638153693579, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018004459291189343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6004050096251264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038569357942678053}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.18842254040460846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002322513232328268}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.226426427718422, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07967413364321198}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..42a0b8863ba9b88bcb154f3f1d95c9ac6f87b2dd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.022502146837926656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00039190309284914334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.19344726390485775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00254983973899585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.03898895789773015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006319532341075637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.0023435938414272757, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010979866573243933}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.02355849492139947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010345098847200228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.004145988607839967, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001880250546353047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.021854072209224234, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0003652930614699315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.18931671006162032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024402267670285537}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.037891534322081845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005877375532899554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.019010396571724965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00031345492195533365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.16545014623836463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002138025338260489}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.03294863446012075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005052256590344441}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.06125988659412747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.006732229818369253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3d759698106962ece7450cb38c21f6e8da9b0c9c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.05975795472897865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013335748577968784}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.42130399574332783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.006021456162891838}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.09987693823507093, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019433674289780272}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.01988607904073932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007117135743174414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.1562830460937769, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004166853394878266}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.03326346351515616, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010503739288322067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.05146250597168572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010354352926742175}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.3777395630694112, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005232167400098075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.08653005164164256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015090364266964335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.05159147675400835, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011924678247232138}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.3715339371365095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005597099196018005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.08634306012268002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017368588952515506}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.6395614972095213, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03517585023209597}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..30dddf6aba32a8fea5a4d433731b333b93a006fe --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.0810153812005263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012511509455719108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5539166378964278, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047292784942071205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.13518295850423986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001797688945586576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.030941146425207795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006330818483411955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.24102157803692914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0040746001964615045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.05216469490783778, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000960104338852003}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.06909862720510897, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009250350731313912}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4944830893295965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0043942354024226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.11606876587008064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013292695114441179}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.07020253091326639, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011333955676522578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4865962462473607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044821465897910496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.11717633383150924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001626596491079524}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.121353825084264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037725280779123294}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bea1b7ae4e75d00d50e3bdd2bbe4c6926183e8ae --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08857774552753468, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013096992843302045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6136927754214567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004518995907883919}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.14844541358150928, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001876503606273822}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.036937522767553584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007034293513284538}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2927059910268213, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004314941715310091}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.06259164725081237, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010634444975241323}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.07355997157054911, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009262242992983204}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5376590137054139, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004237557928355675}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.12422704982040522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013236189729225278}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.07743844850036462, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011897972880537623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5449619727275773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004481006887033573}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.12987847358588697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017109839553052014}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.4361174109813004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04805971189618587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ce85b50a3645f0305c354ffbe72dd2e649d67283 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.1061802370663775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001463735554331979}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6843250436064637, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003920444970220284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.17657657718723166, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020678023491144965}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.047497391617128334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000803679569171621}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.34713674890385776, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004140778426080258}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.07984816777629551, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011943148273671116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.08171760415769097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009627992963593435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5631982903026473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003904405962830157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.13715386462196175, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013567926683826159}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.09467046528011347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013521830799601206}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6186687061302965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004013369252904057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.15756804588459467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019178471738246978}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.7466592730628139, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06692844403557563}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b531a83394f81e0841880fd5353b60bc7e0e598 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.1107893274792466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015002868666159005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.682687883229354, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003949108407059522}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.18316360410296145, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021020976278752416}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.05017272336825218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008414952830314042}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3508191888121574, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00424171066823648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.08392662509619025, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012407470721852972}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.08485730190536447, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009914986606501136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5574721443850629, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003989848256957862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.14161914296738579, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013917592431469871}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.09855924666321038, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013739812469034743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6138525490419964, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004009808040489546}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.16308062545421093, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019353462436310985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.8970740397337433, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06599212740506186}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..936713fb2677bcc039b09df877f2a3f791ea11a6 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17950975675556238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001914110077898584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.3173563571239822, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026609188157218433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.21366320551854953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018302570738398985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03857602767598888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008009614211947492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07079824622444279, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015441980491765989}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.046175339585206684, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009130716302254005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12327264643230568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011861060865238978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2279772883295184, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002052566227453621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1487525060765686, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011699273254385182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16610983563072396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017654810205492105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.29490158184592447, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024968605272683683}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19795971817373118, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016914247514331865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.055966635720875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08686542114749111}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1664ebf9fb4424042cc18ce559b258f642bab8f9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1645016598415173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020439988941701897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2781090142745825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00283011336228724}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1917572377223911, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001964109694244107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03451158809220734, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000862899520574188}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06057288990931606, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015023541556298495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.040168134051973815, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0008924396157130113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12025147568699371, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013845922051473774}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21167965788899928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022235680991272095}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1418918052948833, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001334398810370599}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.15192216506378955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018955299167400376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2574177460768254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002636255512192128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.17718406703037073, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018199571065040082}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.8472449992080195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05972184050829182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, 
"batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..68217df6de5655ca5c4e76933f430cb284462e84 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17461621417538908, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002031440517917525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2964362262793112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002726714099481616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.20405321174077107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019056600496191807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03893762449906625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008525122868332632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06838081813859713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015611393255752124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04567156337896173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000932894552583276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12669938791067256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", 
"prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013586831428520094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2241091285172054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002190856032138392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14991267136518538, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012964821918338744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16135042684996723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018812047542648442}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2751530306927472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002569597423808091}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.188778720271424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017684778684466625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.1250913371882443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08082994225847336}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.json new 
file mode 100644 index 0000000000000000000000000000000000000000..377de3305f6c5b0f4f66f7dc87bfcbefcc035fad --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.15230744068697982, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022959707383516507}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.24820187689883588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032867277020760042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.17160204720456523, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022197524833034343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03434424273820165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009243231784645324}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.058823449262171576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015444564017563487}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.03899411935423918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009187136392638074}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.11092147540747452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016240093919999271}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.18694301859386991, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025880219875020465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.12583346122758107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001558801054784864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.14135092515979492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021286008140915364}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.23101489053303867, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003064522846659888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1593908947935989, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002057263273890102}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.198529629762632, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.065300185227075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e4222d3dd4febb0434d4af96deda08562c483cd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"article_summary_en", "rouge1_precision": 0.05083357669029093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019357918297942475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.08244342719669248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028813872714789964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.054987533485356926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001890886367853537}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.01117210874945532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005995255967837975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0202786766574289, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00114540984571034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.012617379158558721, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006313655807193013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.038366665167606055, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014542518656553282}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.06396990192776628, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeL_recall_stderr": 0.0022687929818678404}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.04157458484226149, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014009278175922768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.0469155306552092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017939164451876647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.07620716657990878, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002680492989639914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.0506618129825057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017421779361228911}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.520053083116806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.039623345914897065}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9dd10444109023e2787345fef4505367a060541c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.00861372538113473, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001009916194724285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.01235013958324705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001203646367559825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.008446934607939634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000836413601735138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.001811642633420617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00027281114466191735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0027137133448325683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003698010920981975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0019124752375349172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00025472485058213127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.006548532654933444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008249280853205021}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.009381116428004776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009128471950554741}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.00624239280071284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006013075457035342}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.00793806088766393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009498773132576785}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.011369439233893231, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001105032966952515}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.007713721590588463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007579891955786283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 5.517328369410883e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0386230327964954e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..23d073cb756b2d3d79e9d00502dedb39ca257ed8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.07582764050130288, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014190110762892887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1277933977634297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022684792145317057}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.08828349309152168, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015115974554693625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.009815067973854086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004155165586017958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.01808910255093881, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008644066184478445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.011691902088528949, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004869755165002716}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06629364537173689, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011431141940482985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.11462267518707464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020002236200532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07791270770462665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012475871998712364}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.0708523421701339, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013109444182449496}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.11963990635370186, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021050935160865825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.08254932336997271, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00139813861005186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.5075417133761574, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03895909978746295}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7f4e64f89c077bad15be7f2cd5d4d5382e582b99 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.12719095564056437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00177458933509712}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.21244487177301377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002669065247115643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.14743483455043024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018102937951181683}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.019336079923150524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006341515576063969}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.03414861395030515, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011732733204742757}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.022783938421878452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007214151081414727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.08932246378264151, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011470443498572222}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.15500596932030242, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001969158666967818}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1046400973633858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001187021079229482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.11886530728516936, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016395583632656235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.19926746759843972, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002492327836587634}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.13791245377427, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016717888556597065}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.2257876930057892, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05098211417674737}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5dbc43dd2d72033a8d4e017a17c04b19cceb1b4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.16106457000008795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002292064983729894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.2557520463315039, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002954728954094822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.18051522966442277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002061705179353923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03652753511175213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000971777928411179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05873911371100809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015215967074205675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.040384981669924526, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009583825110008918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.12029197082375237, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016567061948962215}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1961569067857406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00230187114523418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.13548549449383468, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014439794973907486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.14978318904580395, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021402940102766984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.23874291009412155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002780798318742396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.16793091858839554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019139229473260492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.2113811064047963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08314019884439218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1b92ba5fbb9ec7111aa9f33f89a133590bd81ab2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.1533759665863088, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025577920564815457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.2259942077891069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033053436745626315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.16139635448455195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002249446575446669}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03725286699889529, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012332919944079443}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05566813880650291, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016306556541296991}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03839113878587267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009971143418407548}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.11690535506105171, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019433797646747934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.17591155413008638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026478755663225566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.12314296805478218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016547329508161137}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.14302393847897105, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002412541062432112}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.21074872905474742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003105762840068013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.15022458367211336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020979925530018205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.4654367545222904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07024206603649888}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..99504e2b3c09cbc499597e903035600eeaa39c92 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.05028568853458266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00208707409282097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.07186995080563745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002683339801036182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.05011197616686206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018346039701914467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.01253202087368826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007848832756426618}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.017990065250321798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001031089985634214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012281079448761325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000653869447400363}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.039231023055013595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016071013078014019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.05718233871131803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021546633328687506}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.03907293229046315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013997352977999286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.04676947454126935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019385598696383512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.0667844061216085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025123745789632553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04654151233042153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017091787582805187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.4508825944565298, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03559978337994854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..124052bd1db6a8eaad1ac9508acd24447d3238b2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.007347042331656005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.000855877168700121}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.010778250545529063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011521636177528214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.0073426086769494935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007675421686666748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.001710980048638282, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002755932507822362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.002533909675938538, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003864838960639161}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0017130160893721097, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00024830222441035233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.005509333596115347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006321643878647678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.00833978450728592, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009079115070139992}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.005567943512045762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005784419503900142}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.006862990886278549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008064303620221591}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.009971153288718336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010652484370260787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.006800973372764665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007105353563633134}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 5.6409965833073724e-08, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.1870470936219164e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3bd4b8b007deb9df908cad10fc2ff9868cd5db7f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.15985049689784436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003621011460260559}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.13728275328074455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002052435724642858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.11036144862640505, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001530599636812561}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.027302086171180434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018954157009048764}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.016538074725194387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008215165441681659}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.013469176886654546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005757882354899623}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.14013802103333775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032342892555596208}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.12221875356209057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017527389452773581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.09678143768792878, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012407236257212446}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.15058787526521, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034756128206833727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.12914383726578268, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019154210955539695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.10348080739372295, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014111115580161292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.4796152263707372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04020140828753968}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..581c1b84cdc2fea4d35a8a0a72dd75a4d5f47758 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.14983169230821397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001875244981058499}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2495117966397059, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025670277663872557}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.17228713075884045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017266755452744172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.026135894449625118, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000877367958825746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.04414741214905309, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013248833452923223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.029442252351593513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above 
in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007924688244414296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10567365961028638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013254739675144156}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.18141239888145783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019407502811784976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12211374752033236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011442930963585947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.14040216161795577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017456836256513886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.23471174419778962, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002419985972720382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.16158045703046053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001603451683480107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.466848818979777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05122046143290242}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98c76dad70bada424198239fc3240250f2526573 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.18601116127457196, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002674455746095039}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2697340694293458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027730262294329102}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.19387864929813403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019373404999633163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04394534305996648, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014119764995516235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.05981581118058386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001480173209074023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.042503871652830684, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009697312508853417}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.14114910909948458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00212420941260633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.2082056428265491, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022135365734890456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.14644335466030722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013693635698585327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.17326036096446687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025172862394438143}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.2520550560532216, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002612078427237868}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.18052324198395442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018002340718791736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.1241997955779706, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07242902905742744}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b490ce7d12ede414ceb2c77c32a7b9ded8af8da5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.17904260819617532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034025414878592264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.22026524786669358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032630989926899674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.16464113343789846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002337449087567628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04756210696143349, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019178497340389886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.05249440732325283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001516559702111135}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03902103124926395, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010593881502234943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.13877691598479244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002822308452152884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.17071662509195945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025830517349758665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1254589966504649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017313167474427463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.16705087756637185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003249700349640652}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.20445349405368257, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003053057869293393}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.15262792584886165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002174966680287157}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.452893619605998, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07535908896046421}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2131b458b7bfd6bc047b0e569829467896acdb0c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.05505736228678805, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024837468725238195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.06470336830124988, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002537880582413385}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.04778674517582264, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018365614759514285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.014779446264054693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012182175752722738}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.014759184580045196, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009302529142526923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.01084532274916111, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006463021300909807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.04401215074366241, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020765428405668932}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.05148347306396624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020437326538755756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.03728476607430857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001410339676510439}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.051354143409556074, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023530881508590775}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.05990804305583379, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023627429149512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.044201424272258295, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017014625957187778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.2501992622083412, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029972656605070418}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6160567eaad66dd056c8f14d24ccc674359805f7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.007051967760789057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.000875522796789484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.008622048628992848, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000979421104357186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006582381210133649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000736046358117549}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0017517411283309776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00042676011787971466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0017169194523458937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002723182367088392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0013869187743711499, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00023793902364453525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.005694743781016978, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007231881422995928}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.007034117275652774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008076270617686096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005255733664967675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005774560163471757}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.006656694900838437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008364652583978511}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.008085611227571524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009219645931116729}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.0061682356318462596, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006919242043320082}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 5.897323116804045e-10, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.7226783506844502e-09}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eede44279efc103c47dcfd552c123d6a3016b76e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1498248405216555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018423908624139891}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25526647462429297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002588155452951415}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1751592790344999, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017874794849447823}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02866254588676867, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007271444111163778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05162759513352221, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014037908842865225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.033925070200158246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008240877531204439}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11555678082540916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012586001477032337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20559491764149926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002138172468666332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13711284339871613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001264815697203119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.13771072102711007, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016874302363856556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23587046561796698, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024016834853047104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16124043802649227, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016384576957041044}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.484894171357125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04279862989387049}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a78b78eb22c7565f2b7a5e1e733becb05c7f20de --- /dev/null +++ 
b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19820460200268925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002420333361877177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3126809375051542, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002841672003398339}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2199694812178742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002010819735875878}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05016728772414271, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001264223371544086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07815311881303431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017034183791022172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.053937307211284244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001075193073237261}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13997435500321276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017712606376717578}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.22624387436643148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022018166295221108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 
0.15551270440786846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013566776773220371}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18565508806059028, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002281622879542436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.29360401456830726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026791632190064567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20618836045677164, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018846829640657573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.6577106650236018, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07159113573701131}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..30d21f8cc2a99d0055a952d3264081b3aa99a275 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2156784752655553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002732677109431224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.31237685950514665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027963244796416183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22435918885551207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018967794998520616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05738474692529928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015267876059967937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07970531997370003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016700905255101184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05625290668830642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010633219356063186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.15451467159603913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002139183021303883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.225376744790094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021771714204925977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15917844337997755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013165721234956872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.20314308377906012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.002612094861988029}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.29423268640214134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002642162937712342}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2111183297537402, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017924385498513864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.8678009037418817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037508767497823454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9053f578fbc189dd858b30bb91c8bf030055fe7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.18405392984121038, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029940021228191355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25514944671406053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033459375347498267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18429548803283216, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002297153333126352}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04819840246533481, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015092856540081493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06631560044043883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016628335172097389}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04688317854067561, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001094346737838606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13375051095662321, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002380892659084607}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1853146721328658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025854840001956502}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13173075629033756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016475788902520988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1733338698061136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028479993519003807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2401891556702282, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003169734302090452}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1733778342320142, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR 
in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021681294491408274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.9935371246792863, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08389052572374653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9352ce510b18a186df4ea88d01cb590b178498fa --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.056788240211910736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022506997974891542}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08339625174295712, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029290040087097207}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05659885761358278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001930223829324733}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.014380667558554284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000977357379524512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02101316087357165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011619283931715888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rouge2_fmeasure": 0.013810868807903593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006930003786904605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04257610439946902, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017464834049321358}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06302882516092573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022681193452949904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.042007409754473535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014258890197756744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05318358596123625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021166691321526723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07838773551352705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002768443855799422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05299986098950862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018068900698462816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.5461443377554994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.044799916123802616}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1bb5272f196cb8f7b858d2d79bb7643d9ddfc20 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.008911405879685974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009422475593785767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.014063917251247313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001396048114694196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009231042835921754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008787834704319035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0023383395560896076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003211681514690954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.004206929070053043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006053899081506379}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0024939038248004536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00031409966961336354}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.006838517996278174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007309899843193323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.010882093390231358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011052580830821516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007056657402327181, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006761968053255695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.008357520892672053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008835066902880334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.013172143042073951, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013147147386398047}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008623638497104687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008181729546128778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.0112557087204399e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.4150938941127893e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b51330bd08c0958a51e28b4ff31228642afcbff1 --- /dev/null +++ 
b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.05237091241441722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009605690159925948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.08326091502282784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001307194264865998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.05960995824024046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009498209060065967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0011869125521613468, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00014357479408292278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0016283614649422807, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00016925819879762434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0012597496537706083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00013199105164764973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.05023321535529295, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008739713332714877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.08083866411125382, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012488146734351867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.05747907925182631, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008796408111632294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.04836041522625723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008749326026700274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.07770833418809467, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012165480160224367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.05521751020238309, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008638920844959852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.07299605737537797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0185742385092221}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ba0dfda2d19a0e40a226a90becfc33790377c216 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.1343039642172445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016732214768695325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.21562840393454535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023741026821102597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1528764985075287, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016363848989828594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.018046982061484945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005964805823451741}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.030411699104941473, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011304776755259463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.02069013916427929, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006612995194022327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09295669984502772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010567113830758765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.15447310700799974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017294150710180575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10660240871433446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010219647383725968}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.12687184785116162, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015609172625269986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.20412846443023683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022261282748651857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.1444397478005342, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015191037848864151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.1223009951089067, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05836654112265766}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a36519607b7ba82b40f46df7000d844321db297f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.12697080832224822, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018201089851012445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.20109011349921127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025569830771983836}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.14316491558968603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017639356738778811}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.018246718615969375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006418388179326544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.029736909566824603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011083029361886995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.020465708060419606, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000671449947746698}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09139918661616583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012236115566095836}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.14881163413712634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018937511898562988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10367762364897098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011709222919166911}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11904750892035158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017004264999703578}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1886874000288314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023745592779610444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.1341766534995056, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016336963572172586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.1180377312448153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06708589951037792}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ea8ed005e25d2eff3e7be4f5e1a42c7052c3abc7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.09887369502675063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002043227921916099}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.14678993430035586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028179673635622318}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10634798502269208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019791669918603184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01567989332792064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006694052515306127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.024427777089035076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010678311796068322}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.01711652805678942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006882433105651278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.07436580031645514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001457445017450415}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.11271151612770153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021453041661070715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.07999450335755226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013938218605260453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09222600214130718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018861337709654303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.13747754955861127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002634806979634438}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.09936266385012239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018384025373419349}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2836466923526264, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07070516293414274}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..95676fbd81ba1d7efd427127df3c8953a9b21717 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.025095421938052895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012461801615902318}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.039170985603147426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0019196761418163904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.02713454046170108, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012975310299864612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0040053841270205085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003368144516166286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.006847830102416033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005830833698761628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0045440484711843825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003623167516261232}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.01906700487359207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009287450742508868}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.029866238646993826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014553209547808461}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.020373823655558135, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009374343240357042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.023282946639617354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001159003398883688}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.036183810616452226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017667839555253213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.02507998631189558, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011959812138736497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.0870839562978014, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.006785360071011486}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..06c1328ba1bcc15733f190ef1a839583b9cce4fb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.0021405897631643946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003541239789070288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.0038253034844642347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005823536138290148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.00249534987383561, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0003873473209623069}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0003169846199202593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 8.619486969926879e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0005250548080533436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00015216407757586244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.00036639163657650276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010049770707316975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.001603458706659755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0002602089443308959}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.002861894235467422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004267622874298756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0018743787593694622, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0002863935366559193}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.001980619561518416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0003290309068301346}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.0035607312643202195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005431230446018559}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0023091312121054073, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003592857430542886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 8.093999898733972e-15, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0779878610297242e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e06f65160fef71f13a2209dcf967de6916e744c4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732958}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2a7ff3f30d3a3ed1a7a6d4a6918dfe43114907ce --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.324, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01480686473373886}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ace2a720f0a18701fe01f901b766e1ccf43deda0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.352, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01511040450564866}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.361, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015195720118175117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..520de17462cdd79f758999177dc49efb18b77d19 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.343, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356951}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.361, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015195720118175113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cd4543489663a3a87e7a42b9ce1a1cf501d6a791 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.338, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224472}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.338, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224475}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4b90e38fd4cbb0f92a562cac6a3f56163bfac379 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.348, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01507060460376841}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..75ca9d21cbcbc397d89ad477a21cf83986f3ab93 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653598}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.331, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01488827258820394}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..74bff1e774095143b27918cbf2f10c9e18ec31d3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811485}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4a7d51d15f00221b1c9cfe0d47f516dc94b866d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653595}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.329, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0668cdea5a155ca2d650e841d8bc2205e9cebf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.352, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015110404505648658}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.35, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444233}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..097ac121144f9d0e75c0b067cd47a645872afb66 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.341, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402707}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.353, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01512017260548369}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9998ada5c234d50cfc88544bc085a27c5e36b246 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.356, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01514904265930662}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.351, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60dbfc0faddc647a6ef2ff11e2a0db7c9fc8a110 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932573}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620347}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3953298acfb86c40ba87f3c179a35648597af11a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bb22f77fe32e14e30062b9b3cd61b919f883b9b7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055235}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121731}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c72daccb187b79a2f8f716eab4f19dbc8d0cdec5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.359, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015177264224798597}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.349, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0150806639915631}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..125aba7c48250ba430129752c332b81495e4bbab --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01495508791865359}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c40067e69ff15eea560887f9729720242553902a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0149550879186536}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0ee9b93a4f297fa9d68f134673258f52c0adacb8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.349, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563098}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd0eb7f18b304e0adeec0616aff06b94e5a858b6 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d95079f7e3f9353a0e9ebcf6cc1dbcd36846af9f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456732}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.345, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9142fd4025d076652ae815312a30649c11e804c5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928357}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..db488be3dfa84c6a135ff412aba9d60349469b3d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203943}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928359}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..caa19bf440569be5f79058ddb6502bb215458257 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229857}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..425dd40124dc7b5c4e027e313764e089c2a24648 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620347}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928359}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1aaae7b1cba863e33dbcda9f24d4b33e910c0e5d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811485}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3dba12e5d605e3c7bf53c1fbb7f2487c5ba44f8b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.356, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306625}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..77e9498e0cc103f33d1a76b9938891d746090bd3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.35, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015090650341444236}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd463aae549779c3eb9c889bffdff7214c253aa --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270334}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.344, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..915a04690cccd8287043a1c699242bf611d0b5ec --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411237}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01495508791865359}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..807fe81dac80e2249e8c35cb1c948aaeb8765a30 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.35, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444233}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b84c1d079242701144f45c2c330f25e507e2c54 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087971}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.321, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934645}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f346ee498dd8f5e95cff66aad13b902ddf8861af --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c23ee31bfcb1a043b0028f74c5e60d9823b1e3ef --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.337, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149550879186536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c53bec3e042d0de619413c18ded02ca84fc13dd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.311, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014645596385722692}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.304, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014553205687950453}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c7e4450e45b2e1b9f0f69c3f54ae35a8b5401aed --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.313, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977885}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.319, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..260683b47973211c18f48ba99aad8f047b1eaea3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.324, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7058ed6fa4df8ca5f953603bf010bebe4e668ebf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.319, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473479}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.316, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01470919305605713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d029f2e017b09db04f5e3f330cce3a47c8d4166e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.339, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620335}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.331, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203926}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..52d92811978065002f9a906887d08fe2db9d7f2d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.314, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087973}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.313, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a8333c6972e4a9fe607b535d8d28b138afeb0b90 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.303, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014539683710535265}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014632638658632903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2c37f245d30d1bf49ac26f45ba6a31c5cdef2a15 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014566646394664397}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0146966319607925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3799fb7377ae64da86766385614f7470984113b9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01489959724281148}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b246ad486e5d692c6460eb7128ab7908a691a7c7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3220ac42ae8976bd1653c41f29d5036a96ec0d7c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348628}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..114a3aff01f42ffc6bb1815ed11c783384a8c5a7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014770821817934654}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473479}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9177c89bad992e10d2766cbc74f46f0ca568ab68 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811492}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.31, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014632638658632896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..45cab10ff5c9b911f00ffe829cff211122ad7924 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541035}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a652bbd4aecc735effa22cd1950add76a00ad4d8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.311, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014645596385722692}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732972}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bc093f5348b16286ff66d42994d720068b57cf6d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e5a5737d5379eeb657cce8a9aeff5cbae06e3ace --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932575}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0812d7ebaba3a99679d3632bc924de4e2d7fa92e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811485}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..34be37d3812fe937154082f7f9fdf1edae928cde --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229868}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149550879186536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f82e7dc23979f175dcecbe770a4831be2d377280 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01491084616422987}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36fd017a8d9e5c7df82f045dbd3fd162ff280859 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229859}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3384fbe22aecbf6c5cb770bde518ae7d5253c71f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574888}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880215}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..445010bb80b554aadd66cd189e47ddbc5e54f340 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996686}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e7b93dd6acc12d19c20202d3e21d6396544219a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996686}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c452dcc56282e97501afde7be70a442230bb545 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203928}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.31, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014632638658632896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0fac041839af0530739c0ed7b67f6d13450ed9b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473475}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01478291360099667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36345819209f60f9bfdaa46b26aace0408aaf1ff --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013622434813136774}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3333333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013613950010225598}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9da6418dba61a07eb02703ca615645eca6ed5603 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3275, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251947}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ac999f7625021013a0937dde1fb0f1861ba2b449 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013579531277800917}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d9ed9488bd4e82a737ac9c6cbbe4503843ae6f99 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821474}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3475, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013751753243291854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..084f6ebabbdd60a93d829feed2dc3e7682742e14 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.31666666666666665, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013434078660827388}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013471620929769139}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5d7a66a009aa42fe66d4f23ea8993bbed1d987b0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070708992}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e81685fac1d2a03a98ac037bed60bc7e40b9fc7f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01362243481313677}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881645}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ddd2197de0f328f11535cb3da8e93c6c9a4ac72c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225606}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d5d3263ded1642b41f9fd5598394af6af3b46c17 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31583333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01342456883035645}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013296358936471103}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3a5cd1b9e4dd044f75ecaa8a61dd51fbb46b9c9f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225605}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7baa5018bd7d24af8e86a39a713f3a396db1eeac --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31666666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013434078660827384}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3175, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013443538681348056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b0fc3d3d2992a894ba1855ae437644a3eac1a3a4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013415009084004868}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851548}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cfde617488bd770dde56e8bdf874dedd0d7ad8fe --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003665}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d42d4b78ce8517006577647719f85d11410990bd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3383333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013664144006618265}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..097c272c83d4231fb78f5dbefde68293ac2a6766 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013376268790982096}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3125, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013386029277441229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..986ee67ce25dad1f490eb20b9f755de766c0645d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225605}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013490095282989521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ebda297d9b73df1f6b4dbb7f1eea4c0dec8dee1e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31916666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013462309712005136}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4492cf9104f737ba09586c36f89991182d27487e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013499258621103245}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013471620929769135}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6b7288127303a4038a3dbb7221bca2de54dd910e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31583333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013424568830356452}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..515e280ab5dc749c87ec759e355bd548fa30e9b4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2390e9892516b3e1f6e94db0038df6cec76734 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33166666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01359683672948517}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6abfa0fe108aa1dc99f2b00b3e9837b30f5b76 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013570806258433633}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0fc19f5ce1765b6862463d12b2811f490f8e0a76 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3425, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013704669762934727}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01364760294240639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d452017d8976dddee44970d5d8e1fed42c9b5b00 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013622434813136774}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6082701203de05e26e2d3877a5a411ea4263ab48 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932882}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bc48bd6a4d6871e128e4f20663d1ca2c62a2227f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013622434813136774}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..15c0f2ae15fc983bcae5f63a61c0da63a06679a0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32166666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013490095282989521}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.31833333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013452948996996303}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe21843ea5711c29df8dd1eae5d618c2481cabc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618268}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103244}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..655735e15b59fbc73282c88c12ce3d09e35c3775 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251947}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013605417345710528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ec4b5df882fdef1fa3e60264e0b38399047c78b0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013452948996996296}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01357080625843362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6bae0eeb3a39534b1fd5079ed4dc70b4626a0a6b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012352507042617405}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012352507042617405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..991a1eeebce9577b6e4e08d697608e0fe4e47426 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948856}, 
{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f5aedba535d865e9ebb4cad2e6e7bea2b499c66 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22013651877133106, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012108124883460978}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22013651877133106, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012108124883460978}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1c57258e854c124853c90d34568a2f27e28e895c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22525597269624573, "fixed_answer_choice_list": 
null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012207839995407305}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012207839995407305}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6363193a7b932e1edb96a827e8c88a9d34f35cf8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012624912868089765}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012624912868089765}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3e8cc783f2a7ed2b6879c5f8fc850ed00c269bb1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948854}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3aedec2f7485ab576c1a3d443d3ac085a0a019bc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012928933196496344}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2901023890784983, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.01326157367752078}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ab45db2b3cdc5c68572cdd722475b5146f25a44d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2790102389078498, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013106784883601346}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2858361774744027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013203196088537367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed1888d76affed42a70faae8f75630feb7c0c9bc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25426621160409557, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01272499994515773}, {"task_name": "arc_easy", 
"prompt_name": "i_am_hesitating", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3e7299146ab7ed7a3e897a930418708e58943a39 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01266819862131543}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b972cf418b61e9416e77cad50917b88a7b0a6db4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 
0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01279455375428868}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012980954547659556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..764fcafebf60eab5bdb24a07d526f272b2fe9ad9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926433}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2815699658703072, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013143376735009019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0926099505f3873a0cf9ff0bdc2acc5abd5c1fe0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012445770028026208}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013094469919538793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6de4d7b208e4fd476633522f9ca5ee2cb62c7a5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23890784982935154, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012461071376316614}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct 
among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012780770562768409}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de2e0e2c69d3852d59ec1a11634fd4b47a292acd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587094}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.25341296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012710896778378606}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9ab32402dffce80efc635adcac5cdb50e7338a35 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24658703071672355, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one 
answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012595726268790127}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012696728980207706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c899f0b477a43cdc3228711a55b352166166b82 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24658703071672355, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012595726268790127}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27559726962457337, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01305716965576184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_5.json 
b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aa5ba282de869f3e342ed6affa8101c891688de4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012506564839739432}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012780770562768405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ffda4c9c4a8486b1abfc9afe71e5c7fbfab2b07d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012288926760890797}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.012288926760890797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ed0aea098019d033a50999d5f2e1781e4c0f45e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22440273037542663, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01219140493860383}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22440273037542663, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01219140493860383}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..058894c1108899eb7cdf3b3667010814916cdb8e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.21928327645051193, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012091245787615728}, {"task_name": 
"arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.21928327645051193, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012091245787615728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c511cfae5b988e600019cdf83cc747e81cd7e5a7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.21928327645051193, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012091245787615739}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.21928327645051193, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012091245787615739}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9bd0e0716b4b3c1ddebf782d319926db594ce3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004748}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012399451855004748}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c22f342f03f70585991f03ba98108a6aa25e57a2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012368225378507148}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012368225378507148}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8e36ed35ff895d8f3611de2c0a7d30ab5046c1c5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313969}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2815699658703072, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01314337673500901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f682cb98a3b6285541c1d679fddfe230f6042ce3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01291577478152322}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2960750853242321, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013340916085246263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..396927674e9823b606d2e97f8118af85d12fec27 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012942030195136416}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1be93eedc770a3c637714f16b92342dcc6b00aa3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012821930225112552}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.27559726962457337, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01305716965576184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a866a7b15d934c31cd2dc0aa91569c3f76addb73 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.27303754266211605, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013019332762635732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a69c339e799185ae0310abac3870cf41347a91d4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042967}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012942030195136414}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8dc641b37608b1307e8201add1b2ada6876aae86 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23274410774410775, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0086711691205793}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23274410774410775, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0086711691205793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..86995fc2e1ff3fe476391b4dd981a8f4adba62bc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23274410774410775, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008671169120579301}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23274410774410775, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": 
"d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008671169120579301}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a8364c3edfc22b9b9e29be79cf4f69b15bae000a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008757032594354026}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008757032594354026}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2354e03ef0ff4d15ebb89e46be7111f7f123f2f9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for 
letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008809171744720559}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008809171744720559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4fcb0c4ae3bda5dc89d5c4aa9a40fd5f7a1075dd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23863636363636365, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008746465140706126}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23863636363636365, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008746465140706126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a2271b48d66b5bd7da244b096ec76b9ef1d605ad --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_heres_a_problem_5.json @@ 
-0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23737373737373738, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008730525906362441}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23737373737373738, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008730525906362441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..677bce8e7f15227db3a904c7162d94554055712a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.35563973063973064, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009822854395535487}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.31186868686868685, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009505823345817654}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d52f14d9673b08992f3677c0f7abf7ce173bdd29 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3442760942760943, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009749495321590819}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.31397306397306396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00952324533521551}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d1eaa41b2935060090fec88addea20f115eb963c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.33207070707070707, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009663817543072694}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3085016835016835, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00947747234297813}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39028a07d5c53e46aea3c7cabfaeb277349eb62e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3345959595959596, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009682137724327905}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30765993265993263, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00947029257583118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2ff0f7611d05c8450a02e9faff9fd1e8910e9336 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3400673400673401, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": 
true, "comment": "", "acc_stderr": 0.009720765494805281}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.32365319865319864, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009600478182273787}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0bc4ac2a2d7661e6fb8216c5d38ee1ce50312080 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.335016835016835, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00968516076593236}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3181818181818182, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009557408782506374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6e99b000b817fbaae4711bfc88d3b3f1d10c86a0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "multiple_choice", "acc": 0.2958754208754209, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009365854134140067}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.28114478114478114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009224735470287007}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b08f7056094a89d487d30e0c0bee8e7600c5fa7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.30303030303030304, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009430140669278959}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.31565656565656564, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009537019245566084}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba3ff2ab8aa65d5f08f36b19144b783bf72840d1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3181818181818182, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009557408782506374}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.32954545454545453, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009645184190953861}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..89df4233dc05ede7025efe0c81a3392fd73511ca --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3202861952861953, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00957415266873942}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 
0.3333333333333333, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009673016668133383}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab1131a67aedba1a8cdba058e49c0d80f70ef01 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.30134680134680136, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009415259879351623}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.31607744107744107, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009540440071928289}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..50b35a80b6db40abe372c4e3e16483acfe77e156 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.29419191919191917, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009350328648861737}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.30597643097643096, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00945582203642662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e49b03525108a24002c12cdb0fd28721e12cb076 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24242424242424243, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00879365151648508}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24242424242424243, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00879365151648508}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4b1c2b3942194937d214aa12d7159aec4d03eaa6 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2478956228956229, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008860162361464027}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2478956228956229, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008860162361464027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5c023cfc7a38f094352231c9214d3a932ee9185e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008757032594354026}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} 
|||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008757032594354026}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fafb6d19e6737be30705112f27eac64846bf742f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24494949494949494, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00882458861121908}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24494949494949494, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00882458861121908}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fc133879ac3f4bdf1a0c8fb43741438061b01b5b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.008900141191221641}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008900141191221641}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7909bfc3edd862ca90caabe3a266f07b194047dd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008885233166386385}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885233166386385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..70d5b4dada96cc91ce47a0510bb3e6960050dce8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.35269360269360267, "fixed_answer_choice_list": null, "dataset_path": 
"ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009804420599378657}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.31986531986531985, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00957082182057359}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..962f1ed9e5cdc7a850cf0c4a8baf71dee042fa3a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3400673400673401, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009720765494805283}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3063973063973064, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00945945357339833}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..38a1ada1b9a168e4674ae6c32f571b4dd11de80e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3409090909090909, "fixed_answer_choice_list": null, "dataset_path": 
"ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00972657959342402}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3122895622895623, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009509325983631458}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..91c73de88a0bb5b66bc6b1651665f04d94703d87 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.32575757575757575, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009616642976885971}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30513468013468015, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00944853109416391}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a89893887933f216e06ba70788cfebfc7295bf9b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3287037037037037, "fixed_answer_choice_list": null, "dataset_path": 
"ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00963890316702217}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3042929292929293, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009441202922359185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6cb04a5b68cecab19800fff72b4c0c30f0b4c90 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3282828282828283, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009635749509262166}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3068181818181818, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009463075835198943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5c157c38ad54e8ae17ad68adf3d62e57627d29fb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5496666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": 
"super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009085074954912698}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.625, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008840308272346428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..56d92bf85be10b8614c7594985b98105437a8115 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.56, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00906425508467605}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6123333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008896822947561608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5708c3e354020c24ee5247fc4490ddd5d02cf228 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.58, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, 
"prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009012606487132143}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.623, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008849657553427542}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..49bf5505ee8aff7f94cfd4d0318f491657d52555 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5823333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009005596833757831}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.621, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0088588464102222}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f0df468755cb8b825432fe1440a8300dd0d37f59 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.587, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", 
"prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008990955404907169}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.624, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008845002997512763}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..54160d99bfd557b9a4392d6db2d9ec81fb0d37b7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5886666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008985524690229495}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6206666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00886036232472252}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d1ebabea71e0f09206773787509cedf2ec25355 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6233333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": 
"Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884811049411477}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5423333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009097447488896784}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..be565dd4acfcac8699d43e8e369ac07ce0412391 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5856666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008995223478188034}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5773333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009020364414843638}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..caf54292fd2b204a2f86b104c72cdf6a63120936 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6053333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008925330066832188}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6023333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008936959925716907}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..60bd0153ca19de78aa55ca3404bbe0d0c8848f5c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6026666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008935685051576502}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5963333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00895916952266258}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..53508353068cbc4a97b770c2c3b475b231e01297 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6043333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008929245712536294}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5946666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008965091467970754}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..582c1f4ece4d9040c0ced152787067e5a250717a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.604, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008930542249025198}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.596, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008960362494453694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8f928489b2718e823fddd2a0168b955bed34c649 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.603, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00893440584870012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d613021cb1521b9e9eb9f1e23848707590996d49 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5576666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009069303681923062}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5476666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009088646624339615}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..043ad417b55ba00efb6b9cb2dae61ef26e67e686 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5663333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009049526374650797}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.554, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009076827433934433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bbbb559761e43dc4aad53519eb00506e3975d9f3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5706666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009038582451449425}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.546, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00909150987738652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..77036b0e614ee87f862e535074c579886f547043 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5726666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009033293159951224}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5613333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009061278956794627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e5649adc74ca97e6e0dc526013b1d791c713c5d4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5673333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00904706345689798}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.553, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009078792586293545}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3fda7c61cff34e444ca7fc21fb6f573639c21c3d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.611, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008902401412932075}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5006666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009130223008005275}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..45109817936b099ca177fd3847051249d9443efb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6203333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008861873799148995}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6233333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00884811049411477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53a65c4e37f4a3a5ef97f61ccdc1d9c8e340f143 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.623, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884965755342756}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6213333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008857326053368308}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..781ef798875b21f5227a7371bdae65b06f4a975c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6233333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884811049411477}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6226666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00885120015653439}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..34933e127fa950ec6bbafafcb70b223fdbd9fd56 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.621, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008858846410222197}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6196666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008864883436857793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bbd239596c09bf73bccf504f36ce396412416e81 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6223333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00885273830576469}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6196666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008864883436857793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a7d374290611083739c5979ec23596448647d95 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.606, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008922697920438163}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5d70aafe898b25875e4a3f8aaee70662d65d6794 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5746666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009027853030468712}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6173333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008875277637761275}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..db910c94c4a1f9009a2a7e69b52d1da8cc1a4b4c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5926666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008972056373066367}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6173333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008875277637761277}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2daccf661ba32b41bbf5717e11a74faa92f4dd3b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.595, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008963915658236387}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6123333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008896822947561611}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..889cb33f67dd1f8d4408bc4f48236860a8a50049 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.577, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009021315205815771}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6136666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00889117431069549}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c722c3c5ee8ae8c44b1b55065917be4588ddec56 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5723333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009034185176145654}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6086666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008911995272576807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b1231d33b7c3bc303db927c17783fc6857a95943 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c343cb7d528cdae79a9a1d4c59464935d6372f6b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.22169059011164274, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7c6a43ce272ce62cbfe8af189cb764ed46699e66 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0663363415035954}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2593837535014005, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4aa3499b7aaafea0069cb8dadc2f7cb3bc336e21 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.24839948783610755, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ce5d653626314a0e5f03caf89d944659fd815051 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2198067632850241, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f15d689463cdf8fbd6b328a0e9ab15acb5a1b2f5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2183052617835226, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7d3b1ec03f7e0784cfe1d9e0c61935e496d130e8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813057}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2850877192982456, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e1ca316e71062a43cea7cbfb9709600cb0efa324 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7b51dbf06b34318002d5e32b3f90260ab9325a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.375, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.25438596491228077, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4b8fdad637d779640b0571cddcdc8c076bbda84b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930824}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.20028724376550458, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7dfbff4fac447467472f764404a11814c3b966bd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2183052617835226, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..308220d10e639baa9b0c9194dbbb5ba56c12c23a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0663363415035954}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2641898864809082, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..11040310fb0c3b69fc9b865c6a239b0b409b7535 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2580185317177477, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..40a4e8a845cfcc9c99f12ee2b974adf9ccf07cee --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d224e265b5291adb48cb4594d4f63fa6957af100 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.27619047619047615, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d2efdecc6d01d5a70b0cde3cbba9af6507e9e5b4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.33283950617283947, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..610c8d88c526052b3704225cb024f6d4a57da8c1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3328395061728395, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..defb2aa446e169a0e70b983454220ac4e75b3ddb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.35176007116533237, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4c5add3ef03fc9226d072bf76ced0b978f58cc30 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.336846728151076, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ead8cd2a4e9e394f1639993d306ad74b4fcca060 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.27441920164292133, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63fb0528a39f7bbd0c4db66f9531e39c7c537194 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.16071428571428573, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04952230059306299}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.15573630249667678, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d43e170d02e8971112ea298c9478d54d6249a53a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.16071428571428573, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049522300593062986}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.14387464387464388, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca2d51853edf7e40bdf661f86d402cea2951e573 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.14285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04718416136255829}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.12557319223985888, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f4d9e8a074b36df5469291639addd2688b6126d3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.16071428571428573, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04952230059306299}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.14033189033189034, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..db8aab40d28c59300b0ae403c63b1ecb9c0b95fb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3076923076923077, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ca5ac2b6741a93f503f00f7d9be3e2ef45e2d065 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.26652142338416845, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e045e355cda624b061443bc9d3787b9a15471c30 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.32269503546099293, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c0409c52d29d10ecadd95956991f19c1c239828b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2976100628930818, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..67b8c3338aad1198824802ee0d023b935b11a2f9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3461728395061729, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4844986303c05da3bfe5923f81e1d147471bff54 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2976100628930818, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fbfd978893ba1c4e39e51e105126a715eaef5470 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049999999999999996}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..af062d70bc3afa1741b85dc676a5276463b56ba6 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", 
"dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2b3019bfa87d7082a95abbf2f94fb34c6e79f4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.36, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048241815132442176}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a088d54cc044994ce719cf9ac89eeeb65e2f2b73 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% 
else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eec7eec629208fd348c0967fb77f9dac89d93cb7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d60ed70ec51510a34a2c130bc2a1805939e5fa0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.46, 
"fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c9bdd6e723344a741badd4d86e1ec751ed01f8d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_1.json new file mode 
100644 index 0000000000000000000000000000000000000000..076b0c1d00d4c9c6a2a781b98a168e8298ad1575 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ec23e16d30fb5a4cb35d4e82fc237f26711b0cde --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f184de475b6e32bd1dfd88c064b810f2741426d3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b5481e12d874ddc3a2b89da4506f681cc2fbf839 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6d74c618ebeb2ee7a66a0380d8e5b1d7d3d16f5c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1d8da2452b9ade4733c169961b819d36c9baec3f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562427}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..538c6b09aca89ddeeb5c38e254b1ad142c0c14bc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048783173121456316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1c892c1edf877dc4bc1be9abdd35914448e4370e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.36, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04824181513244218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8733fa804532c500821175d538469668df802771 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..56571c67b7cb462df08bd3eccb2ccdfbb118198e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2c933487a770543ad3d013bb52d45d65375f13bf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc9f8aa3307c4dc3714c0fe4441d16ec5046214 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bef275982ae1d425ca72eea8b3b40cff3e5b0f22 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58c9740ecf175e54960b7e3d6dfa82bd92c5cea8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7dc7d03eb589e0c9dc164e8bf0f5286ae82dd9c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a95715e280d66015201f8b320e71e4dac5f8c19c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ad0a6a986a3ccc9f1f5a6109624f57412d59cd61 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bb93474dc0eaec5126d4ffbefbd2a88393aff667 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562427}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9eabc44d2f55bc0da3bab2661cc4fe5c6273ee07 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237101}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.35, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.047937248544110196}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b21227c25e07ea5808abc3a66fddacc0e9193e78 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0083cdfdf6ea67a8ed98162a803b6ba92dd20377 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4177c1d85147f6c83f49f162e3ffe713e3da2424 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..96f174490eebd81fee1d0bd61820c9d0838a0173 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049999999999999996}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c44ad7c8516f97e0bcc6262391cb8d3d501a3ba --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 3.872234562531536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06666687313033742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.19449892872154664, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016649751999105959}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.3385859719691564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026943011480723986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.24189630901934833, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019324120985675024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.07891999672740428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009533683551406194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.14092640846495855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017170890317364676}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.09892905722529392, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001163722813056667}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.15770316249517122, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012736741855482734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.27888219312603624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022960014811976687}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.19713523690512094, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015249596621099413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.17297504164263205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015157865911726413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.30157958831453335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024874143749410256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.21518082639820849, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017666395173976037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..43c26c4557609e17f99ff797cb9fd6de7f14ca4e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 5.618664355824302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.044804380544830956}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.29980096617944, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018479476291498587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5510960644199242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002746177251645845}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.3790777063050893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019180894806166623}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.12320524930026791, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011603794001722478}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23411042101520732, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021839181570230963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1571930586851638, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013851703119846389}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.21150876369324337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012091371590184875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3973772754817481, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002348890772602287}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2692553660223294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013312984394904634}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.24763246470479972, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016397591465259363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.45603718052824205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025930969811509105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3132026852747841, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017579190044477355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..308a08f9b9811955943288d5cc6e8482a1915abf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.655264751599302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10851166271873992}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.2956786617020848, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018018971024399673}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5505653372006829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026480715686419227}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.37482466732781167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001820505134270105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.129469707240088, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00121806826731611}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.24899455762459116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002246273347531876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.16543669174208636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001418641213100885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2175954785088815, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012481117214859327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.41307647476574705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002319703375666559}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.27750839928052384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013211747767623314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2472195816112961, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016439382219917379}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.46060025676338434, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002548116733582456}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3133486537665277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017193781609667741}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..231d6af38af99753c9f5d2319f04daf7b5724948 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.948031260618786, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07024191698575197}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3028962873149275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018453669075132997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5503482491585948, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026466497124214964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.37972552437025653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018092314859358045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.13494336292795747, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012826406801381255}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2522143776432092, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022660646785606113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1703052547809578, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014604429313616653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22176702009033705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012999603001234818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.41046709529612835, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023382620871218933}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2796335792236481, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001342957090112707}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2546972130694942, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001706082034205406}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.46258844899711243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025482851759882335}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3191175618704876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017351084798555157}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dc2fd78dd24c1ae75184922f4ec8dce1e5dbc4db --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.11987499239522, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.12056984540140626}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.30702881034304425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018940787804765146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5510843355772259, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026497395614324347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.38334638913624297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018564450302232401}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1383407701093842, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013023724651524862}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2550398171391987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022789288990834686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1738253944796353, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014854824402731792}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2225382181098862, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 
0.001346023397016377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.40621675306005106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023173016257193608}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2793510383067617, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013892803599562522}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2605328118487642, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017504023543387641}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4677978217704759, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002582278022445715}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.32528129826527175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017876332019525448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6a97d9ae1827f791577a30133ad81f5ddea9e2d5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.021064993034315, "fixed_answer_choice_list": 
null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11474627928215739}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.30826387603129907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018679719801617597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5491523180108668, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026220387381628244}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.38381893127843775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018139236221588497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1382607118725467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013102373270393998}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25338957755007935, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023051813874414608}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.17329021394802077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014896565428026973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22310499125513847, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013603626862817944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.4041141573216587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023508428676916305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2791853032877538, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013976915078093702}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.26191481097823616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017583830712396412}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4668488083352266, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025930890163178466}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3260232619607743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017795108045555923}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.json 
b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6c6757ec3b9800f40fda1e0babcf1e0bd2b9f3a0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 1.2087041243674348, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06236227717440753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.11495488513218448, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014917009813178457}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.2228480616093966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020952477514755564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.1485373299435639, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016885326851467698}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.01961510433394365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006727987776785344}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.03657357239352748, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011960431470597172}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.02500994962430241, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008315526871574698}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.09374319199704093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010902607092754888}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.18550669125605201, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017172608070746052}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.12205181393918246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012795921662070245}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.10182213242486479, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012349205591824877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.19958826927621215, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017982055619464277}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.13207144996991402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014103766498503702}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c8d88caccb9cb8ab27cc214ad1c6a146ac4f858c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 5.719467204595487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05467102028072658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30689525431041903, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016979120981340232}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5607471544318147, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002541857266880532}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3880684705597522, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017293341079469464}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.12815271668435907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011402205486632784}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.24170118147382424, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021562937828079123}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.16340881202208163, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013545945899839753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.20611685586596215, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001198478500170298}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.38367500882495165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023144118800144126}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2620939439124173, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013281943135113716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2540076031390351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015610224671734501}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.46438460329961756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002459731462210942}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32115446566223654, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016529379356976753}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b7643a7540e8ac8968ea39e15c8ae909422d2708 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.481462554213176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0637597415756679}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.31304294257033033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001675584026068997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5791447618132454, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002476899866394818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.397978553188765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017070834320949354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1361700393455817, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011628001288050507}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2603882016032195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002228895122171891}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17467867245016275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013955055510223417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21315768275684321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011907007297547669}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4014777565448129, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023252703119155016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.27248947489993885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013417737031681963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2617186737971144, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015398800543719527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.48463177955108755, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002439535597275367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3327776090849803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016458286188702437}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9234d77ee0a3dac37fc623c97ad213ec384297d4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.612323545023084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05976238792927671}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3112721062235333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016625710557320614}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5797972107422691, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024927181964562407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3967724891293664, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017005575062401018}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"create_text_for_me", "rouge2_precision": 0.1371955035707705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011750480699642973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2640373903872155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022501962149717035}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17648552604551038, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014153455920465827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21252206793017903, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011987552894719596}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4019456170099069, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023015152807916543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.27221279373131524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013520964327518468}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2596822226642286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015551841205914923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"create_text_for_me", "rougeLsum_recall": 0.48377583683476527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002472550681565558}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3309974761576321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016730505404799294}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2e2d29ff72c5ed1f1470e81c0a1944afc1472f3d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.691991628747666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08350250197448322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3071216799151067, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016645873768075543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5734199732733007, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002497421870329201}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3918497625319794, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017118070244260361}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13621235371774604, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011845432721289762}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2624365602666334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022614408095196375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17525069265082474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001423044315562278}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2100790609152195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012057172066683016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3985723300660603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023232420666500823}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26936703256276495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013634469715740345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2581428748840792, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001562414889046062}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4817784670028705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024626011315168057}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32924292332328525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016762151102916476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f382762229dd2956d0903940a1812e7efb98ea9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.645972521435882, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09437494975947229}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3064735649925744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016464375750162158}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5729264801132828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024516925929830738}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.39134978775086127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016911013819947294}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13656095219775433, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011845029911870509}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2634948500997642, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022812481290132693}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17585670830781294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001429411829744073}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21048241965429831, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011953684611401595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.40033920081936836, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023247277061194146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2702295976370569, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013593970744284988}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25867956586812857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015561827508017755}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4833218028427562, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002451881437295078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3301539965432325, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016676064941407525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ad920be18aee646c9c516871e6cadfd0e1ecc6d5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.0028333333333333335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", 
"prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009561880076163621}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.00021398046398046395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 7.182900764312899e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.0003977278759887456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00013354742540894128}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"generate_gramatically_correct_text", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.0028333333333333335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009561880076163621}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.00021398046398046395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 7.182900764312899e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.0003977278759887456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00013354742540894128}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.0028333333333333335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009561880076163621}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.00021398046398046395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 7.182900764312899e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.0003977278759887456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00013354742540894128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e812cb9ee4a665310c4c7a7782918fbf3fd0359d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.7785930657042605, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07371286925187409}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rouge1_precision": 0.09307418766156685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003831133208744367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.09684372086710462, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037674098990848615}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.07707231946910663, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029012639839078266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.026646794320064136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011779628965901102}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.040281858969099534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% 
if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017553221623290288}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.030860375813935463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013215709327802443}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.07194869702103061, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033904984440571335}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.06646442818559543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002627445124677514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.05293870853728192, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019913751107059636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.08237065814087174, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": 
"Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003597616882105883}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.08104209712761277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003180633123309235}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.06475443481713601, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024499768948500517}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e07dc95b6c1a618ee55c7310c9bf602e51b50a21 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 4.024301729421313, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- 
endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1532981962559809}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.1602164847749438, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004262488194816882}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.1893467230974042, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047311550193664745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.15080808821145256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036526531895815233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.053157839459133875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001489446683458996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.08110347423194934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct 
text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023151112057970876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.062337691922640125, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017277056488068175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.11906005873487642, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003676903873427953}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.13037070564313896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033405273229848032}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.10356717573719904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002527891994310027}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rougeLsum_precision": 0.1386315725589492, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003950043680135484}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.15770240014439066, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0039943713450391076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.12582494031130287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030784867770067177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24ed1f27a26da25cd19b9b83b2d71ce9fb230fb8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 6.2616432240038, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17231457870322375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.20703579454147295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004332835339427102}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.2544471431249528, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004998201805906469}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.20303028288911848, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003846560801683136}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0728023032757386, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016348952484523923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.11015181488739925, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002540705745285619}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.08500284986690841, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018852577618806415}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.15147184243295325, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003718090407697094}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.1755748249833486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035736277120166807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.13949830107514838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% 
endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002682661526674531}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.17716302997626837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0039987377595068554}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.21119592100785614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004224651124239091}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.16864868735595545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032412006854200303}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..69e13fad613297d89e6d879ad8d1a8f132bd2d1c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 7.796909762932712, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1669092241005445}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.23562398578259758, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0043331973616364825}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.2918086256369194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005035638323372833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.2339004008254913, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003879962429844055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0868161625025297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017252832094021909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1301381299164276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026742073604647753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.1009343907225879, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019833361884621687}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.1717453473169215, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037266451879112016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.20219125520831865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003636988165748045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.16128440664165175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027278295370788417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.20100861293322395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004005483075655645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.24237757228826143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004281416671478345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.19430984547637892, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00328399177028088}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d68b891bcfd3679da487916f9830a339a8e001d --- /dev/null +++ 
b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 8.504901044297757, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1823982585890325}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.24905192651947664, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004201658035798914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.31491991748172854, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004981037624003543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.2534732144080346, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038630129775201146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.09568433998600458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature 
in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017454509782120025}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.14154088138404378, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002652205184771609}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.11077316594795349, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019906677332412748}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.1812665699547391, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003574420490219075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.22010316835835358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003620300580738604}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.17650040533909794, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027472261983493805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.21189846325175168, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038691558037997923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.26224286165281196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00426905910632388}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.211084813103085, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003295971549920327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json 
b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f01a30b20d3b2c2623ca60e1e416a5d0fa784ff --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.3138256134465956, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.042478650177872466}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.09054875368611116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001766981005220654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.09899686345340121, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0016339321555193618}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.08626819478230045, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014217358109946006}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.004635097118571576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004584060579508488}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.006688039413188141, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004649040639428354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.004707141554710639, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00037478881160494376}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.08868499326830791, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016895151305050993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.09761537714635501, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015968584648120006}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.08482739955206746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001374786595816378}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.0697631127415905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014027633376290944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.07822809105458635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001337355132253774}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.06674662648443055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011005680176848465}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cee7cc238133f99eeedd87cc56d343bfa3ef0023 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.380638446426456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09557609009378211}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3328975043204133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003509121285453912}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.45270988550034613, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003062231284181297}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3400267632274026, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020403696620665606}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1499329665878703, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002263691832530101}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.1983614972096706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020825028564397568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1486663277769484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015136508138340347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2592088269282877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002698465205645672}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.36228982914822633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026355915533943}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2676344730573564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001562435233832024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.27037933017473875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003062674460499446}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3663561677071641, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027976045204321166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2753891028069307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019025285983907253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3d9e329d0c51f21f6fb40c83d3cafeb322daa5b0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.3634920937473884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08616925114576995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2954297857860177, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029411840632251563}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.48120558575759376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002844478586577598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.33740317942041553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019381935089587043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.13374093712808155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018836984171556573}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2183048706264126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002121311703914965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1507673483604289, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014399337503822303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.23342217539614568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002214559299952568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3907575640409568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002505134070265779}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.26968155815162015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014592699636009493}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.23886641857979662, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025627121020893886}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3892155963722792, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002698506480779706}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.272599814561145, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00182733683050086}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2ead0040e5e0915ed7c2a0295fb0494649454199 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.870066226903575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09429297050700895}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2995136417218204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002877943933866991}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4870699752407154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028159974008536274}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3444162792721549, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019646659016549933}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.13989458331193982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019259857294848428}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2283186786335389, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002178927464111703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.158839720125521, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014985646156960436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.24052403808712888, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022515901969111986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.4008413679522952, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025096595704963047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2794048970004934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015380955393133962}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.24501488471812358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002592614810126833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39799000178268285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002715969584528833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2812645652371936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00189815704910076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7ee4ad0f6a3292d450eebc7d3f732167a5ec61ad --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.017381678234455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09166907850331675}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.30143650107181685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029314795240309452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4898031820273571, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027687333603014837}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3465967840398284, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019574393578079014}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.14062404051839908, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019112056232477572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2300710505484132, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021943432548559823}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.16013598883167798, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015096765844560063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2422982418880056, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022053611865584053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.4046980533581673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024846126430562907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2820802687702902, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015095048991938657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.2479193537232055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025849971540480303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40376209522673123, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002721581361466135}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.28522597669249417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018960405914839836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2af077aa2e945a48a0223cb362989db8b25dcf4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.260211722864896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12365349150703955}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3111425574072635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002912308644972262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.486978690036314, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002680144522191287}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3539789382700311, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002014807799247425}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.14647797107879432, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001938088224804308}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2296020100709576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021177366913279795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1648812739511937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015410229627803459}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.24849239666244133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002231342628042193}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.39930639747976754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002405155865675547}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2855363745230388, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015231421878303527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.25960917213172013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026365144213955465}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4057201628355578, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002662489705069069}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.29494158509832297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019724923487253014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..143837b3f9931531e2badede63044ccf73edc691 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 4.369170070060735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07204262900742721}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.19059607371932916, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015976396764760467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.41030667797285486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002620865522372885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.2540920126266791, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001743002313846714}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.07958464128539072, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008512115391560173}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.17899027774981593, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rouge2_recall_stderr": 0.0018510907312350095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.1073793884770636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001057524513675077}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.1612888816485327, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012295381787847557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3522789855005096, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002286155396163342}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.2161433172109618, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013779255059734295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.16374077920845426, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014355224351735455}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.35431534893364613, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002559304615115808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.21871213194709838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001635496810687772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8477a32b421118ca7933469e6d461e09c391b94b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.211882508400509, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06087984691073597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.36654808520126797, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021595231400698836}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5825981312150091, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002574992415760553}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4396792572537879, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020547864942206586}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.1645678333861915, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001432238933148995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2677310092553899, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021604481761519365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19855266031915028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rouge2_fmeasure_stderr": 0.0015519283833039233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2570869498638546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001512490055714042}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.41635046992448654, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002345198987228168}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3103214994230373, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015536997171868116}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3030665303851319, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019368940114554841}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.48277562928021334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002499697912373815}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3637273198283416, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019205698683061002}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..14ee0f99538ac6c1d64d6e851460458a2fde3754 --- /dev/null +++ 
b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.618865785291484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09097368527602766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3662955696861521, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020996329194513177}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5898222196381344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002594156432810366}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4418673874038735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020138555153139262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16735791685138235, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014286796631250598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27701539052345003, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022736180715884347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20344292743727435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015925311127849093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25828930577862486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001478479770624807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4238497421384899, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002419962083852209}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3135424508873694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015516029637538471}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3055329010748124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001915478709605836}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.49320303301582513, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025853778948590807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.36881863339193455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001931324334873031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..26b8957ee6eb1217dfd42d52cd1587c885812e84 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.838654430562631, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "bleu_stderr": 0.08770664641031416}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3653208109803443, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002089278039587839}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.592412984015862, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025780992158462137}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.44203444996828867, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020077674797794494}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16785818992713525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001444636379498952}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27957982192927644, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002322764141427059}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20463135769763866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001622776743332051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.256612100970035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014662808098234782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.42390466259959425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002389221135189317}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.31244127951805134, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015421579875842658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3065499840629947, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001929720013741525}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4978186288299029, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002597275043020314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3710630525199719, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019551761163624595}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2fad90ed9df07aad50d01a5151794d6513fa96dc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.664692810007541, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09761641585182086}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3575562551113939, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge1_precision_stderr": 0.0021077592166897385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5827020964603545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025902702211649093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4334753650888552, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020383137826383165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.1624053149694478, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014639579406940794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27102562833445276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022990242634122787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19812463968549573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00163502903837983}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2508887474422941, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014913740208745742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4154733525023283, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023523244972575128}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.30580137424286685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001562672024634994}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.29999696611259524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019295094756439808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4897297628793384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002565917788303965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3638439110752512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001949294337253243}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2e648d9b967aea9f4400c75a106f02f2560bfbbd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.621008002795477, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10561546867270089}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.35130413207029, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020584837208460355}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5752142038046015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.0025882386282357122}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.42664757166768275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001997906445835858}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15915263498125684, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014446749684239855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.26695747744909765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022973676919300907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.1945014895681582, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016230097540462368}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.24911377312443383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014837091652911921}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.41458252977023213, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002378711814640577}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.30416823372321367, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015597528882425847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2956310081550243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001909657845289513}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4844948513886233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025773079411811253}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.35902864736200923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019327436427338909}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6628e38599a6ec0b20a26318c2eca5e089dbf9f5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.09973135507547863, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015973080444745867}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2483551581802608, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036884551976866927}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.14045583841997622, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021460672635315163}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.013401782513852779, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": 
"{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006463045667296502}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.034653579743229065, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001719993783733488}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0190566029197429, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009165165117914078}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07968673916612877, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011529481491568863}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.20035694919481709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028106923696229165}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11250509517032704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015602744886009001}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.07993531686685008, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012877020733687047}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.20117384720414933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031333624545092204}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11288459242315821, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017514781321482647}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7010026542087479, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05557886575567746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b1df32d6c9969d3ba75533eb82d73989f6ccb31a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11279192477107143, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016032349108474262}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2766207479271877, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036066236500548418}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15840047240909824, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00214692641322964}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.013486589799660116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006573025385863716}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.033834009894690674, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016640769892439025}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.01908469081931983, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000926119882035849}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07903088461894958, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001042052923479771}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.195968848936739, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002491056986604568}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11129001725840247, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014098818626850234}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09067322769906408, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001252995528243287}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22440113001075607, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029610037715897537}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1276356932508086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016950763004042622}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", 
"bleu": 0.6799429561855435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04879251393094118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c598f5a602587670b0eb9c581f0306a54ea8c085 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11759195430339829, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017689078804778012}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2889675810942405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004062189732858682}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1651965564409125, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023832852201559826}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.018563434665803503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008384469201564846}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.047169939804170904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002175973099128802}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.026322118720045605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011845918036721787}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08578950235723362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012213960229954062}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.21259477004430985, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002927850647938266}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12076659101965195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016553933643729203}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09390304577868243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014156962162275915}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.23292557695865002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034258792471330533}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.13222073865796682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019276011969453877}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.0018438511511791, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.04226334665257509}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..724115b1543fc59340424dbabb26fe6f09d66a5f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.12268247063515418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002004429738071348}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2920661828398265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004556228488458248}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16959674585893536, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002631406491027894}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.022908311173769465, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009443759203422804}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.057536503485596614, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024061542462623067}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.03227615271942288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013255534293243935}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.09180476596307438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001465280362062083}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.22042631379574948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034317489548799755}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12716380123453083, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001928363853274409}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09794481566650097, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016340921798746898}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.23499521663243025, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038347753688523496}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.13571148483109557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021714373357817397}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.3657013486015295, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08729701107016043}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec2d43010a18e2c8888164fb0907ed36643523e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0353729182277329, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002120724556038934}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.06963538400891243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004245614128488828}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.0434980036557313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002547351830150303}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.006434256872019694, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006047069516458604}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.015043103963332192, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014409503521463944}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.00873416530365632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0008131278526907818}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.02693683498375084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016356740431801658}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0526495618772081, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003212351040820403}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03275624297934461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019019226248188242}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.02896015737664471, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017797342286746572}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.056639676651518825, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003532055034648066}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.03526865532078303, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020893045270848074}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.6251466968093098, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11921280575698076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..200235e086bd28c9a9748ddfb739567caf58f82e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017130559457731787}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.0001583205885417987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 8.10965586465461e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.0003020119153708095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0001543461059925123}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeL_precision_stderr": 0.0017130559457731787}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0001583205885417987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 8.10965586465461e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.0003020119153708095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0001543461059925123}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017130559457731787}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.0001583205885417987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 8.10965586465461e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.0003020119153708095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0001543461059925123}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9c0a097c3ba7d4b97b3f8a4c2e963f77d2e767 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.17644165930352093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035159930377663164}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.31669581782847406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004204880715544353}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.21057429412703157, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029620022295376264}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.045892492699293866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019747730351747164}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08074272013017356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027066202276012723}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05349637631115593, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018866213566015306}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.13966279995339326, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027977834380213088}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2533248436654788, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034708540063103682}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16723801755038487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 
0.0023715164713580613}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.13757208272049673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028514349754435358}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2496171915878273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003711257561558963}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16467108129832378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002492627666300806}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.0794867877910552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08783103175339281}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3eadf578411482b701054705db70f02ced717bd1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.1452348716004801, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018745609056768366}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.354637619733734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004310412338891105}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20366433099160464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_fmeasure_stderr": 0.002506589804140991}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03631887785067335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011535908212142747}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09256525737403981, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029692107599791377}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.051542487477497304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001622782922318877}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11435270633499948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014769836849210006}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.281332040329211, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035948252196224143}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16066302558679957, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020034483691429915}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.1148601995557349, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016090320139797805}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2832784836907842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038965527110954635}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.161499696273688, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_fmeasure_stderr": 0.002187584389664099}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.063408955221386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10793829466409724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d780c942b66325e5d7b66fac58d49badffd26f2b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14803725413512445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018733512973581588}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3610354652240794, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004389296174890975}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20750338741773555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025080223518937884}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03764444720923887, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011831632104640343}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09572292100591462, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003053817939922839}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05337831779753894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001659922934200303}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.117013811273496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001487754889983635}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.28688812754471305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003583873941321553}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16422047885591626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020030333826574695}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11725652432083132, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016010042579447314}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2890122736547096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003981206683676861}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16481921665835544, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021846730064098525}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.0662695038715833, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11604502835868223}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..a141f0e567ee495d4733508737585916cf9e35f6 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.15005408001279993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022275262688393857}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.34931766468701486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004799937617922755}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20491006822687677, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002761147760944239}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03817756156834947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012790098539207952}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09333832339595143, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031111903067984276}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.052933674983345634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017273248399238827}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11653214452758791, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017452636081294359}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.27290106515166107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038631652216362072}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15924458772595346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002153213639538366}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11841489997398834, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018745424755303739}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2773235770549664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00417220414799562}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16175793225879814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023311720661425635}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.187340339866472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09337748219810847}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f1742330b8cb35789f3d89102eac196dbfd2a0f4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.051357937177083465, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034115765564453455}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.08957854109700515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050933482072906084}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.05673653310470254, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030997047693230627}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.011627742228248835, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012945900748551743}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.023933291444758153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002099710700842424}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.01418426797251855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012124277033053392}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.04126617524992539, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029705705175859863}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.06969693031985601, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003989487538391626}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04423726916636083, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002428377343138416}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.04252467493794107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030391760211394485}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.07216556113564128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004193160744760016}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.045736152774752646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025402110740203494}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.1685643584135952, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1951566077821766}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f034345a561bdad5020c420a767d746825e21c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.002815097659210261, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007550078312773055}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.002058439850730106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005489718025062792}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.002332190655682947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006168325099045509}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.000355815568079719, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00020880203605271082}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0003037449971412236, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00018804668706895537}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.000325473526945072, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019692751752173486}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0019160135596418518, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005319338017808152}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0014383859186266749, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004043038910360008}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0016104525385042783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00044591521297227443}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.0021304217929180094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005877695890086486}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0016128812346314246, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004569974976967699}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.001800332071457902, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004998974285323676}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 9.155484870335624e-42, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.4620320117295877e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3945034ba99907bf0963485f9364ed34137d12d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.17021358510341336, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024373039382226077}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3520563857681371, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043918973093069365}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21991779087563262, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025921031610263}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03871189783801819, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013940343459980615}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0841631095984103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002861696063072104}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.050742967235947956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016968813344534614}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.12632426569550884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001895537050144316}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.262981285619352, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": 
"Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034911247032713426}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1634042310463717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002018854408540066}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.13144702322249674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001966787320258499}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2761271441560725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0039020054143034}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17070284614092165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021883919787213176}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.0719375915266327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09381739892136316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..18281f27bc4b1b4dacb287212db1b6e0cbf1a1cc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13144344568204896, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018269600623641938}, {"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "rouge1_recall": 0.32351107047900046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042227591238073415}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18467573869385082, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024560883653668973}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.027812878457341133, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009822890456735115}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07174244190002714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002623646025136939}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03956915695649403, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013969113381600835}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10177712280555466, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013642070339291124}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2530115289147634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003361296390437375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1433445973017209, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018589035220238875}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10392255150260828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", 
"prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014996096643151138}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.25806957261490177, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003642123743587609}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1463591812261117, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020474760089749054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5179351179741758, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05443901746520919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c560c2a35a0238b34705edbc4b29cfe609aba1b5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1310122719121047, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001774463078853173}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.32197642898756434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004128514222989525}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18395936513372504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023841684164883858}, {"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "rouge2_precision": 0.028592074721851848, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010146723977319822}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07420088979991933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027242327105131063}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.040730679478674064, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014445249170945054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10506688444375611, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013615120559406814}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2609222630071341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034251450207401918}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14791071749656537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018605423267814124}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1039952495955331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00147131239785375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2579710996210495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036391736637925577}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1463682832676705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020126152822738364}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5610996318449655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.061361742735683046}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e001cbf99efc427bd21f0016966f605be4a831a3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1284212760059797, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002041598803417733}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3028097339054088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004615535937102858}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.17637720631581139, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00264797834039018}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.026902495307978534, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010371639486970465}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06717388422150615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027080652362817086}, {"task_name": 
"gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03767895922224648, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001448043764579111}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1032652428870357, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001609764469770287}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.24490235352225267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003766948264946798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14193633867939442, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002090431283179229}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10103180270559059, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016751299955245328}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2403006525007405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003943762632099877}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.13894288832668733, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002188039489816412}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5762937299614814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07362827411239845}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4346c7be8027017877443b5f56475c0b630fb4ed --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.041468733586705914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025971833623889785}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.07627594081632273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004409224115860876}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.048911634240246346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002765697903406571}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.009048514108124975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012181947613622947}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01699435587690579, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001625353914816647}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01035393012550112, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009785561170831303}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03361424158025485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002192257614857122}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06101985571016838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035062484157730514}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.03907075670579812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002182547417010533}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03401363654888806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022524026754152373}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0612418372651088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035630181436990096}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.039302040446406394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022230245548452298}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.7513821615574038, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07905589981346285}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc23f51cfd6432abbbf47ddc4fe07fc7127aeec --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.003214034101498322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009221332972255172}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002755372289086832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007486991780011711}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002904097352119159, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008025657506213006}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0004553870087049595, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002660277152615564}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0002936371804296332, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00017278177414040624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003555930988203656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00020825306662602857}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0023980769070289483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006896763285113808}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0020343184807463843, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005525851742486296}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0021617867610418057, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000600765218788033}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0027490625210994384, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008051771911473355}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0023634542664537216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006464968490936459}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0024822527344372726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006926475370230472}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 5.9664964945316196e-36, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.37663460549604e-30}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2042bcd374f033bce7ae23bec8d7cfa1191f71f5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.15669364098257, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023593186592923633}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.342374371356989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00437907614669953}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20603426954244244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002503133782526908}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.030783905399410026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014282737935382643}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06881031698129042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00251481083212002}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.040126835769534804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014339486984846503}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1118036127326331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018294125382753313}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.24498174699159114, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033060394110144436}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.146668869960305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018236466465731617}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.1218098460935552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001924491999404007}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2695259798153811, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": 
"Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037470358825042837}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16072295732233483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020479535637844687}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.6729531930201382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10373342822948405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..76ee9947ac5ab93a73d05899b66b1acf8bbd7d8d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14788910472235636, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018389112071502275}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3632558549763597, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004249174048494396}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20772684063267086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024574107856270774}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03422805026149446, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010929716541644215}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08684664107270823, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002805600167681186}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04851112854401421, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015396516708931478}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10960862089079661, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013730570572271972}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.27132452858310807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003336994048105982}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1542221581796846, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018545206679636554}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11840342830898004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015232398298886394}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2933510485102284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037203141797391708}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.1666793386233674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020689435117509725}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9783686260260467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"bleu_stderr": 0.1607197494307949}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d5ea68d2ae4cc1c994f1107f5f17182d2de6d46d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14591017846113652, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018829813606970626}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.35417878162606353, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004336112520719766}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20419784491514162, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025189481605346693}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.033450171038277396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011255630755179697}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08454708459773036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028776060737512313}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04736923476037229, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015840684340098893}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.112357445163031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014098869920409992}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.27457572511057443, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003414209315676662}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15747382683561614, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019009865513009152}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11456495811701795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015800062086523089}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.28097827542002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038590856032283075}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16076832155716242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021544746480995827}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.802744798920398, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05920522944838898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2b14facd2ebb4c8d3b747834a9844a4a47c71890 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", 
"rouge1_precision": 0.14153741165892486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020927973441702535}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.33105961459502653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004650854800318841}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19422961589893378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026855658611884946}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.031230937266789643, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011103593069502417}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07676489034975985, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027744764471766274}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04353291741965738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015306953449422075}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10809159570024703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001596172312046679}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.25411156370365817, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036584523222178214}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1484909183587473, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020672313290265535}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11098479481711339, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017470432310918238}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.26198935148766644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004040675001072013}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15261940522006723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002267767106091909}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.7884006934583645, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07907574142987076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..75f7b3630682368846fa3e15f1f24f0eeaadeb48 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.04322202252738077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026080541844490843}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.08145947706027855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004597925269874288}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.05196928184468956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028568557569465304}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.009217798283394814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011515928095724962}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.01852184368380728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001676813297073238}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.011212198666180598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009995094999208916}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.034066440428070936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00214328249850724}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.06359700022888996, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036377298500940803}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.04056089023030101, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002247569677623111}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.03534210706251266, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022064701869351773}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.06678926323593559, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038482831513349976}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.04238835402403162, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023646209795898624}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.858896407235953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17813375842071116}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e65a8e73a8e1225d9e44b3ed8734e65899745c0e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.002820827522912925, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008241014146566761}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0023517396771233516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006955198818749466}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0024619745886219103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007068894891377003}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0006013749644563891, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00034827715329002203}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0004582288780401988, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", 
"prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000276736360710651}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0005008107704990395, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00029259545868901905}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.00242534330718512, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007134131592000302}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.002077455090168221, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006420787798082092}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0021415097697000925, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006302603088867335}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0026888839947429815, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007810704185094188}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0022737730468411124, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006780275889829637}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.002363959396267095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006794393822266892}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 6.730767350313837e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.433189914171102e-32}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..94ce24f1a82efc48f4ba0a94518700663146816c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.161357867973778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023312739071830115}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3579687540791487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0045453513416916095}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.2156313325024613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002659844715910497}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.03816216737627533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014424649587105316}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08755656958993782, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030000299859141506}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.05124089074244038, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017434147699145796}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 
0.12090520179487235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018371253252162907}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2697345298612283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003627327469111352}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.16164669459673872, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020770948236222796}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.1263110602681917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019426636581592435}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2828419243164692, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003990709561595801}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16919936670173066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022655905836503737}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.139796123448687, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0864912451995421}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.json 
b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2e4b440e79c9e7f78c0d172bf4085e43f0638f08 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.13025847232053914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018599814789393584}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.31675206292581976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004227418702250003}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.1823978250185138, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002494889694305653}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.027325104152554167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001013865586628979}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0693976720523522, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026208374515650262}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.03871722499957788, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014304008993493518}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10265566343076016, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013991604181805165}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.251846472183418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033529856967831223}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14404400088909086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018950818451793567}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10154113890914356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014807942774655064}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2494785849680061, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035726690764623942}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1425722908096705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020191945274259963}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.4741686816314712, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09394903896491678}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3ec1c220678b4106e2b22fabb9cdfd77671850b9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.12565737450340297, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017756118340194749}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.301251337982613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004081353108976665}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.17515450965695337, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002375995326570658}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.02725985494322299, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010257708768464891}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06822620908929833, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002624357062351836}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.038463882894735665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014439165288524466}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10562763001104193, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014305250731057942}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2550603689921568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034548246638929424}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14750967049873018, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", 
"prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019352926679694342}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.09586724631609989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001432813745557231}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.23246625054729966, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034962734013648925}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1340651218539847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019565808957794375}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.4891373477876866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0811523557434027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c8d9e699ecbf78dd68d1c3572ea5c8a1ade34ced --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.12133492152104836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002108052911078574}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2771118310610424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: 
{{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004513875830290629}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.1652692842917284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002687544705180198}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.02620916447329947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011062000625487128}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.062338878703100085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002685465546018337}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0361588906854937, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015014165247081573}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10257894420601281, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017009747933540017}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.23624832791965056, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038268204334818137}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14006875772839858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021980071144352764}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.09313729488739347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001717942721208578}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2143290252431487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038151394078170886}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1270213480349254, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022098783786563886}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.548542398410883, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07300875765901975}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..512a764098be792849edc7ae40a33c5641fac4a3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.03643719370186493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002327364890907121}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.06520608390203138, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003996161845682704}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.042930043440632525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_fmeasure_stderr": 0.002579054260477061}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.007917882560037967, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009203895438615683}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.01504105351920977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015317801562663841}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.009582916301059853, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009704986418888822}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.030403765142791965, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019637434849462013}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.05414283521586396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033076203364920415}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0355906132515583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021268636919251588}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.028885184139424556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019316925983647367}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.050662384755815255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 
0.0031895292441193332}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.033437432695987035, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020528905348075315}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.6256689661157525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07517644959617945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b3fd4f40537aa6abff82c08c726b0ceff820c07 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.0011435105774728416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006698929504092044}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.00020164583843829126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00012654106887528574}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0003382885458357156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0002086842561151499}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.0011435105774728416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006698929504092044}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.00020164583843829126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00012654106887528574}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0003382885458357156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0002086842561151499}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0011435105774728416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006698929504092044}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.00020164583843829126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00012654106887528574}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0003382885458357156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0002086842561151499}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 5.331681904215219e-233, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dbe6da92adc5af3f59170ee8db6cd2de3370e575 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 8.221264640592608, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.3149873650760696}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.16666080011920595, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005180719422543184}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.501863559860732, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.009052996765184864}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.21211270156193437, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.005814456021753165}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.12083386201007094, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.004471674153000206}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.3806965470945097, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008597586629652064}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.15599026594193496, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.005128636671935841}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.16072996710527454, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005070936585051591}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.48913954648097346, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.009013481487297286}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.20516494642415065, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0057181465206832}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.16227516864724326, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, 
"comment": "", "rougeLsum_precision_stderr": 0.005116365519276017}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.4895892873231678, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.009023257251500322}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.20678919293239362, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005766515768579625}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fddaff48835e2e89fc256cd4b270c17fe229a503 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 7.3767510595275665, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.31999310082457355}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.2215000417152911, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006137200329200616}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6794277393295427, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", 
"prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0066774062595464905}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.26918713196522137, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.005971404751580413}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.1640201975855851, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.005503885160452421}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5160107151183727, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008200440904778427}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.20124233569826713, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.005612862896818122}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.2140530349109963, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006041148360807656}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6614267819116432, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006904677170599779}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2608873390569801, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0059334583892896664}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.21594311252863366, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006068594142965329}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6630530525028233, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006882384898937598}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.26275507037270174, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005950371049088624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4732d5ae5864fd8ec133e31b1436faa558de7a21 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 10.989723799423952, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.477322210631878}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.40990335902092456, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008259416445477983}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6715421937524536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006833449843147891}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.4219856738666251, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007684410627868399}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.31002907544350694, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007809148641476673}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5179977182718715, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008225556639305274}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.3238830207743833, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00755763096612864}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.39649832180640104, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, 
sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.008182538521597163}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6546910349804671, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007082687223632161}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.41063474482635426, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007718338436949308}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.39933775034518904, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008215822611471774}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6566914458481494, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007055405166544278}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.41268141482914944, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007720124425223064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..94a570b7bce6834882252356a30db8f0c600862b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 15.589010417058658, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8454556706430587}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.5135186861705211, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008431281414525968}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6766130266106622, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006905636373535094}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.5126931501254426, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007956454706339308}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.4022833285618224, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008346085939704194}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5340662726903487, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008140102426002193}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.4057090348109076, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.008083151105761782}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.49968417841017443, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00841093839820918}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6615311276531707, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007119822079266015}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.5011072943138228, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.008026772259796028}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.5022494859348401, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008415389296058761}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6647270305912438, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007078089811566776}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.5031731006894711, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: 
{{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.008007126558781245}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ae4d243bfcdcaabac58c082c5f5ccb7a949b4b0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 20.004791024670315, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.073121182179684}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.5635809903885909, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008037675550817377}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.675003625813131, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006934430967374778}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.5564854580202747, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007723304287288785}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.44094340540720767, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} 
\nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008336049944543478}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5353191759273204, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00816164151577736}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.44072198270594876, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.008143626907731532}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.5481052255869446, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0080708932692548}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.660534735042919, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007153754571776756}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.5439527567369827, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007832639006320164}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.5513165852733105, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008062156731256905}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 
0.6634891599008953, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007102976763556776}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.5461795795622169, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0078049936381923745}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1e16db88d427c68674ae49acb203c75bd450f57d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 20.541530615378996, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0236650658463775}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.5751633897784844, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.00803435176216989}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6898127295322619, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006762762659826006}, {"task_name": "piqa", "prompt_name": "Correct the solution", 
"rouge1_fmeasure": 0.5698885703517087, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007714826040679644}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.4546948153669257, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008366514761157184}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.551114824772799, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00808762271709775}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.4560672630321141, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.008171926975401585}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.5612455638728635, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.008091161331947056}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6761124712337756, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006983397662954877}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.5584117042533167, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct 
solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007831607552652558}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.563673777021132, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008077158591177138}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6788119556165896, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006943167870722146}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.5603248083479772, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0078052598358510986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..016930189a2546c25d5f6174e15bc5499e0d35bc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49455930359085964, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665133500637059}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49455930359085964, "fixed_answer_choice_list": ["Solution 1", 
"Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665133500637059}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3946dd98b2d0f5f14e98c290ff8f2f51a6cd4590 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..08cd7a48699163fe1a2c63dec4c52f926a2d3b6c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.514689880304679, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011660788281735496}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.514689880304679, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011660788281735496}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8596735a448980a4c7516cd256f62fd5d75b27f7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5130576713819369, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011661845375886342}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5130576713819369, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011661845375886342}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a0f161c529e208745bf1438fd39c5613072406 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5076169749727966, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664470424044972}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5076169749727966, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664470424044972}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f0c6f54cd6173374f028b6ab16e9db3e3717ac4f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 
2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664055982032838}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664055982032838}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..456b4f2b589448d9954d3f9265e357968460d3f8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1653247598984174, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.013821408415080715}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.020998722136358804, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005269403955655725}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.23228388738077216, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0041360813271565395}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.036532603399439284, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008337337093122241}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0030927874561869893, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014944861884636616}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.0386031419214451, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018914429245019343}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005465981531797976, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.000254177372520646}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01917199471590868, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00045663889214525273}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.2161115069458247, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003797851830611}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.033434992892598596, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000727186829587635}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01720510228861565, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00042829503630886036}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1977154227864982, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036870557526130065}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.029988024300834148, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006784161057869979}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..15b6604c8c19b40dedda88c30f58e384f18ca8ce --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.17208205991093362, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010301725465366625}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.0197808533467512, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006637399084174438}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.2083765310213374, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004054556695081749}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03341660597066338, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008614668722471364}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0030943760026781223, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0001632745006225758}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.036249687836481304, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018967653687895932}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005423476681292554, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00027465476938201355}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018317454340834892, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005782023821136248}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19659411245081346, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003796357362538508}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.0310540742053096, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007532823292737799}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016131297257791454, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005516670663513384}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1776065844884305, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036553543805523184}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.027295011757363707, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007014745436916857}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..352f5dd21d3b72fea12a182582a442bbb9df9b92 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.13535549963348786, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.011883201814221686}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019974450931119343, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0009096575382126053}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.20327780750289512, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004138109405329968}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03167331762534985, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008149162454634007}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002913124216946272, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00025124507871194965}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03375022580735237, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018971946690473117}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004712505847751591, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002441580237771217}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018532639910942308, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0008013412795519539}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19189422414566185, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003826189608547334}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029502879197560596, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007158939543230575}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016551043276073225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0008244428247702725}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17348668108248785, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037039820894878926}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02594274780299628, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006803333166678133}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_3.json 
b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..852b662146a086bd01211135d56896bfc48187a4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.13494230997950132, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.009681091491781743}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01911132845089878, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006891049637900843}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.1986412875937398, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003917458930279225}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03186472547854922, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008686349358294461}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0028206675747611733, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00019060442122968908}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03252120345407891, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019655949748724573}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.00470586424330017, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00027245099685908755}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.017666871030393803, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005944257003565911}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.18735984838330555, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0036829206769154214}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029595321454027084, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007712454590830175}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.0156124167337213, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005611219330915058}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17009443127881965, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035528405308160285}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.0260842453327782, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007064564831714856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3d1b4ff2fc4a9fb4daf24abb25a86d06feeb6bd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.11573444053320703, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.008126260546620632}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018273055931518078, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006213051033422689}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.1929448451365874, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004110007510165885}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.030447754959100907, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007971323310690517}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002439514135649695, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00013971466675209783}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.033060693641110854, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0021387293538605323}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004310884060921524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00023951170742409153}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.016854212838326506, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005620151575706898}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.17976699456632925, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038004506713869165}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.02807329381234764, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007021802242750015}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015034023539825695, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005141904845083435}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16691215660017894, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003747360877378185}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02508537471034259, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006359302429150094}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ef6ff9922a0ed656c4d3eb8e7db0d50be55dd57e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.12248518122547715, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007751047228685078}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018924493431835766, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006692135261600222}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19719638092839015, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004120526129777433}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03135317378479231, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008162298173479573}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002778538269095852, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00018494204805708875}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.035803100604613836, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0021320541894636206}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.0046971364054093695, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00025307514350424073}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01729661551542472, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.000579508348294998}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.18308780779648284, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003805938347899692}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.028760590162420697, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007251430881917033}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015684253519749444, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005565161139387542}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1712948423769965, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003765879887375436}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.0260467389410542, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006803550833664029}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..61fe2a6c4a88fdcbafbc1d99ea62674f4bb26ef3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6323e12bd45a9ea918eb325a047017d88c5c6e6c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49782372143634385, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738873}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49782372143634385, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738873}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3d9ce67a66532f784e4d78b44f31194dc91a8474 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49347116430903154, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664829595210969}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49347116430903154, 
"fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664829595210969}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fbcd53c42b6a68b043967519904f708e488f0c26 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4880304678998912, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662480968070049}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4880304678998912, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662480968070049}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c074d9c531aa93bb0a7b9f9fcf46d3da2bac0adf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5195865070729053, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011656869979288456}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5195865070729053, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011656869979288456}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c548ef9ae337351dc4b7d82b1bf2b7d66e64a8af --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5076169749727966, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664470424044981}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5076169749727966, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664470424044981}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_0.json 
b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..14fd4225dc55f90054c44d07cb4c95b5e767ffaf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.559847660500544, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011581954727227395}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5522306855277476, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01160199979686681}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..444054b39631199a83cdf1d8942293b047a9058d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5418933623503809, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011624803747232126}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5424374319912949, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011623729421518132}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b37a121db81e7a52ad6f3b93c747e73993093511 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5321001088139282, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011641758014820126}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5342763873775843, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011638380213532437}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..baad0ed5cbbaedf3bde95e6c5569f567e0156d58 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5310119695321001, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011643363511107457}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5348204570184983, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011637500993815848}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..975338881d6d7c0c6099e2536ec78c9d1d6cd881 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5413492927094669, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01162586411331582}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5424374319912949, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011623729421518134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..554a472ac192a7cf3d511bd2b1b317e70848f3f3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5386289445048966, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011630956681145912}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5478781284004353, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011612217507379627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f1253862dcda5992d966af78b4131258fd4c4ed3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.623, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01533317012577985}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.548, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015746235865880677}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0187923eefcd281fbc583a7907e64a33e78703d7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.698, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014526080235459544}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.686, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 
100644 index 0000000000000000000000000000000000000000..1a18de9531631bce55cd7e824c969134f941cb68 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.715, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014282120955200482}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.698, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014526080235459546}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c0fae02cd52b883b27f664b95dfed42f3ef6e0c1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.71, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014356395999905689}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.709, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01437099598237794}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5e3ae76d9d4ab4cb4cde1226d01de6340a32d5bc --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", 
"prompt_name": "Direct Question (Closed Book)", "acc": 0.717, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014251810906481754}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.724, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01414298497574067}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e844a909cd0684af1fdfa4423a4f47863deb705 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.727, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014095022868717595}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.726, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014111099288259587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1aae20f0e9c59af3922dc7d8e76fba5ed6aaf662 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.876, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: 
{{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01042749887234397}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.804, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012559527926707352}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..af4ecb09da5360e6e954eb03f27977c6c763a27c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008916866630745906}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.876, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.010427498872343972}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90d73cc2a127e638d72a488bab01ffd21ba2463f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.914, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", 
"acc_stderr": 0.008870325962594766}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.893, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009779910359847165}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e4432e2c54a99f9b795571e089c349e48a00db67 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.92, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00858333697775365}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.914, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008870325962594766}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..54c9920d49fb566e30a2471d7500f0700649895a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.922, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008484573530118588}, {"task_name": "sciq", "prompt_name": "Direct Question", 
"acc_norm": 0.914, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008870325962594766}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..074c80b9badfb73ad7a1eda9f684d2d4282248cd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.924, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008384169266796386}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.919, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008632121032139967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b90c08ba31465eba5edbe4a798e35409a2640274 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.486, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], 
[3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015813097547730984}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.419, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015610338967577794}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ffe1d8b761afb83393bd032181b1b5bfa4087f75 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.517, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01581015372983343}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.477, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 
0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015802554246726098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c4e9480f9bc24545cad8082f84823c4c2083a945 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015816135752773207}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015806639423035167}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e73bdf76e9f5649e75cbfa412cc63b7a35124e6b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.529, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.0157926694516289}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.486, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015813097547730984}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9d5a4cf5865622a3f8786f9ef44bcd9ceb781e99 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.545, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 
2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01575510149834709}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.492, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015817274929209008}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6436ef4326fd762d306ea5e85a6b23b2ca3f9459 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.547, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015749255189977582}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.506, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 
3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015818160898606715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eb60435714488489bd5ae609f8bdb410ec9af257 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.627, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01530049362292281}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.534, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015782683329937625}], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..339cf26629385450cec3e8c7d4b68f3a476bc411 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015816135752773203}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.472, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015794475789511476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..8d53ea7438547d7de5a4b6e4d9919fe87e48bff3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.583, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015599819048769618}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.537, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015775927227262423}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..546b4aa8ea5d4a93a774613fb866bcfdfab50690 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.595, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n 
[2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015531136990453047}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.555, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015723301886760944}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3704b0384e885371760268ea42100dfe075a796 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.599, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015506109745498318}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", 
"acc_norm": 0.58, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015615500115072957}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c74afca6ea2f9168bce4c752f0c45299e03c644 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.585, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015589035185604623}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.563, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided 
answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015693223928730377}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc079284ed6f968eb34e731152c53772cdbfe9c0 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01549968516584259}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.519, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015807874268505853}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at 
end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..01d75795111acdddc1c17a7c3e2aed9f27889906 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.585, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015589035185604632}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.544, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015757928553979172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e8fb3fef96d5e44fcc9f4a2edfb9d1b515fec890 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.608, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 
0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015445859463771297}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.59, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015560917136921672}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8f787e4785e3b455b984872605951087955e0a0c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.637, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015213890444671276}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.599, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015506109745498318}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0de4bb83cd044b49ee3ccb70b4fa6b7ee53a13c8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.62, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015356947477797575}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.624, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", 
"acc_norm_stderr": 0.015325105508898134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..69411b472e090d4eb7de915f2c4556b30084a314 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.625, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015316971293620996}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.609, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01543882629468179}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..3d9b2a7cf060ed82e6a06e70ba5cfa4d7e8feda1 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011551049647290307}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4975948690539818, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562298481438053}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1b95ab93c424f85295e33acf44f03cec657e70e2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4585783003741315, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011522687288692527}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4767504008551577, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011549925483927456}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..45ca8e4e561f0110e1df2e23f20a8c530cdcdbb5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011541325320336618}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4740780331373597, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546883081384896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1af52d77198540fefff6df79820d92a18e9a07f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011527657726586461}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43fba6bf57a7419de67100d668711130ea01673f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4436130411544629, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01148867172507346}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4585783003741315, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011522687288692525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0a8c4cf10af4e8e2aedd6b437c830b699e23ffd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4462854088722608, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011495517440721683}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4569748797434527, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011519544865928056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..55b4f317b573031814ba50e48706a156b241af2e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4890432923570283, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559655791130727}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5264564404061999, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546234813777409}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..efd32c77a7b579647381b6386a3e8f48ea1bde1e --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story 
:\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011541325320336616}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4965259219668626, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156215314916829}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..550ea04f5f34f46a0c68077a949ff20fcd061500 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011535764881641411}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4820951362907536, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011555016408505476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b593afaf66fc94a38944ef956afd7edc17823675 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47247461250668094, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154489847386459}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4836985569214324, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011556285484521572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1798be4c8e39b684f36778a1a1919d82cdc2537c --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153229486915312}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4756814537680385, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, 
"prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548748301487319}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93f64f7db9351401b1bb128a013b7ddd171ff14a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4708711918760021, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011542794417345717}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546234813777393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4058a0bb27289284b6c2f72f50421b90becbadf7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..66a18e70044ba7e02f1f304eb5b0e6a61e444e48 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..152e1cc672b2399a280e6632e59c01b86938f38f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8fb3a8276c3ab2a91e53713e6615d8f2847ec0a3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..798923ac94b4dcfb906b62d4dd3c926c322c1457 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..43ab6a244ba8bacac5a7238848acc43ce378025d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a89062198ee8544f350cdca5792e2341f7cac6a4 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4751469802244789, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154813982307477}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.521111704970604, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011552120807053812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a38d710eea8b64c1b173ddc4e15a28b7abddbfea --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.47995724211651525, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011553138977961007}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4938535542490647, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156155858904076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dfc541a0eea03e45a4bff236f9df2e6d1ff30ca7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697235}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.48583645109567075, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557792331301667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..41556cf08113f57d4b43d849d4fd40de5a4cb274 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4655264564404062, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011534917341355132}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.47995724211651525, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011553138977961008}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..98e2ca0a4ad44d6a368688bea27b76ab9a45fd72 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011527657726586461}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46766435061464456, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011538227692217273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..94b4f51c0db8579afb43f41cae8bf9d608be3864 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4462854088722608, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011495517440721682}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011528611805439893}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c5199cacab540e6f4c3f733d9937709dacab7abe --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5114911811865313, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559378273599123}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5259219668626403, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546883081384903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..adaf4246fd7b8ad8f5f66f38dd1e87b927f1f171 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.48957776590058794, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559920087347776}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5136290753607696, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558135970599896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a27233610f76947166fea777ead79672e266fed2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4949225013361839, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561836054238777}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5114911811865313, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559378273599126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9b152afc53888ef945487750b443fae9c56e3d57 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4938535542490647, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561558589040757}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4997327632282202, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562430600098489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..49209a3ec3eda76a30849ae2d8b2b5af5c870b02 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5077498663816141, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156104327886355}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5179048637092464, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011555016408505476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a90405b18bab9b82d5c9c799a005384372436083 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4938535542490647, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156155858904076}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5152324959914484, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557065368348293}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ddc71d54c93bbe8d2cad048b5b62637374d2ba86 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20cbd7882200672a995505447e08cff19fb04ea3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.516245487364621, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..91554c9a316b3a80ebe97828edfdd6cdf0bf7244 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4e920f75f6336f287aa3fe508e4cbd2c97d9b598 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366422}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f51ff0afe0ea80d5d3f368ae21ae644f689a1223 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.555956678700361, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029907396333795997}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5776173285198556, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029731622646495887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1e6e16aad7f9de00c7ab06b5118e2d719fe662f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5667870036101083, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029826764082138267}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5595667870036101, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029882123363118726}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f25849ada28677b290424e9f1372758585b51176 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300727231673172}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a737600863ce734bdd1f7b801c207d93a21721ae --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..817d163cb8e5867abd48f5d7bc383a499711ebdf --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..33e3affbee5c8186f4cae747b3518e01e3fbe150 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bed144236a70883ac58699a6d07fbac945bc4748 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2b20efe37ccbb3c233df69544d3babc9022694cd --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3c59437a2873964650021e518b68e66871fd5759 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300727231673172}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a254d6d247118407457e589f8d3ba900ce667852 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51f4585296beecdd6aea12954261524fdf788e74 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d8043a1ff73570afd8a33a49bcbdd86c0c2fbb2f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..18c56421d476076e56009532a30e604b140b830a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2850c949a7110b7e62dac13a66b84c71ac9f21af --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03000984891252912}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5451263537906137, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02997363649541526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aba71d59086c8ae4125c70bcea406fcfb9fbb92a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5471bfe752a6e099949f33f9d9814e0e96a8139 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..798741e05905bc7a6bc7651572298d460796f815 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a62ee00c94222ee056edeeb0901d83ad8b542e5 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..636a4fff454bdbe68e9ed2fe4f839be6bacc7ec3 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.555956678700361, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029907396333795997}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d0161feb1069e7c5d4330f34c5dc193949a28f35 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.555956678700361, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029907396333795997}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..66dd47b4502540bad3688a213f1c73b7b791faca --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1aab4a27065e7e22613ae5bb39becc5f789b7cd8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..20ee795a87f3b0e10e36dee5a9a9be30795c1e58 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..52aff9c1d76dca27f5c09add23af7fbc3e7082e8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0584329b229639e7f55f13284ee87dde33061eb6 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60d289247d7d9a4b2ca8d115b8ad9cf6bb651936 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7b35bec9da8203bfbb9bb3805802b21de14cfc85 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..203436745d0b1d2f9c1d25d19c0723c1459918e2 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228584}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050905521228584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..41cdc86e9b33b7ca6514f75e44f05d7a010f5c00 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050170094497704}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404512613097859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fbb3095477f368e1eaa6cf0a37a1d1d0933382a8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else 
%} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014039239216484627}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.526440410418311, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014032823874407229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..82e9fcfb6a6cb3ec237ed6442c1ef24ff8c4a389 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5224940805051302, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014038257824059892}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5201262825572218, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014041096664344329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..629c27127446ce9641ca401c562a21c2588e4c3f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": 
"winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404827882040562}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5185477505919495, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014042813708888378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..872a9d889c23fe72fec18661aa522f12b2a6e20f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051956064076896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c448d7d21a0355b9bf47daac2bd6750f4e3e4e2b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.48855564325177586, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014048804199859325}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014047718393997667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9c37907bf3b014e07145a6a259eea64eadfd08a8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4940805051302289, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014050905521228573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5dfe2094a92045768e2e5a4bb2ec2435c7740c66 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4996053670086819, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052481306049512}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014050905521228571}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f152277f8ed647ee3d5b60a5ad0a27dc3beac531 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5027624309392266, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052271211616445}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014050555322824189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..273c58c9ea8aa0abb608efb22cdf86f30e5ac688 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052131146915852}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5114443567482242, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014048804199859322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..857252f7b9277d9448eba8665b5aeb6b974a9855 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049512}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..262737aa39e2252e672fba5c4af4bb5ab4254dae --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076906}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48145224940805054, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014042813708888378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..858a1cd2231a7a9d69bb11e06c24e3940e184767 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529007}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a49568079c41e471d98a57d9607381cba686f65f --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045826789783666}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4fe13d3b143ecf7f3480038ec2cd4ee4e21f081 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367592}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c64c5472eba419e1bf3378859b583384d621097b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228577}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..faddda42f2f84e126141b08fdb47ddffe04d0c4a --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140492945362904}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c6838b24ec4ca6416961c465567f5d29016fd998 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", 
"subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915848}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2b59a264893a515694591db05dd6a18f232269c9 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fb56eb2bfa24c4fe310647abf74ee7e79ef6dea7 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", 
"acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915848}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62fe6ae5b401cd6c60c6318eb92e07bd5710b4a8 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824192}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405248130604952}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7a4352fdf30fafa5285a408b804dbfa62d2d9571 --- /dev/null +++ 
b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.48382004735595896, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978601}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_0.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e89f49c9fb14e9ec89b0f5c02d2bb2316c08b936 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440419}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_1.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dc1ab49ea0c2b36b1dc80dab57716a903925704b --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.48303078137332284, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014044390401612976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_2.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e573164c9947d8e372af7ad5362a00b66eaf4746 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824189}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_3.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5040f7d93430d311b5655530a7ecdc25f224df73 --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790513}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_4.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..98bd00132df4884f8e8ea68a9b27086bf2e7148d --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076892}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_5.json b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..478328b6f653b3cd161ea554351ea5f5afc587fb --- /dev/null +++ b/4b284b17bc4/eval/agg.4b284b17bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367585}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30363a9c2aef1cb955c33ceb81d272a2e5786f65 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3757d62223c4174f3e285f9b141eaceda90a03e2bfe6ed1e8631c862399e9bdd +size 12428802 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59f02d6957ac90a5ef95aefd86fca89f57487626 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3751b1e6fd69a3a4a02a29519203d1e7a3fe44c1d110b4b4d18aaf658f3250 +size 10162590 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef9371b8ff5475aad20636fdcc61719ed6ded190 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d945618931c882187bdeb12a2b68dd71c7359ca47dc4c9cd026dbda6bbce789f +size 5963043 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aea5fc5c180c1f2735afd4bf2061aad703bfdf89 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40c902950f5d425252805756b589c07e1d1200e244e22500e6f86994ab89df4c +size 6864028 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dccd01da592cd4f57bdae6ca026d2c912a1e4714 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9246dc985305ea27634ab42f2be89d48ad7ca9bb7b0c25c169c3f62f53b4bd7 +size 23324700 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29404df329810f9249f57cb4a46ea656f1bc25a9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97666c8e3224891dff6f48949051437d29ab04ffe33471127a082d18f0b1d32a +size 17376042 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f74f43c4d16e0302ed974a5b24623cbad6be45e0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b6a34fb8afcb4a83accaea2b73261502ed764c59780f98e9b40cc0d47d2590 +size 12507879 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6689af64818a94d50a0c710df08a75f1e2bc7dd5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fac678a46d44a33e74123dce5a8e42b183466a8f16c49cafb8f4e9f546200cf +size 7639858 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a12b59a6c02f4439ae652b1db415bfe4ac678f4f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:220a33080a1be029a10cc9054919c7bb6efb8521bf583f672174e0a659fcb41a +size 4491838 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbb48d99879143bfecd998256df5dc5f635d0537 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7c59319c4e9875141c9a10dffa0637945d419290c7029db222e7b3b129e085 +size 5328341 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe9dcf3791771b17a5f5b729799053cd6b9d8563 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9532aefc1c0fcb99bb8aa9c8e92df6bcc17e58a3b37aa1f0d3534ad414bca3d0 +size 
12180444 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ca469f39240e00a2735fa37876f32195a1fd672 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e17e2b8792fbeff0e5ae02527cb11ed13cf3f1f3fce7d44900c64483c2aa4136 +size 13681354 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5b35ba1a43420322b0df7ca7751c4fc2650d45b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547eb09256d96caa6e64cc271140e35a84d779e4810f15f6cab832c49847640d +size 13572792 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92dd447b21dbb24e6e3c0a596b4983401f26c18d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab47a74398b6843e74002330cc1fb50b920ac1c6abaee211b21d377a19526658 +size 9753676 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1486592dbef45145d1a1d527a5caec067966cdd4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cfa4507948a4f3874ee29e3bf8deb824739e93c41db30a05c04f8c605e0f93f +size 5501628 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2a142290518656d2e1f9316680d35b5d73bd8a2 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f446d79dfb5740f8713bffd026473d9cd53a60bd3d3173ce081b22be1f6ecbd3 +size 6063886 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3636c08f8c662f92205860efbf29aa0da72bf221 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32e3d2bbfee95a0d37cc502d93aacad536814c4b0d9f3b80beeffa8c3f144f3 +size 13292906 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b790e19d9ca9b390142d79ddc23508c80ae0a858 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c146d1201ac3572f58005f0ed4ac3c8068cf1cd5d937edeaa5d3f265c290e53b +size 14515972 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e853f9f2f4a9a80ed3041aebe3e5d93ec636065c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a595a00d58d4b625e7e3e01cd164f4cadaa6f5d4edbe22801972c783569fbca8 +size 14003142 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18fc36d8f7a586461f1583dd67b4fb7d5c15372a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b4fc241ec3dff8c4d5a2ca28727d9add5d4c56659a9a9b49b1b3a3e7125aac +size 10007208 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71863b3b70afb55e4f60c4c401250b72093c4fc8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef730649d966a1f74f733b37d2bddd665ed1486bc3a0e1a11fab1d97cfce0111 +size 5918400 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcc3d6e2f2b6720c86060960e333d8767cffa1bd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0803179220f706079448f076f32f69c0dce680680f653ebfd4d4c701c978e1 +size 6890676 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..580211ccff1f91b35f771c45986a06e522aa7193 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d58f3cb5e681da5a765b52ee961c1124df7c86bc6a1e32935efed661ec95f1 +size 15403954 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e315665eb95ebcfaf27fbe57ed93375a7d58f63 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaea95f9baeffc8a07fdb6fb75e801d0f7396918bbc4047fb88fcc014a54209e +size 16870328 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..091a946b7e65d1fe4bf8b9f2a77efb5903503a2d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1dadfe9455389ae054d7327843df10cad435eb93e3d983003f6d6dc2fbd5ff9 +size 16730451 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbeab16c645286de041da66a6a12f12f1323654a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29dcc0406d3a6636667f66c00246ba5949f29d9dc387c6502f7e0677d70ac99b +size 13786146 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96e3bba539fb9a5fe2c3462252d30a72392285c7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd054c424362ea03e8ba43bd89265a5364552fe9ccbacc8aaeb953601a2be397 +size 8076859 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..25729045e6ab51d61ef8b0b3e2d469f8e3ac5378 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec4848541cf50b0b5d205d3e074b3556dacfebda154b8f4bbdbfa627158fb27 +size 9423882 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93608ce42f180043d9d8a7212f15d2fa77ddbeb3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22940ee101e6fcb3fe9c48ecc62831afbe54d93b554d5d0e08ae4f0a7a565d9 +size 21295860 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..001e9bd4f9f0e9d45ca5267974257bbac688bb40 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0338a9c45528b018c0510b89eca661b0d9a752971bdad1e88102c723c2a2b8a3 +size 23791454 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26b9ed6125e63fb0ca04bd748e631fde76688028 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca1802d1c795e7afa63555e31738fe76ca9067cb1350ebaf1c9162de15b7bf3 +size 23802180 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61a6b97ca40996fbf650fec918e45a60757eb19f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3b4b77b2bacd4e61e3c925d34d1c70728b9907ae9371ad0073339d1cac142e +size 27114730 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0a823aafc8f9dad12144f03b1197b1bf58689e0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ee566f43254e666189bc590767392a45b1395317bf461610e2c4b6043604a6 +size 19235015 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc78db26603bd2b7bd561fccab2595551d274a37 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4994855259362782fd831e1eb37ab80ea989c69d660583b087a5de8ca4cf606c +size 24714909 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bb2508454bc824a6c61c470e33aaffbc12dfb4a7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b5dd8d717898073920a1c9a5441e54df9f3f7722c27171f343d296341ef550 +size 89709987 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abf2baec02b8868e33cce5f91134e1dd47917156 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35e54808ff4b4b78e664c40ace8f3a6514349f7011a4f7dc34964acb6e372832 +size 70588154 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..5e839b0b70cdd4b80d974f21bc40eeab6ebeb31d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:495f019f39c4f50320026f16a8cbf0ef868dce9a3bb384832e9b9a3da2eed268 +size 23575854 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cce261dd5af887808fed7c7272b993483e6ed3d6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36417f0ce6f244e497a2330ea92a56f3ca47e31ed424d87b09da7138354bdfa2 +size 27188026 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fe744408e940ba47d31dc39797591b385fcb91a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64e38ab49a5dfd8d35a3dcb53b3c1dc9f8eaa85cb880f91296d4af1b6a65d372 +size 19264733 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6db592536ee929fe5fa2a9d60859ad665cfd153e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47977d77ab23636e8ba1d9ba04d98764394f2061e3e274b9e46e94f1472aa0d +size 24772837 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52284824f29727d1341002a5babec001debd8fbe --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b62f11e8d06ff3c9e84103b28b76a0f043c9b23acc323bf9a223e89bf467dc +size 90017079 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4fd47c5960411837aa1fd964a90505cbd019e06 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06d0cff02bcb705b6760728d513bec98388100e1e96b5c77eb3cfe73ff3d92b3 +size 70874108 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f7c391f802079f952c0c576ad809e098bb90682 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84147105c3ede43e386a45e45b9d86db10638681b09d3b6a148b7d0c7ee29592 +size 23440089 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3fae7b577920678ccff7fe2afcf43a3b86f8e4c9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beb00367e26f6ba78dc1e948bb39c03359ff37938861f490be4db58fd26fb484 +size 27417698 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..830b790d71a0b287ca8b8c3b49eaa1cfc679ea44 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e580bf27550597db6b3354da0baf23f6ea24203582e55f47ed378862e5b1f44b +size 19360826 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecd1dd64b0f2e9a1e965a26e4b41993a70318fb4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa2e19418021d7aeec35711f70d18ae47afd0ea304c937b40134fcd3d3fd9fcd +size 24828883 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..803ece8600d2d5d24a75642a84171dc8a150fec9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a78ab89c9897dbd5d674acb5e44b2be3aeb9714a3f0d4cd75f0df16fc1bef1 +size 90312741 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d3c578b5b0174bb84332a832ab283392d8156d0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1a1b389e098cc1668290e83352e16b6230ae1cc6f88e12c4eff5be315a2852 +size 71157030 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3fa80e87dfe53c0a22e4190364695bff177f0a5c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:715e9160c66efcff858cb7d54c15282d8af2e27b106699fd32f76848d95f4a5c +size 23119410 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb2b3b2f118ae54817d9b268750cb6ee8486cc5a --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c618286e489ad4b56d5b9bb42ab197fe7c32d65f42d5376a7be10b32935ed8 +size 26607964 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..09d64e72b6e917bb7e43263c05f7f03b1370523e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a039b91df1cea0e28b63da088276f35d904b9fd37cc130ed6b1a97bd6cf1098f +size 18875780 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9aa3931895ae6994f849932c2f9c16737f20e495 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f9b239f9b961836e1a5f03f2ff0e261621345316bbc04778bfc8f937624440 +size 24289382 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4eaef46d289f553f70fb01d03ecd0fffbfd7b38e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30ba8058d769e289abefb962a6dcafffe2aaf8516d2879a263440a915acf87ec +size 88376592 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b60c978901d997cd61f94c47da9b64086b8d2a2c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e20c70859fc4b454531c41987c91a72e22e4d04681202b45d96d8e62189c851 +size 69597986 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46c8843499dd4348e94f3b8a74dec532ca706b43 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d0f25b673e07ca5fadfcdeff37982b694f5ba46156fb482d89a01e595f3be8a +size 24886647 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77335bc3f2387c594ed8b4612dcf7d70a86ae4a2 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b228e10eec1908472ced59110e35e68ff92bbed6ddc4f7c944c87cc71d94fa36 +size 28152596 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..478fedbd63598a0bce216bba0562cbcc03e5c022 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76c433fe71cb6603ad4290663982860dce8d50f55b3fcf8c94052d42c8127e9e +size 19878802 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3bb4b73b82cd38286793de6a0b6edae810cffda --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ce4d1eaa2a4b1755d001f6050a12e5752e3a0c2f21342347c7dc553e6c420ab +size 25455388 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7001e9fd0331e8f6baf86989e096be0a82f5b455 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:892e5dfe4ffadb6d79d27c7f1dec1d893430db0167d2fe80f65029e1db4c1c48 +size 92571153 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..264f20a9fd8bb2a6494b3e7c1dd4dc22b21db52a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dfc1e3d04c91e37224c3a95c9591401a40930b30439c5155c4f6fe785b83292 +size 72944188 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..317da5a9ba9e410d2f8ae3a30566df8f9b28c930 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61fe4219b6fac720b73f8c3f4a133c764c44a749c9d66d8d7da35128b01b50cd +size 993114 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b940ba51cc9e98579c214fa193e45c323a1c17a6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8232c5d0235ca1df0631d5fcc8e3cf12d4be44dff3553496f8fe539be09ab9fb +size 2903222 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c80e102213f9d480fc02726db102eb06a5476f9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cd10c58b73deb71d70def276c52026c4f3f4d84fb12b260770dfc4d5c9fd274 
+size 1910335 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..288021b8bc15f979221f5c89b6db79c64bdfa67e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea1d54d20881867610874796b49ec1b75936fba95f095db03cab919558ef9b7 +size 2367576 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6e3a5926deebccb30537e1913ba7133c5148ac1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a5994f5c938d133a0e82ca58f02f9b7f237c503dc0ca8df5114196fe4b4413b +size 2821862 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf4df71b807b7c4e139432f610c3ddeeeaeb1857 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418382abb9f510aef22f02717b6a4f7d5494383fc048a59e0d58056f750dc2bb +size 3278114 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20841232794da687783d1d570391d5a4c46d5e5b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed92a1f1ff05bfefc9170d43382782a51a6d0d39b74b7944266cf5e2ef059393 +size 1203409 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b22dd0eac2d7267e86e3396918fe2f60ecf8eb80 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b898fec9dfcaec5eb3e8045fdd4f0806ae0186db73ffbae1f1615cd020e4e9 +size 3509914 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bafc48effd2bdcda551053014f656e27a6dc0fb1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3aa2a68422da1a85b64df385747018aacb3af25378a5912cc9015daff0f3b9 +size 2304725 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9967cb00a8bc5babbcde2ce19ae666b047a182a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3667eadb1c286b715105c329bf89a30452feee193dd9b2fe3dfc97e1a6595b8 +size 2853011 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ff4fdcb457a960087ac8ddea5c9d295551fc9393 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f283126352b4495d9a5343fef8fb2fa927042b6a38adbbda8c22266e74751f +size 3398207 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a71b7757de320a7ce22ade35ef2a4794e037a68 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e87ae99c9a080b687a2805208bf94a2050b230ec19633235c3d6426ce8d9131 +size 3945474 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..337a7b1fc085b4e67c7200dd95a8721c597cb42a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b67ec427be28e72b7c2ac3ce6f34ba5402ffba5f1cd17cd00167e4bc86040d0d +size 1007964 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..755e08d46f9924ad19a6035e885adf2b5c6b3680 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6390402fae686f6d48b7f71e62f7a23143c2950684c6d2bf9cfe8395812ca8cb +size 1949618 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bebd98a0fbb1feb05d6781d61c069c3db3bde268 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6edd03032beaa7733375583a6fa91c3e43505fb86e40b5311ee964f800d0f822 +size 2418943 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91f2ec5058041019e9135141428067a03a6cb4d6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9a1d9bc64f35952343839cb64663f2927b76df46f85840e8d3ee95a57f40d5f +size 2885393 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1813a33bc96a2744b0807b4cd4c5bee0251e8fe --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a59096c843689f0dc29f3754b8bf4f2a6036924c7c4c92bc0fc97fce8d1c04f3 +size 3353827 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..363f1a4f482889f1ea43a2e1fc4152826c586cfa --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4710391cbedf75516de1ec5031d38cad30c6c979e40d46400733a99a617e3f97 +size 1160984 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dce112abe8a4879e1a43ee3e4978f13e631fa0d6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8178fec2c3f51a34cd66e104df634a90dfcd0907a4d6ce960babb18a7d1260f +size 3337294 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d37d038a33bb7f043ad03b7fd73e6b236944e4cb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422bfa041d273932f7597ca4982fdf55355e9c1c51a3cbdc0d9f4e1a2d431913 +size 2176377 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a0cdb3166a61ed81acae272504a6d87d6e8df6d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8de1ec9edf92f953045db3bddb2ff4b13b57cc282b1ab630b16100b8937dc13a +size 2683115 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f8c829e63a60f5d0dea0aa3c6de0bd74fc691bb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ec667d70e03b6501b7ef12d0ce2a5fb59515cf0c4ac2ff4cee713023d2c98b +size 3187040 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ac9fa634e14d4c927e660af68c6b9798866ffb8 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07126c91d37e299a97ee36aa35b743f8333eca5aa21c3639d3eb2b5fbc96f585 +size 3693003 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21559c1d6a53bf67e053e1bf235a2ce76beb13be --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e4850583819515c1a5b54a5ec70ca6a5130869776f2831f794b9389b4077fa +size 1027071 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf499c30bdc7a48ad5b40c193d444089d69782bf --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8545ed348ae0a0d961d469ba00a207c8fe127e780d171bdb047b71aa2cb6c734 +size 3007244 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..812e7eb0bc62caf195aedce7c56c43e4eb788e5a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958b5b8a1e9620e6c320b5c5bd64ec1e17b8e327c40ed41a212ba926a486cc4b +size 1980575 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..992b73821f6786a8dd13e9f0e65a8036a4458ce3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3091a0808e3bea9fafc3e7f26dbd383b41e73f54c1920e65130fb952c2d1b810 +size 2456022 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e119338077a346d2e5d06bc3a237022f553cfde5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf5a8ca2d9a41ee4a3aa13cd49a31a79db38b10f90bc4d13da7cc55060bebd5 +size 2928585 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba6d6c4583942b2f5863fdf78c6d24de44862979 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e71efa9a024fc18ae118325a6a01755a3dd0b1d02d4fbada43b701e10fda1c +size 3403027 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..2b3a7b1e17607075f6d93d0d8113df09e1feeec9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c5c85b00bf1cafffa9f2b9c5b60cafd496153476a6a489a860eb01af65b36c +size 993515 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..333e597b11d734c6bba7857c9f45a1ce6aec368a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:187734b0c22d7d63ff023ec02cf244458dca0630e32c3559847284388d413369 +size 1446846 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e927033395881c6e5b4451285681cd224f11be1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861d3bab349d0390ac521a1f9eef6e9059e6ae09ac9d0ba8fcdd003a48487ea0 +size 1900321 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f00580a94ceea92f056c2b36df56f79807b5059b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:400f64511a06d6cc5061debd0b7cd46a034bbfbc5af19ab9754c5ccbb8b0e441 +size 2350108 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5407b6aaa8de4c7e88a369413a37f53cceb1fad --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938107399f6cfe97d2483a05fbd95c64204e46848b697e1e73863910b94f39de +size 2799706 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3464c2a8709fe889dd74d7c6d0200023ad9b0d5e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527647047030ab6991897636fe054f9971fffc92a8a4e7b75cdc4e1c17155136 +size 3250282 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bebc4f60d6e04258e8b5275a443954eb1f356051 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb2dd25e3825f0d45bc575338a5a5a021e34b2796510bf59e0b48634e8d9cdc +size 1203813 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..4e53efceb75d5010472a92fa20a07aa5b42197a4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df331ede7a7f285076446941b9d11ad0d71a51badd807aa99e2c736f657cf21 +size 1749993 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcb95f9afc3c486f310317ff782a2f42da5a0ea3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69955c993fb01d97647b6191776f4829069e40c67b9f3230e8ed314e0c8c902a +size 2294685 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7885c583e286090ca47189826171b9ecbf0d352a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8d2a6f47cef39962a68fdfde6d2cdaff78d79b38389f89dd7356f741b2c552 +size 2835584 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2866037c65c81e4a9830feafe73f59ea211a49c6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46264013b41e1fb15672a964b77350cbc9c7feaef411a896adaa272f60065e0a +size 3376132 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47b7f44f1b2f6abaee10c218ea705434b120a15a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:866d2e72bd665cc86b42706f71fe9d8c8c6a89954e282dcd1a9695a0bc4eac1b +size 3917857 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7c59375e2558e8f483d20b6414b68c640d8a5d1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b9bd737bc9f559719974ed3293dd3b9778c36a62ba7c62e239949633ee6dc0 +size 1008353 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..08670d95807e44e10386ee5f989de05fb5a78073 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c7d668b27594cfb0e6f0d3c7e1092ecf1f3002878a127e27e84699744be6c70 +size 1939761 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abbbccb1d027229406cf834f916eb2d395f30724 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf4f62dfcb386a46e2d5ff5e69af30b806a4f187b1da8155065f5d788192f64 +size 2401708 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0bc0add35154859e36fcc438329a2f93eac3c563 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c56a775b92a205e2087cf07966a55e69d3db86d9f1a8317dc409a23e3475889c +size 2863464 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20f211e81b492ae40b325683f1a0e347c8f234a3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f20588fa12f61febaa471345094625a2d796d658c98bab5cda6e90443d5c251 +size 3326208 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b81533da8f7097ea1756598f151e95277b17493e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62ef1d929d9651784b0d64476baf3acff7c8ab234850c7a7eb1d35666051b202 +size 1161402 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02cc4d8e6a3692f51dae79a6e440318d6ddd35a6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3737da7224265c2db5b7bfa5c03a1719c6f94ad2d45312502d70995200e9386f +size 1663799 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78cf4077e65a2244f147a3c280aa17fb0e779aa8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dedd9732a4c1429eb3d64316af1f9f1e9f0bff5c3c3d870bd74d1f3da0b2321a +size 2165956 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76704c512abf51e060c7e60b2a3a14b4fe8db5ab --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76e31c39ea03fcdff65952b7008b45e6cfaa5cc5d65b2a03020dbdc97065aa54 +size 2665051 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3236288f59b90e3b965f1b9633966d8ad8444a75 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e9204ae3baec5d904e573ec3e754a9e30adb335c7dc8b43c4e58da7ad41f9e0 +size 3164129 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6968e749fc80a0a5336e7fafa8a977bc1511c7a8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fb6905b0e7e64250dce4e4286d8dc03ebc5f318d0699cf0ed7d2b4dc22c717a +size 3664304 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a8e2f36fa81d55b972679aa417ffff0586b2920 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942c7f330b8db29d0d181bf08897fbf0b940366d7b540452e7c237dd801dc45a +size 1027466 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..999f66d0c1ec60e329d6401f30d5fa5dd7d78520 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5875bc22fc1de7b77ea1278efb0fd3bb9a715cc2a1b5dcf922625b8f906f45f0 +size 1499040 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fb300b44c30d32d5e2218b3bced03bea9e158cf --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dba2b030c15c96d28be7b2648706fe3854236a85b34eb9502f1bed03f0b1389 +size 1970704 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40a5f566d0b36eadf52419b0cd5ff6caed207ebe --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bee7ae8d0a96da9275309947d189505be4c1061d60cbc5b5f3d6573dfc18adb +size 2438735 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8e1cd98af728b63fd2b4123b760d1bbd189d312 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4530391e0079bdad0151c25007ae3843a04d91c1cc81a64e156f94f7883b505d +size 2906595 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4239ceaa8466872d8a018a63a131102583a0f11b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ba6a258de6da1ec02f47aa47aa1cf93898deb268d7634c4dd6a0966eeebd67 +size 3375354 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..80e8db17f8d8e0928d8609f7ed566dfd2373922e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd8f4406780b300592a7ce85d3c3fe5474ff0c17cdb67c6ef60cd6d776324b5d +size 1169161 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15d18f957704c72105c40e92fcbbd5ac7f275b38 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:907d583f34eb9adb9524608d960e0f7b5b66bbedb238b7b6d833c2ce01a4cf6a +size 1698346 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8724d03064c9fdb97cf131a401bc39505c5005e3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50b115ea83a2d4d8990fcb49042e4075c8a1302c497e345aece76ad5eae3219 +size 2217257 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d79b243d9f0f9fb04d20a22801b2bbc9b11a953f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3da0f08e9aa35a2341b76a0b066848d03f2d1d9917c055c86caac359a2a75f8 +size 2730547 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1d6f1d06b94c1ec790fe6bb197f38b7359a0f35c --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b4bd061bd685ac086fbf98f1ad4b80f90720d6c2d2a7c6bca5664cc5d61e38 +size 3247974 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5836ca2085580084d59bbb2fe635cb4e9eac8d2 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3eb32b02e38692b677ec1ca400e198fa127e2bcce2060776f85d161a34e7dc3 +size 3776424 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54e966ffc2120e0c4f9d0dab40bfeb94c91c38af --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958e739cb798f19dc0e8f1e397bde8560d1c607f79dce1194ff96cb4d04bcef8 +size 1421233 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c263bfaa6d69256b81b4ffa70b9badc64bc452e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9374e4a047683ade05b57d254568697ba1219f901a3b5c1b363378da932fa7f6 +size 2061833 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ddb4229e79e791b9e5d9086e12c8f46256c1944 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5d7abfcdd82d3c5e58a7dc5be59e97e299548928f0ba6bd0bb3c9e8f5f7ef4 +size 2689625 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..209ddf8fa9a016cb329a4a03df7c68c4231c7718 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd8fd6e4318c91cc7cddfe908f380969a29ad220f47ea289594c3d7e5ab8a322 +size 3312105 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e4324437bfa8a0fa3d17f7c9c5e94ac7d1f6515 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5201f37442e2ea3a3b1a0ed71f723b90df61f7ec62d397d817c315841fe10360 +size 3938755 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4c2217e81c9e61966afaf0fbec100426e8ec149 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dbad3f3d4ffbbfa7bcca6678ce1a751af5ac249031553bd8de4072e0c81716e +size 4576492 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7277150182168c08bd8d8e45bf6a7593657c0f34 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e58197396cc60bcaf30e420f7af7c3f9ae726f3b02cbf2bc2fd8bfd4b74ffa +size 3560523 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..955a8b7a9dfc02db39a28c96e88460f37cf4d9dd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c684b2b87f974e81854e8f70010d072457005a982971c17a54e16b96db024e70 +size 2264411 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..785f37b71a9c2fadd91327066aee07a4b1c93648 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7103a5fdb4d0836aa4a6b874130fb227e04990c228fffb76dd324293abc055c +size 2792113 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd8bce102fd1f627baaacd89364e16855e8343f0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cedf602dee9d212a5c8e12df8845870384b6ed08a69a91c15cfbef33173268 +size 3324097 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf6999d2b8632a8a3c036e3d813562176c245aaf --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c6c7907d8c793314f65d9503dfae520f08ac6eb19c586662fd5bb669217c5c +size 3867064 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b981e94d8d45f9747912643b2434a21aa35c8da8 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ba336c969faeb86a670535ca53c28e5e43ef808558b162d2524ed32d069c8a +size 1371045 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d6350c3c037eadbad97ea6a2698f83d3e3e9f20 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe709edafc14e80bc973713053c9e14337943a2d4e6748a077f646830a2a33c +size 1959145 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27b73a8167c741dd9ecd2ef26451459e15e4a6ab --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db97a1b3e9ff8813584499b42e35d03eb2ece158051560441b62b7f2f6ec0db +size 2536918 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2a91cfde3ac8c444aae20dd2ea2ec01f3d8b9ca --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:694e0c518272b921dffc30a339530b67cc0b524afbde1437e79ae978d65621ed +size 3109751 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d4366e1f0088a043e102be871d676885a5a7d48 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df6966fc0c8ef345bfcd2b0725ef19c178b50be9ff24c78a61111be5ac028877 +size 3686908 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f348d7c3edb4b87a77d00ae79e2817c3564685a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a852057ef9a5b0435acdfdda76b0e04c5d0d712c114dc728312c7d74882e5bd2 +size 4275065 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e672a6deb5f4a00fe6bad31af5682a7e84f0b624 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8bd614267f006f86e1587f5fc67b7eea511af4cd049215af730c6b02b12e1e2 +size 3629175 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae395061f28ee5dca331bb974892ac3dff1bfe9d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1960ddd46f80ecb9b1c1b5716d05cecb45062c08f90f94586bec4f99ac204bbd +size 1760729 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e9d79bf3a852ec099b6012b854d77a7b1709e74b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c8bebb52db008029962afae366be29b9d69651c8bf3147d438b991cae8cbca0 +size 2301647 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e40a021027d4120e6deea01fe91d99ec8f100cf4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b02cad1d5beb801aa8a3e0dfec6a65c698fbf6fdc2d4996b34cc55e0a96dee +size 2836576 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0eed35ac995988218e7878707e2557ca42d6baeb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c10aac8a3ca7d9f7b8159b817120639f57aa68aec4702bd63f31360b10f5a3 +size 3375868 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47cdf8a373e5a51718d65a46d3dc3807a98d7650 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705905c93e1af18f3523508b67e0c25975e53b677fe2d3a25bddfa2cb77715bc +size 3926049 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7eb9b89aa549ab1640ab63f4ad4c21885a5682b2 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435df93c83514284a00c898cae9d09f1aa0bc0112f2a4b395d9751636dc5739e +size 2433554 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9caf7b07b868f29176ffd15b5dd84a148a9848ab --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:22f10032a8b180b5ba720635617ea06dc584ae6e88928ba3e8de780919b55c80 +size 1670632 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2bc1a74f24ba1be2a7f23962a4692dc3e86e2cc --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db009a7242ed2bd3f77f03b3dc9b7a407dcdfcb3aca2ced28f1b54535289956 +size 2120084 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59778bddca242227ccfa0fbb9f8ef98c7802a238 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c3d1fb8352430c4705da7506537c1d906efef7a1ed7f087d93475efa10431c +size 2577615 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5dd05c429a8acc2f6ed86fe4ec209233d9e705f8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e7cf4f0de476bbe35f923c3860819a419f457191669e6aedd43b79286671c7d +size 3027517 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4905455b69cc9eb7fd1ed0d57ea886c3e3ffcb60 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d41da98630881fc9d037ec09121d2554c125957be4f8c69e127c32e10cbd907 +size 3479076 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb29a231c834343fe252c80b0b90c5eed54ad3ad --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1cbbf6fa8864b91c3c1eacf60077110b820859787b5d0ab07115c465723c22 +size 2916970 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf37df3a8b33831e3f2df039198bd1f266d339b1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c845931002c79e5dbff53b11ed14a4a14c9e1f0472bdaaf439bc045a874acd +size 1961123 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..245ca0ff42c235859fd065dabfc5c49ed7c6b6ad --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07588351e88d85897dc19e0f4738b0b210ffc5f44545198c0206c01aa41cfc7f +size 2457391 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c4cd0fd70cfd79892e959397abb7539f750d741 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72414d61d57fe6519162284a7f268ee1762075b17467bbfea9976f797303c947 +size 2963044 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02fd6d0bd9a31eb0fa3813aa4ca73fd7c7649861 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:156bf10048d6f1981faf2b9ba3ea9f5ce9e974f82a68c63ba2086b376c92925a +size 3460542 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..187356bb1896b1bb88312c075296b33754abb4fc --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d466ded6728e9349ed5b509b98bd4dfd91d89a5f1d756a5b70a1f0ab5cb77718 +size 3959877 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53962c57f12b667c94d0661d0ca98bca2a80e2bf --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c0dd84d732c8382e00f235c721b0194809571e230d0b352acfde5e3d4dd87f +size 3011312 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e6357afab2ec37c1e5ededa9514bde03f6983a3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38d717b6adf91487b78704f230dda0f28bc91c0ab4cb79a95219b77b9eb3bd09 +size 2032307 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eadf515946736a19213ecb005a1c2e6fa32e26ce --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421d2ab537e6b3ad54a2cea320402acb0bc3553854adbbf7bb268a0150e7287f +size 2553599 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..3fafde4e03aae95312310c222f754c2a5be1fd62 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e30f0269cf02cbd2a3d672a7566637a8dd2755102db046ccbd798c29d584f6c +size 3083882 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b71cc50b9f99fe563ff6ffa2ee85c71f59597c1f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0582666dd9da6696307fb6f0a92693c2b97fefb21e976f2500c600807d611b12 +size 3606105 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b22dcb238b71ac4254519eddbbd6f82f01491f35 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e31cd3f535516d29882b41e587ede3ed0300a7348c5b2c0bd374c6e748b91c +size 4129946 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb4b7d21cf95bddf114c8ea8588de09d0dd6f02b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7f454f13585ea0917e7c91dffdc9e17bd48a8f885da7cadef5f4af9ba4a38e +size 2405428 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a674e39e4fd35bc8cf30fcd16bf44467dcdd4d1c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d9dc0e2404eda9756ae20a917cb5d80ae89ff4dcffadf8894fc5addcdd79e7c +size 1638992 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20576920b1790bb655ba2872ad46d37093a1a00a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69031899154ae5404eb808b6104e3ac9e2f99a376ac6777d38002ac74f29fc1d +size 2070864 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66778b17556ddf7f84ad5f59b41ebefc43c79ae7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:ba2d8f6e8cd9f3b6f4850893e048260b2512b0a15d3d5980bf4dec8af34188c1 +size 2510815 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..755ee42afbeb9f7acb86ca46559800cc2585b852 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025a690f58606bf4e549ba28e95e3107bb2263fc4f14eba648c1b73ee6d023e8 +size 2943137 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97bdc54ca6307c34f8b5a80a940d1b26c26d09e5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39bf47a459a4b2686b950d9e29a13722e8fea7426a4905b7468b14b0460df41 +size 3377116 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..863c19a2e74641f94064ebb0e1eaf1425dbfa459 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef9df2f67b632eebfa982d1ad32ec4f708e0c3f550336900f6f693404a3eb14e +size 2374962 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d25906fce23d1938c2ed8936dece3a76170817e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a74dc722ef1483412c595c9021aa4b456313b7c716784fa759d7263edde92319 +size 1557386 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7591fcd1990a1519a69197d43ef672c53ff48ef4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9afbd103930ad1f63cf117bdc56fa349b2d392bcc4652cf1ce507e62642b8f1 +size 1921842 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ed24ff6aadd62a17918acab37348e3a34472a0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86191f591a4b64b94abc1cf5ab8d819965c4a416ae33f2022284b9141c9fdd3 +size 2295208 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a86bcc856457bc016cb29b2b46e171313856b9a --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1314fec51c58fa38f0daf5c523f25c403059bee34e61089bf23aecf2b3d026 +size 2660418 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d736c4a677432ea55f3d25f4e104243fbdff1b18 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1512864dd790c7bc5532bd3cd7610d73099067ac8c6bc6bc47db658d0534fa1 +size 3027249 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b368e76f4dac813a7c7658f4dd65d2c309d8befe --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a674aff2b2c7cbb0b62b30f442e226f9e813ce388f8a9a62fe09f331dfda0a +size 4702482 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9fddc00e30ffead8dfcf8a4c11cfc135edbae23 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29b86f656ae0e4221366af1fdb4ac55c0984e651a9b4887c9b2cf975811ae7e5 +size 6348328 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..155b8e426c8fa408ac5e2da3f44d56e2b94a842f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a3fcfdc9539c12228cb6419f4f8bb9ec4968ff5a08b9f58fbefaf88ad1f363 +size 4008946 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e043b3343dc7b8a5ed0cc51d3c2f95ced9610a28 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7595852c03c3063f604560cb7cb589b3f181ce40c32f1975bf5684b40f5efa99 +size 4831612 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c1e5a454919baa5923a94764e0f6ff7b49a8991 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f60183163dab0671410eeefea613fdabeee0fffe996ab6147585d9a5cff5d70 +size 5662356 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13a8724d085a475491282e736e67d4625f8c241a --- 
/dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e67e2b97a542c17a75c10cf062b1fce9e19d18fb1362e19529883b9e43d29f +size 6494729 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58de4442be8487d4101370b632c785f46dc22f13 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3780adce6dde99d2b313dd99c451d1a04ee2a354221ce1d457e3d8891828deab +size 5493498 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0828fb5b75b537db837f2ecfa7dc16d71604234b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eb02a8c4cc4a7e0ed2427a80d09c0f4c4e3171c8f7847951c52ca3da1ddb8bd +size 7299300 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ef3ab58e6ff9e2e4144aca4a8d294a8db794ed7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e6514a0d28564ca404f420a1a806b97231e0f78b5bbcde8aed41f37967649c +size 4566799 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..422e00bc6ca040bb668bad7acb3d7af4d181a6a6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02f268294106cb53c4d7b1eeb27da8551b04004dee5ca62c37954ee60a14b5a7 +size 5469417 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fd2a6dfc0532f61b6ff84a5418fc2db764dafa9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1932c8e9da5de9ffc4610c38044787a9e3ff07324eb4a8702611e67894d3e81 +size 6380887 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5572d1004cd38620aacb2d31e87dd8e55bdc9f5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244a310fdb7080f9e4476bd3234534cf78109e53acc87af4a845116d959eec73 +size 7293106 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..a05ede4707d2d1715cfe12211ad9ef8ce69dc93c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ea8bf9cebbba21d9d557dabdcba78fae171551954cd89dadbb48544d95a428 +size 5685514 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f483449f918150af5c55a60c000a641794277274 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2730dbc55bbb3b77576c7b9ffcdbae9730e92d416bea60030bebb917ca1e00a +size 7590308 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce1eff2bf0026bb3b6af0d4c0cc395fd59bca2b7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:744d349348382854d4ff1306420d20f266bb3d9fd5c9c4869fc9cd0ddddfdcd7 +size 4762295 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f26ad92968a91678fb708ed83626fb51c1619968 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5733e87b878fa7721e518b1b48f39d494c59a3d0812c533cb53efcd0e4d471 +size 5715433 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf18e600426952a82435d664d1720d824ffb4d33 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c3284f6b8543b1995828370dc307cbae5239a46ca094bafa4644bd8504ade9b +size 6676905 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f1d8fa11a5f146c71b42ffa7d3064b2401769f6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972a588a899cdc8be1640dc705d83237f276ec5e35d75eaad4d4b5357f0fcebc +size 7639026 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74ebdafcbb242adf1d843b7c6ecc31166c224e60 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b5e4dc704c30b2766820be18a299bdb81ce5ac31410b5dfbe9f654818caa5b +size 4645464 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5117f6ab768c399dbb450985493c845c203cad67 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8579d7cd08727034b66a11a74977e846fc0aac0866a27e3ff37676a86119f2 +size 6220042 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b808ecf0fa0dda5bce2100e08ca6b8e90fb48da8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e05b357aef033bc013c5e57d928331dbe2619702ed223b3c2e25cc899515b81 +size 3909160 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea708c8864caa8be3a1aa043dfd1c95fa104b541 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf8f11f207684f22ca91ca0affb8b00820e184aa9e086cc49623d3d07d2021be +size 4696184 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbe29af1e88b43b6428b4a5eec9aabd41f638559 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d0a26409693e080a8edfaa9aa8ae2b9b616dacab2dcd56b3f997f7d14382f9 +size 5491290 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52afd3e77931ab4b8ad50b7ce91bb9c00a4c3bf5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b3302c372a32baa19e0fca5251df4530598d2a02672b221c14752fded4582ea +size 6288023 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..25f506783f90393607e55bf28469cdd3c6ed1933 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f64f38d18e280e00b9e81a260527d77d89e51cb3f80511f777959689fa83035 +size 4394408 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e19f503cc9af18c4e57d30ed8cb0fb346dab8777 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8edabfd6b3511aa8d50ec7eb90abdbedda99c8a2209cce8200c62de422d032f0 +size 5663304 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d281e166ecde1e97af2952047d0dcc7d705d5f3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a3be64a32907e325153ac66a86257ba9b44572d7beae15a7fe81cef2371a393 +size 3480397 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e079714dfc74bd46d4620706f3dd3e8a88d08656 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee4155f72e919ab26e70bebf1e6be60832c424c828f137152881014e5362f48 +size 4115085 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd21a4049a915ec0e8649cc94b5657978d025402 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7832818f588903690cb89fd8ecde359d8ffea3242ebdd66428ce3b766c99a852 +size 4759125 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50e5a1ce776ffef9a711d2ab109cb02ba7e3ae55 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c676ede5ca2a8e59867e92e63ae558a77cf30ba18bfe233b7b5760b4f13d3fe1 +size 5402915 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8620cca8da91fadb11d93132b1a3a80146f9f93e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ce2346afb8240789ff2797d43b70c90a17a7354733c0588077dea7776fc6f3d +size 7283048 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cf642c9e1c54af15042699a442a6fd7a72bfc1e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9789b1f9c985d374fbebb97af16f8dcc11b5680607ab3aa66312eec6729a600f +size 11313452 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82b27b5a207635d0c36df2ee2bf23cb2b7bbc105 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0592e5d13172622409ccd17997e36083dfd041b1954fcc211a0253102f70e4 +size 7693866 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5d2c021e7c44412e58e903e7ef7b0ad15329d4a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab76fa055817d54e6be97b54a73bff934ccae02717e9b89046840d5c845cf21 +size 9728059 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b2f69386cfba7e4363abba78550454afcf4fea6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e908893cf345895f8934e7b6d9833456d5b09d3c380a6f13325ff535cf5e14 +size 11768778 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f6a9a89f511a3f850a0abfe60aba075bc51e460 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a1f040c1694ac2980ceac399b8c2720e0213a12c3d1ea9290c664a43b0f019 +size 13790362 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d478c3ca8ceb66db756da854281d9e37ccf5c1f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28d633585a3bb4d3cf215f7307a0842ae8fd1235f41605281ad7bcfb291e250 +size 7969314 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..406ddf60340dffb12b61e52228d4bccab3c2306b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483504944c355aa7a61ce90b4f56a1cafa56bf5dfb1b9f725b8d2f7dfe96af88 +size 12335078 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61659f42bd39ec647e273968f0a5b07e4795f154 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35db404fed697e0df84bfe9febeb954ebdf23fb9e6f281303e84b1173fe08255 +size 8371596 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..022f421b26af1e0ba447ed1405c6ceda66f8b7a8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d8b74a0f67618ee7ed529df311d63f35001feef95c63097797b8db218cda84 +size 10572938 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_4.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1abf91cd2112eb01befef233c5a31f6197c2d153 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19c048aba0d1e39aa769d5ab88f26306f950803acfb64405967b1c1117ab189 +size 12780824 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4691913f6d6e283f5678cdb6b431c248f4b2f6b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04564e062c9332ae08417d28c5b1e6da1b08c0217fc771aba98d716834b100f +size 14969660 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ffe0c860c24feb38ad14bbca7dbc6e05e65cca17 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bf66a7aef20ac1f91ba2080b94b343f46a90f0ab42d1f894d1caba261944b9a +size 8083312 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36ea4ec7027e3414ee58649f18c74925762c25e0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed19b5a4552345f04bd7ed602562b31dcfe143eaa2827bd44dac404d3db1ad0 +size 12521946 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a316121edbd28f19f7c60a0dccc32ccc6235bb9e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a4756c261dc082e499ffb70e2995549e699971bc0489c26cceee95b34c46e0b +size 8501111 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..85f7470236b3f94133653d02ace5082ad85e40ee --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10a834f6a59acccbfa751ba451e29414db71c3f50950cd1eba1d8a181a777bee +size 10738472 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8649005eaf1b56c700a1d031dfd6823a01ad6cca --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d89a31f6288bbd0bc69f5d1dd313bd54ddfb98efa95ec1efa2f168aa055ecbb +size 12982359 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..8fa5e1e6d3866e73aae51af1afc84c23831d1190 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45cb87f570baad4c1707cdc364d8deb7c98cea00ec7a9dc32a5260e0eeea6f76 +size 15207160 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a3c6bdd96bdcda3c6230ab9f23b5add29c4f58d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a7c334f964b5deabdeca687448206b209cb76e2ea3b5372d1b7927059c75a7 +size 7327560 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7816a57263be8df7c90d06b9e8e1d0ead40bcd6b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a4e7c57ae7c07578dfab59bef399f9befb71f4e22378fde35b45b29365689dd +size 11374138 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8d7eb6263430b7a2676e2e69c7b13f887d7b095 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90bf0df053d60cf0fa67db4bbc972de35fda726e8f23e40ce6e485f773094715 +size 7732337 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..35b895c8a6bf50608e6aa734112ba4169afbd151 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2de1a1e2b14fa943e0981b23505c92085654ecda017f5e2c31920468ceccc63f +size 9774664 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2dbe494624f0a2864c586589a4348e4b75735a00 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b6c893a7ab5e0097567d4081a67d01fe7143982ac478e837675c82c5f869c12 +size 11823524 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f6cfba9b233005ac4b51aea5072b29b6f647a5b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ed32cd2396d8f038381510282cf5c1288b6d27bfa30ef6a8b9fbaedfc9d408 +size 13853343 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab0dadad85982b9b7784f4168a76a7ad9b724932 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de36a40b1c7fd851757b84fc59f09813e937d96db0865249da31bd989f9b10a3 +size 7724310 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e87af0f769a77a16e8b44b716674a7cf85b5c163 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07da7b93499edf461bd5cbf2385594528a01a3ccd30173bce760dc55f6bdbde +size 11981540 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..823adee7e241dcdd17798929dca9209ee32afd0b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd15b819938a1feaca3c17d77aa4950b918ad29b580613004f89c5bc6b75ef0a +size 8142130 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1d6816de1ffec66a09e9c98f209ffb8904d6643 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:941612e1535ebdfaf66d33b2329d714bdbb00bdbebc19eb849d35331484271a7 +size 10290057 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a75c1e29f76926c62fe4883b151909f8f4856eb1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4818eb6bc6965aadf4b48414ec589d3526319f1eb49516953f82d47493cdbbf +size 12444560 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf2f766d963b2551d0429c17a5466c037533baf2 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5885f1ccd3d97e055dd9b19561dc56c735646b34684a58fb178dfc700108add +size 14580003 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb04ab15f8f5a6e1fdf85f4692ef8f0b8704cd2f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89d2b999fd5225bc0b6de27e3017fa0a44c9458902751614e6cbc32fc281309 +size 165444 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..049ac455294d14b7693f077727ad39783a599e5d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:3a003fb524f0d84337ee24c3c5329a166ce1fbcd1a5d74b95d03873f1ceb2fab +size 77959 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f46aad683a4365a5e47e1fa1e7b1ab488cbb7a98 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a02f123b5da5e7817cb2dcb746d2fde4915a1cb113fb0f634166bc4d4fb9b5 +size 99570 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee5348120639762c2ce1c70be65f61f3125c9182 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea11c715e0038b8e61aa2d5e9316c13fe220e4d8072bdac7e9b742dd6085b68 +size 120726 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89a83ab172dfb6388b5d1550271e495a2fe1276b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:467b61f6b38647fe1a779372d61a9c6e9d1036574b7722301c0c73db10b351d3 +size 142849 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd9bcf1b191512a33c22ac426f1e6a88ff5197fb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d39eb69b71ec3bf1d953a861d93022383345342075187182e01572f48e5a49 +size 163672 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56db4477c95418dab562c04405958dfa63b643b9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc2ef01426db5e4a0718bc2416e9a628ce14893f479959e5d51daef5156a396 +size 198699 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea9198ddb64a94f00862c64f797ac8799ba8b93e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30431eacb0ef3d9ec89acfe90bbd1f2d2345d9b378b65830e4413b4c18311efc +size 120813 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..142b083633738c6d5a6142d0ebe4fc04ad06eb21 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e03337d547749a40282860bf25009c7b93452a8796b038a6333a50294213890 +size 147025 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..675a99bae16c2e119856ae71c524e5c8e200d4e8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62290a433c332e61eb5ca39caea0463ff3c258ba37ccbab5b00dddf5cf501fe4 +size 174224 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b64d6a1797628c1ce66061ca6481d7c3f533a9c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eff8435d906e3a5821a9a7b9fc965848530ee928d76d1908fd06c1f4037fe0c +size 200134 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..725f6b9f65135ac0b220d291ef93de3b13cf448a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b553026bac559c62ad53bd679324fada1042f7afc316e6e9c94fa327b35ae7c6 +size 112602 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c23e440ea9d4923eb9680d8d71c64c4636cd8336 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004ddf5aa41ad6f20e8b660cb68e465cf46ce2cc086de3846103602ee7e404d2 +size 102075 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..530aabcc2d9f16142a81ca08a16bf047a039b31a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cefefb0d45b0a7aa632e1c46ed6a5e8fb512e3d1421b1235190999ec29147e0b +size 123884 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_4.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..b2a75281b7cd5c48cba8458a81b4bead97f15c1f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:613a7a4a6026d99fb605512c026f18bdca89170619977f293e1feac88df3a603 +size 146668 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..477305c75597829345bc2ff2388e5620aedc4d9c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a64ea6bdacb8b8ba2842e1dcd78ad68ffd2f59689eb94068518688ef1f14f746 +size 168146 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..450dee69dcccc63f47e89be88825dfbe2be4a97e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69eb071a66b1825959aec28bdd799ce811af9fc6afff48be93f60c6d4a7b8ec8 +size 191889 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a6148b49184ec6f5212a360b1ce73be3a185e01a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94affe70441fd82a0cde310ee0c6498cc5fd5ec547ed161934fb51be8c518efc +size 89652 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b9333a7742bc47e486260361d7139c2829d932c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4731cc67adba43b6a7eb66893ed9f6258c8b1bfd7625f63c725ca63bae6133bc +size 114067 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01773ad1f2f0e3340eff7f767edb587daec301d5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a011e2ce2af13edc22e5d0909dbb420ca230931e139d3fa2472612f0106b8fa +size 138072 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99ebe0693350e46467f7ba86a45d352733fc6d64 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1300eca7880da790244fdf8d108fa06d002bd1a82698298330e3654dd0adc709 +size 163058 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e51a9103191a507ddf4ecb8a049a66d34693e6e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78681ebd670104e42fd7724276843237b0472b606f162afce16158e9032d3ea +size 186730 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..984dc074b533ac29e20f8382e9243f15a943b8f9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d807ab8127cc3f4fac9c2eb254f9278454ca96d85f8c93bc4e2875850274b0 +size 171927 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ee9254182238ccabbbd4c8085b91f66a90c9e5e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:945a4316cfd00fef2b9b9b1ee442d2910cd8d6f7d417daec7865cb0128d28a1a +size 81118 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3fc3df30fcaff0c73a8653b05c82ee190df4009 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e7aa7c411cdbb9d547b64dd73ee675ad4f93c5c50ec06d83adefdbc1055afb +size 103749 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d024a2ed7b963afcd224db0a7336f129fb794098 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24576cd486a8ba7ab0d899dcbada3a20bf6d6681167a6737bcb9e884b97dd45f +size 125902 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d5f02013ce5a6fcd4c5bfa082e1dc0fa0cbfa20c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b7a84937d36b8a180fe2799bdc415556d345a61cd0fdf3dfdb61046784e198 +size 149024 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b05ebe165f1d4adb9ac497358d4f21d8b5515bed --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b77be56b716b7541a04389445000245fcb432f4116174faa22e607d9f7b880c +size 170847 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1977dbac2ee9686968283bca202da3280807f9d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1484fa8a1e1bcb1d89658285fbcb4f16d9872f9d54f9597a5cf071cc82b4ff +size 92256 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d6c6b52f7ce775ad8a016538cc8325987a11c37 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45059fa6e506691200b4dc87fa28359c50c24b500026cc946a9344161864cf8f +size 111543 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..554950d92c13be6a9bbf3641cf6e78f9d04dfcf0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8e25cfabadc5663c81c9bd15d49ce83300bccbf096e45d7614092346228e75 +size 132071 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfa1ae9d25c648aac381e54e1094d48bc61149da --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0a4f4654ad9760647d87dfe7308e3e482031c0dbede55b138eb5edb9f4d5b5 +size 152226 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f679ef5b428450da83403211f29e7b246a6bfde --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed6881a1a5be2edd9beb470b81e23329cb8894f2bde78542f05ecfa7d693cbf +size 172058 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2fb42fe59458fb52d3663102e668aad7d27570e0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52af24828837bea4b6cf77d0f3214f866ed3d2908fa73c4f0a22a6700d6ac0c1 +size 192022 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4aaf40f68c84962b863a8cec7d3ae5fc605fa370 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a53f3556282e2853ed64feb7c9a3b3cc5f39ef245d297053026d36fe7ebe482 +size 87860 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..a63d1a65c17488570380acb709e3699f610a1ce0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ac8033a4a21a941a219463c0b6a1e117eecad53a93cb98936a83a54fb580e8 +size 105063 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5541f7546175d086742b39eb60124e69883cc2a5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afaf4b143b8bac775d5c8aaecdc203e734de190ff48829ed0c746750258b1ed2 +size 123437 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..228fa8ecd2f33ed8fc11b59b71582684db9d4710 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b6e78df24dd037bfe1d498485070ceeca4634280d59da158a306c2da6c6faf +size 141440 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c60e40a903120da44651454446821b7629aae4e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0548a165f6d99bbcb1176ddc5a5920437368cf51f91d2ae3b04eb90485a521c +size 159105 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..041f6a591d1a29c628b926738fc8566c8d9778c0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:054ac77da961cea44dc7d6e280ea15ca5ee03bde11b4937ea8298248fef2a2f4 +size 176936 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4d55485afc7b3d569c4074e0230b641c1a3347b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b29d0cc5fb6946ac9f423284dacc4b8cc69ec49cdda28da7759f5fc57cde16 +size 85156 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2aaa3bdf227e3bcfbb5ac4f26e11d655b0c88bc9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd04376c37877807c86fe7380880c2d3615bcf4e56b352ac55bbac960049c09 +size 101254 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc7ef9ac758ecc5c6ca6858c735be0e3a0237cc7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2a353bf1c012737eb07e9ea4ac5acf397f7ac0040f5a6499ad4dfe194ecca562 +size 118485 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..815b90c4c4b42c6855b013cc76ac077223a4c2b8 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbcff502e0aa070bc7b2887bb74d390c60810b456896168a9bba5b045e210a79 +size 135411 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..712e75aee05543f9037a90d955d267369409de69 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116b29a0a867ebdd3a93a176e4e2456e1e0963b8c93dcb123db85af50e4e37db +size 151970 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef99f3ddf5b750f7fcca50ab9e2c709823bed471 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd533038b35d38a45b8be55aad50442bc19fb39b32365f6516a6e6bec22c5214 +size 168753 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d976c7c50e44a9b1f7bade29474a9d39533d302f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f6defc1bcd958039bb55fa04f3071bfa6429c626a3f0f98a669e21239bf8ab +size 96765 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b1f7ab0c212c9cc05c6067e4660b6a9713198ee --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124e654a523148859d6409ae3a9babaf7dca66d17cfd3230fe65fb8fee545b35 +size 118266 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb2ac90dd64e3e134653a41617aee2fb46fb703e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c495b4b95f3de5c7358cd01a7a04d35d0bdaa1c0b11ab9bd112c211398943fa +size 140946 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..733cf6d573fcde62229f6e363a8de6f9a1933c4d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:512d7b75b2c03f7a7b4818fa6e6dbefecea6f0b72aa2f2db03778bfc3e308bd4 +size 163240 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e10fc605fc8c79f296934fdfc2e46b2354dce13f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71164c5cadb7bdc71c18e65fcb8e4889efbfcf530d143367925db1714f3c1c08 +size 185228 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d21429a7c667fc0b6bfb92b3b928503feeb1060 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:641d34cc350d7cb1a960ab37262946f7bd666bec9b19134aca3cb2cdf4283e1b +size 207337 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..454cfc4db73fb4248c4d47792c110b03e27515cc --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef954658a02c2f895b6d7dced65a63f5886538ee56d78c9cadd37719939dc19 +size 95767 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3911c289089d36cbef7b9361977fffdafbda8fb7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf5a1af14253523767b1239daf929a4a80588685d4b301ad3aa3d9bbb86cc2dd +size 115649 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b7cbee00401f77f99ff777ce0a8614b5a340083 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbca1d01de8adb205a78ed99441f1548ba00a86d3749388d715818fc52dcba95 +size 136679 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26462499f1f15ddbc21cd27e427e77d83539a86e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04c9e1291b7666e7604474a49c5260a706ee1df34f2c7e7728c5d259e56bc344 +size 157400 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae7232160cffb73fe3e08da17072691db4f56e20 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796d9c3ce4b671e0522526e2dccecc6270a4b38f341db94f37c6820b40e2f014 
+size 177750 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b4e7ae91ec568833a49e0d81f400fe0a1f6d92e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4818c83e82205fa9ba8060d9dc4e18c379a47e3f62bf6449dd04d7ac7ef1a19a +size 198349 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d1fd39de47dd430b8438e1988330c3d145552b0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:525d60d86806b2a45063a6750434b3df52ecc98923974b87be40b474e05df9cc +size 6843616 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc8b44bbe56c80d8d804b2e8a6e5a1c9142e976e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad6a0ca80b23d3b62f4e106dd0e650f0298a78b473d3a9d4997f70990fd9820 +size 4484303 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6683525bff8c806804bf3a8279091650619296ba --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d824de6ee8d2e9d0b0ec3c27e520dd8e0305ef86be3066760d0123746d26a6d +size 5417430 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dea139dbd8061c36425850792a8ec714e3680ac1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa4ab73bd81cb9710c31281d2275624b4a0456dc090b6b39828b7cdc494e302 +size 6341033 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..901ac2e097b5d4a16af18da586dfb16ce97b9751 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176fca9fa2c98113e90f8f39096cdc455d9b49f97ded8e464c76bd56637abeb6 +size 7273113 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcdf98d8df0eb52591f3af12bde6dbdd496cbc22 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5e564916c02767cc7e8d5a8eeb3c370a8ae1b87a5c3412dae3da573c25148171 +size 8215543 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f18c43f44e0936de23d85e53a838fc856681cdc --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8aa4b595cf1279955eca8813f6ad34266a0afb7cc64e8b85480dde4eaaff3d5 +size 6934658 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bcf6ca7cb5af8effc3b71becc9236653cfebb51c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb9b3627e6d285f7a15bee04537b13fa5ba2c3dbc11e7c3149bc8b4df49eda1 +size 4354923 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..813c200e7dbfec2916611b59a71d554e61998a8c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c568a4b3efd6c181ff10f52da16cc3e6c16e45ff880511d5e27a386d161c4e59 +size 5271302 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ac84b8027c9bb9a85a8d5e660f4796696fa5e4f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:878cc3029aa071986c2e97a74523f08a52819258d0c5859fe391d6c54f35252a +size 6184304 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ea97de590ef12337959101148e65212584fb81f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38355879e8100de925101fabf13e992ffe024d47137984495e4023a27f6cdff5 +size 7094555 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94ed69b01c5c34a7cf75510142869e551f02d15e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ef22200759be25e3e58e5c790da4e28730e9ddd0d329abdce594775c88d56c7 +size 8011072 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2048029f81a8e1cd8e051d93ad499739a2049b10 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:454c4377cfea5d94a4ed06e2e1a411a8c30c0dca60c7d64be2d6fc62ca0b4935 +size 7192554 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3cf09ee5ab54d385d9811071df9eaee76a767a7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ece9be2e6440cfe9f3e27631dd3d1c490583f72b142a02017caef0e2903277a +size 4754086 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50ee669456f9c0c39480f1e6ac8cbc62ea55da7b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd441423dee946a7a9b2abf1526c8edda5f6f198ad95bb0d7c18f0ebb76a4195 +size 5915091 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c5b510ba2d13c938bc8b03c6150c2e6695b8485 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a211c16c0472f98dce08837b14cc8751352b68c736d98af84fe11893a55b7d8 +size 7034228 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..925c528e5884ea03b99e6ba8be9b9da0af573b15 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3fa5ddf1711286bac59b4288b95c3f4f446d8c27f654e8f911df414e19ed38c +size 8118818 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7dca4e1a0346589d2ae495f3bf63e6afa8c841fc --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52bc882e74f84c0901404f00193c16264538a5d20cba42a4208232cd5105949f +size 9195325 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2349ba80216b45b6e98fa9ba5640209679145e7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3ee0b4336ad0bac05ec6960ee6cee555a590001a593eab88a249e559fe851b +size 8599630 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5ee6743fa391313166455397b14d1ab42751866 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75a3f75da0ebdc2f7df1cbdeb23af59b81cacf066f14a4b03deabe97b672afc +size 10800238 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65ce3128a314d9d9ead93598121ef53ab26615df --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db4aedd2b92e82785048eb95c4fdfc609da1d79c7d2c693480b601b910b0f021 +size 6563483 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05f460d30b2aae4a65980749bdc23dc6608f76d2 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719e1cba0447a0edec06dd812535d29c16f59a9289b50fea9da7f191f4e5300c +size 7635348 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ef0dd1caad3902ecfeef913eaa9739e0cc05ef4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:359693a7383ae0f8fdddbb87e0dfcc2c862839b1c7be30815799e49703b2189f +size 8710910 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3aac4b311dcc400b6bfd13b89d5dcf9c2a16e841 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1607363da8c92c64ff1175fc059aa9da75298741de2ebda14944e50c6bff47b +size 9770652 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb7b8fc1bf3609806f5d52d323202eeed69bd554 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:506d2d490e7db6377f204d02e3655ac946d0ee218836282a1754fb71bf8ece6c +size 6438598 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c8865788a5bad82785e65a1968ecd701b506988 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fd50b78555c2ac4a3977a4819ffcc9b23da0ada2161bc919a12a2f7feedbe90 +size 3908048 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e13436d06f9e5ee1a10d86399cf4e80f2973c2fd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb8e78d0711a128298dee573682e8c5ed7844bc18bf32778bc7927611e61f90c +size 4703130 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc6d3d506cb4bfa5c5da8571e7760898e3700f9e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ffb9398b7326df656a1c2bbd1f7f50a745c4cec846b3cd3a843c56224fc4dde +size 5497320 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df9bc5446a17e035c7db7ae16c3cc4b23ebdcdc7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fecf84f08c0dc426b228ceed42bc15ca0c591d5ffe2177a055aaadbf242750e5 +size 6288367 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ced9612eb1da520943ebcce7c2b21bf52353f79 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7279d86d739e5fa7384a2152fb2c7ccc3f82e038f8b23aaaf1415551c3e46d +size 7085891 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42f28abb9131fe4f419a844952cf3f0ea145d499 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d57db5c4518271691c5cd4e1f76b4b70fb94354e7a2f189b5799c373e0a7889d +size 5771462 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..8ceb57ff8dc36ccde4ab4cfa982127091fca610c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0d861eef4a05d1fe62922f90fbad2a18ff69121f610c5455e22c3ebdb52fdcf +size 10408214 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2568ab3d8edf846b77a96423675596f0b908d01 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc22f837fb3e2a156d0d74842169dbbf8fa91465e39719cb640b5e91d05d8e3b +size 7496721 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74bb6ddfe02103f41dd42b95a8447a3adc41502d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f87a0b33007830c80e59879f9a93bc3bdf4720cf53ab7776b31f62aacb1b96 +size 9779292 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa8ba96a66ef3f3f3c7895ff891e1ef8520434db --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6626de28de4b4d9ef4079f789ca9c0f5baf04a66d61b45a46b5831957ed25e73 +size 11822502 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f65db67343299fc2826a43d92ab00e3c914c844f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9d4ebf9951b4e878c6ca6436b9c740f20402a42ed850a62fd627564d9ef9bb +size 14076814 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3525080d4b5dea011207aaa66bf0449b0f82a8f6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa06082e10621a0c49d0046660e5e7a5efb001449a4dfe46e612f800e747653 +size 5491708 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bfb70d222ed7c5787202488508db5f760fc60be --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:da803d0d096904ac8371bcff5ca3ab0fe20487d986856c9bb2488edba4762076 +size 10107624 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9519288f8f1ee408a6a095cbb045474a6ee155ad --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:146e00b817b94015dc7921ecfaab90e3caac6c51663fbfad15ab69b29fd9f8d1 +size 7314904 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68e1cd6da2ff1e4afef894f78c354d738ba99b66 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b7c3293c330ab29ad0b529e941039f0ac6305eeeb3607293055c38a67cadbe9 +size 9570062 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f26a6070034fb8c27814331a587b82603e5cab0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3736b4e0092a29dbe9d38727556d449812b29b15f112bf2a006e3ab85dea5ba +size 11582723 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7bbf8043bcaad4fa46ff449526882685bf19a56 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9509d211eff7d87ce6d887c30e919f48a27d877127ae6a30a1475a7eccfe7d3 +size 13794802 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a131da8d8459e9567bea66b518449ed943cf8268 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca07f7b6b2c019de08caaa3bc867809c9c652f8381887b13b2a5ab6a6caf7e3 +size 5605888 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ad6390522fb0f4588989ab026400f81324199af --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2619679ab8cef01764da3ba51387f764090d1897028816098a85975512be08fd +size 10209050 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c660c46b366ca2fe598c832190509382e888847 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5920fa78163479611185d546f34277deb7be202815a51120e4fb7a56ccac64b7 +size 7377137 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d3391d008ddbbb430552a1c80d1e6365c73e9eb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83d88e0f66e5754bbc90c7150ece4acf27891d5d1404008c83caacb50edb393a +size 9645748 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e56c4d703775ddec9506c4653c54f2b00267bec7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:257e598c3340f1e8a673c15e79b895f05baed4195a3a92091b89b655244bcdb4 +size 11671908 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1980ea7ef9483c65fbd78fbc6455642c53d8fdd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf9145c04a039900d22f23a348b110770f43523ac02177b87cfb1cd97b1f132 +size 13897588 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14dbafa6058ee2b4cffb448ac191197cc3562666 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e3a4539cb0728a6825874e15dcb3c151e608a16a9690fcb3898bd7fb92ff892 +size 5618752 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98526686a77ea5a8001b2627d108f8b89377bc3f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabfbdcc9d83e7f11d5001353c2e4d5996cd188e518b79c57da1ab030134f13b +size 10193532 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2afdcdc9355261fc0bc3461abf60fb8a7d527087 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91fd2281de7b9331564779b8635ec1a6572582800171ed9ff33b9dcb28655753 +size 7365787 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8228d7e9486eb1328b707418bae7bb8563e35fd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:08932b275dfdca5b635085ed18958b6c97c747707e4adca6f4deb509c0058427 +size 9632786 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7e46c7a786f4f779645f01ee59f9ebd48dc6af7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c20a9acf704000c4d5db0ee31315719ed4290e4abfb65afa147855655291352 +size 11657943 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec52c62cd6fd6aae2770168ac628acc4fb3f9483 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bd47a1722e28e77b717fe7f68f8cbb390c254baf06954f92ea5f6b9870419cc +size 13882311 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26b7a9ce7d7e8670102bbc0a59a66fe84290233e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095f1c1f7bd1d1faaf7ab6612f4bb389581fb8f1f7607fc02feff5008cc5e19c +size 5720572 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76975863d48570a946131d13a6ec775605d1c289 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:557fbb0e4dcc57dae07506c54a84a4f803c5bd6302cd11ab40c33230f40a072d +size 10320164 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f3681ee488ef710aae48322517a8729b3967045 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e816f45c9263194cb8e2d35e65ddec814a81a815e223d8e0a4f853ca4280f1 +size 7449169 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43719424de7fc90e58300524359d516cb16c034a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67a3161da9675c1780c6d68699f3dbbe010852ec5c5ab0b03f924a151729b465 +size 9729570 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..40dad4b46ef2a0551ccb6a53fb8b42d3ffe68670 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fb4df55a2b38c120387e337123c989d70a5e932b925acc9f65c4ea8694a850 +size 11770474 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7db4c8a37e2186390f36d94ecf95d847658e9394 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08856f68d876dc011ec1af013f6e41ab1172ba603cac983034d65611852c9b6a +size 14018615 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87ff35e6e24eb8e97a9f9b7cc330c34609fe1610 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a9db70aaadd436780edccf49edc12b1f60adc6a292e8a6c29edf458fb7f2bd +size 7561542 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44fba864f91bb8ea04e4edf9b55c8b8497071e66 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f981e328fbf7cc86371742810b11c384ff9daa111dc91c9e34d839db5d1af4ea +size 7356742 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a42833572f00cff9520737471d02fb6e06dc9893 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d41efa1f07c99a8efb47bc62f9169d670e95c61f2b41fbf760e919c9684b338 +size 3946372 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01c074c6250fc6609a61289060e973ad85b7f2e9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa8d4f55ea7a2d343aa061bd52c7eb658437df564cc37ae53667bdcb8b64cfb2 +size 4390098 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe7f63c477d17b3754bb008003e1eec58013c10b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8cb9cd62df0f93c5dfcf951a831cb8335538047ee3c5e165b02301aebf67114 +size 14860605 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_5.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c3cfe49ea3e055995aa9839869ead06d40427f3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069749c33d46762debc023b826c9ddec7659ed21ecbeed29ae1f34913ddad534 +size 11273752 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec40bfef1556553c4da11121b3358e627aee9f80 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98bf46c583d44a9f7958c7ff2e270cbc4a57462d418dc6e668f9ca075348d81c +size 4231870 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c482c3df9decd346f5fc00abccee2b98a02e3ee --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97bf18b2f511bd1d461a4f115da409b0aa8d1d71c9f6a47fc844359cf871a4c1 +size 2900909 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..279b29876adca4546da80462bd77ee343d0c23ee --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a20fae1ed4688dee0234330862679974a0f27c6c588a0723ee01968debf2b3 +size 3683911 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..152b4809e81160681984cd2df469952725ddd6a0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549eaf3ed0721f968897271c961e40903c8a4821416b553ae54416678c163327 +size 4458372 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01b907b18d7727ab485dcb990e615d2f90306610 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae2965978116976fbb48bcbccba67bb42425dfe154f02e29b10ea00bdb7cb0ed +size 5254516 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..901dc94171d263250011826e49eacd09abee6b12 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25fec0d00e03173a12303ecec8d5ec2a4860246cccc0929daf30b85df5f1e76c +size 6040677 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59edb93441738f33938684995670f6b2cb126cd0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a104b1cc87d6c7d3bbecaabf4909837d1bbc3188a919cc728e8124c4cdf84360 +size 9206514 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ff81cc328a1d9595781701dfd4c51473b68511d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f47d43c871fd1dab504102de74efb5b372bde4480e47ba7bb2cf539d577d251 +size 6542250 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bde16c4cc33597150223b745a546c4b02b110002 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66b6d65a35a8a233f1708041c907ea7e184fd988d2a44ce3951cf4fdab98c894 +size 3506702 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ae28df59f3e6f017eba737052d33a7fde5bb801 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc330145409516cff4a00d2930a1428bb6e79f7678d7d335dedc73d63c2fcb9e +size 3767512 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..346862a86cb4fcaf8580506f34563e9ac2180248 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87662a03edf748fc7f4e77bbc7eabe63f11af9299d1bedb38605fa8962df764a +size 12100404 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46973b6f7da3acc0f2185fae455c15468b17a9d7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a76944aa9a97c8e44246decfa3c5e9a0331d82cd59923b2a9996cd4e0278c0f +size 8645280 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ffd3f10928cab7d98e18c74cf3b623421f7f58f1 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9b08e21f910ade4a52962fe5f0232304d75bf3814424c47ef2ae60ee853e81 +size 3728258 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7969bc24e3f915174375e07d02c38a3b9f8da32d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c37cd0f80a26ede7638f6daf4a8ee172d9c99d41cf85d6287cebef221db2a32d +size 2557203 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f302c59ede761023908c36278aa0a25fa9b0a34d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeff4eef830a56c3f8714a55fdbebea256baf3245393d4926e27dc7c304c93f8 +size 3248305 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..672b17438a8680bfdc9c11edab4b53c9ee350b55 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2e199bf46f4a114ed94f4a1acf96593140e89222382ec3b13e2a1810ff2b0e +size 3930866 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..123cacf5b7c8d1373391fbbe86dfa87bae942cfb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3fef276191a4a6193e0d37060bed7dc0f5304ed9ff3288b40e94b73e6e9bd3d +size 4635110 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d5aa9f4aa49b1991d98055459fb7bef2a0f881e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf3b2e67c1dac4786e60c8ce8fb597ac33dd8d96b607f5a86e11f6c50f7a0928 +size 5329371 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec7b072128ec084f8f889ae590fd957a4baf6f3a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78076a1f3b261c90bbfa83b3f09c84018ecdb29ebc22f2910d085e1bcfdc3c5e +size 4514262 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_1.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aae8b5e70fa93c55d0f40ed1c0f0852db92fc4d1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d806dc477d3139fef82789a74aa711d9de9d2b00e66ea21b5049c43ea7ceaf6b +size 3017543 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e84e7427774b3b92d762cbfd195d8aaf6646443b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b63e1d6d07dead3a99657a5c46ea9518924f04fc02a527530b9ac109cd976b6 +size 3774493 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..023eb680e3f222ebabc25c4e39b03940c1da8bf1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae440c8e07961c561b1c8eb6629707e34d52a638166e3beb0e5f1a0bccc0a5f4 +size 4518647 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..294f52d519c7f552ca7295ee1c437d74b46afeea --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe826e2399626b4083df8cd969aea8b337914e7c9e1460a398ed3f77f1f8313 +size 5293366 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66cf1191080557ffe3713cbf10365e1678755fa4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e493ab794ed93d474dff0e44315bf17874e9c3978482b5b2af1cfb82e5cb3360 +size 6054926 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c26dbe49cead8f6ed31361a2312fb57215dc5881 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:923026770835dc1d410f249ce8093aebce2be7b38a3d5aee753224ff6cf8cd22 +size 1280086 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef4a284facc3c890b82b10ec0a05b15d41328315 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0b7db5e0bb4e894c2390cf46ccd9feb245e6a995a40c91bb5fe92a3b2fcf397f +size 755191 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..25e5d85bca2e541e0f85a01e8944db20d80d2762 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f88d64936946513b61d0cdf7ecb40a8c7abba0ea476cc9b9aebd1cad6a7bb89 +size 871424 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43c751888928c2a1c854beb7abb2677b2c8b419a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3396832e4dbf29317649e49ab585e541fdc9213169ae74cde45af9fa4bcc14d +size 985856 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7efc867e2cf2821b6d78b02dd0f9c4fc913a7d8e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea8997df3325e8b5d37ae27075a72f29f3a377a96bc0f89bcf4aaa50ab2a4da +size 1098545 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c28209d04ee35721ff330f140f20f97c90592d47 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c144acd1e241db479c6bd02e53bf1e08285612442e7b0294585770947b628ffa +size 1213745 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efb8955623ddebcb2ef7ea495763198be61d3613 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca434eb1121cf89ca9273daddce7b1c77210f14179bad62c4431c011b47c0c7 +size 2365258 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63f6923ce35c11b8d32cfa1a10b518eb05b9d01b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f0c0ed4f46dd230e3737bccbaea1c06ef7ff4e3a7ebb27bfa6a6487beff5b8 +size 1779400 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e659485357470ec461fd9713ab19e49efde99f3 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de6314febb6a75f273ea9f22a79a69ab30c000ad40afac66354572730f7173d +size 2388551 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73db8df09ed05413152326791f35ab54f55bb230 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b418c54326c3391342ddef406f47370ef94e86856a2f61bf0b7d8c72700dd3 +size 2973522 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72bc4dc2cbea9dd68f46636a0312ef2261ce8fab --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c52d8be6cee20b0a6f233bbebb3ec5c25fd7deba294ecd1542aab8a4c38391f6 +size 3555875 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f20e8d468c1cee809ffd58a3ee760847252bf462 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706ab86bd7f06fae469847b80e842805defd36eeb7e90d51cb6d8a597d1ba816 +size 4144709 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2f1713d3a3c9d4a5d34bdfd959c484f8056a012 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1553e57daafb6ebf991b0bc932bf4733745f3aa6fd79c86beaccf568119df8 +size 2657868 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a13912772d1c5e6bc8271dc482d9b51b69467199 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1041551c3c64673dd3b6117aedfe67fe035ff520b3c2687c4149f98aac0692 +size 1524482 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..24634030febf9b141323a08101750d85bfa77fb6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34aa4d4575c2a32969afa44236bb0cbab1b2fd7b70afa102a23173df11fd6c0d +size 1720645 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7872cb31f797e39b63039dd63a4f4f1df85d4e28 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f61709d530ddbb35655a9b812adfd120384ab835aaae9df8c51c9c6ade72655a +size 1915647 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..659754d78635d2154e54636924dc3f7ec26f7a20 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:259bafd6663887c15fc91e69a625693c7e49346ed5a4757baa10a2923096c270 +size 2107401 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c85397ab4e044888d2fb7cb167f36fb3c643aae5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c852d5b737f30e127e704d8f867da1dae255734c44ff023332e0bc1aab9459c0 +size 2301043 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6bd997ac8a28dd4a3a5e425bca7b5444d0b819cd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6fc3353a80fc228df5341e02bfbd9f8494086b22d6dddde30047f5fbfd12a43 +size 3870542 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f5aef6e4d693e7ae64e3ad0de77e36e90fa4352 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f8cae1f97bf8c433fbd437ab42957ff75e2b09fe2a26028814fa740da28fd3e +size 5272840 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3835b4bf635970569ceacd30e2ade03cabf8ebdf --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4192d25880482112ede36237734109bedd563118553ea07ac1a0a9a97c3fc2 +size 3349675 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98d164101dbf39f858cadba94469d60ff4b3e6ad --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:91d08f1f20c3d93c7d36d0bda1fe2063b7b4662d886f31e9e0e16bc26c8dbd32 +size 4039219 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a13114564b36c3d9d83bf4232430681fd1e9d53 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95c74a23f9cf8a47de821ad9afe57b62ff2bc49e87405d569b4ab991177bcc1 +size 4724541 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3112f3e55510b7334082d0ea63990c82da7e3a1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7279799ff987ab4c879952f6674c7fa9e502cf7c5f5c999b23c085cbb9ba7c75 +size 5415734 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47fdfdf025c4033024763eb0d13528422a954822 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e38dec2b8e0d35a38b0f96908951a092c63c2991554d11856aa4f50f296352 +size 3740448 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e649d7db015e2c712d7a77dfb3ec310b2f05c8da --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac13ba8cf2b5c3982847a68065038c757f0a74e619de4557dea1f121401e3749 +size 5090982 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9427e440613eda004dbfc1242e903ceb279bfa7c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fedd5b140eb78cae30dd467a2dd254ea4bcdda40fbc8d4dc0385783f14adfab6 +size 3232705 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f30ad2dde2425f12cb0a084ebf2f2b5fd7078a6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:090657625d8e2b47046e061859b7a99354a158ba2f68d45ad9a708601eca72f2 +size 3896303 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f2280d1e893c03a9f02038ed6f66db6dc9298f9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3456416b35d70fa41ff54411fedd440197e390aa2db1ec68f266fde968b06d23 +size 4555629 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c0281dd82f0637a3b3f14901f2d4605a581870d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23998cde7035ab4cb7d5b89dbb62e6ecf0c5ac81e5a06ad2ab781a29d80ee693 +size 5220768 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5735a4b4951c2a524c771652d8707d65e3973b69 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c5916f7cc840d952c2cc2e78f05abbcbf4ae5adee7a66726c92e786bd776dce +size 4427944 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b99f0940b1d47a65cc006375a1c8e440278bd80 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64f7dc2ab9785ce1a304a25962e7966ec38195a176ece49c19ef6444623072de +size 2974204 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2420ca750186faab0fc19116c92f12c767e45e9f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0130e77ed716698f367aff4d1b0834c67b30c8b8ea700fa51165c6e7ee71a6c4 +size 3726410 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..826c0bf9e074c37c62695015740d29a7d542b231 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa6564f3fe2a9b8119fd4dfb3d89a62545f0f1af9383ebdcce631738776cf00 +size 4478085 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67852f59f3638c5f82bec6fc0b20b8a19ae17807 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:368d9fc23e751056ececaf405f1f7dafae6454b599a8cfb32562cb356e7cd526 +size 5229927 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..2b6395339966747640a728ee082cb2733445d258 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873102cb191eee7a9bc09d676cf06bcbfd4c22ac40ca2ddd0e1c2f7d48614090 +size 5980561 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efc335c03db7bc36243c9bc2d878b564ac449803 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:738ba96cda282d041edf2ad419bdc8d91c63938e91faded70ccfe54517c9242f +size 4721762 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e0c5044a3f6bf7f05b63addad8c0eb4639cd71a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d3896db9b6f1d89c94e9de6907e2878cce2d0bbd456181c86ded8b3d6766989 +size 3193449 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8d5e570ce7c0d0bd6c8efea0406d2705aadf464 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:913c8bd3c26439d3814b3cc21eae5aac20e5ed2b4d1d89cbebe57cd99fbf8507 +size 4016772 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91d1c955f15afaafa9c66e65d14bc3c907480f88 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd302c8f6b06c6a0f68db824e427c0d2f34cc6115fc1c31dc6490c23e9dc8a7 +size 4839282 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96b7785596be24d6bf7b3cdb1afb1a38b936d74a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41ecce01e3b14bedfd4a50475370a7115311186f2624990d9ba5a76b37d36d87 +size 5662250 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e33224607859d648d290cc97ac9989b8adfbe85f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:38386b1f192982743f86848c40cfb2cb3bdb3395e36f5f7011966bf2879d59d4 +size 6483890 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a40167e6651999b7612f22efdf2725e37ba0ac0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f02925d7332b1389c0789aefcedc2cd218cdce4e51c9ff56263f163ad953dcb +size 3759030 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0162f7a899adc264533fdd0a9f7f8ca63852b509 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d41eff1af7e94efdef2a4d42d5adfa20d1245a6e432d0838bf04fc0eaa86726 +size 2435460 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1bd17f2cfa1f189f4df056c70d9c22065a8f485b --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9412a5ec8ae901389a2e22f7923ec14e494a454ffbe8bd0c9a1aa765e8747e9 +size 2985981 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00a13fb8d81260516d4c8b8610ffbd1dff4b9977 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a09adbc90419af6e954207b26b0002f057aa0462711e57bd72bc35914e90d0de +size 3534586 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f6a871496e1cc38b2f76174160e5c42dfb87a6a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f142529805162f6ebb2e96a0f5c777067de3645345a458cfd5f38a82619b103 +size 4082938 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c89753b79af6ce130ee0aa3089e8556fa603b76 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2fea37f6a8e01c73d789a3ce385437282eab2116f7637916d5cb6da1a28f0d4 +size 4629996 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..28b5e2eb3f2a389e0217e293c01b210c21908d90 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:959f3280fbccbcaa9333952ccae63bf4b50eb05fe02cb4e772a684bcd39a605a +size 4743022 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..248dd250234249430221f87c0736374ae9a6aca7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a15b6f0b0f9db78a154db867ba49f836ba8aa44548fc34cec550770c968570fd +size 3210507 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da2e8e0454119328a09573a19aee5e2d20657a72 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab802bea34dbc184a8a92f6a3e566f1d507c5d31c6b2c60ed3eeea5310a9457e +size 4041116 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed8741090eea0f3aec106fd63bf9a6225a296cc0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90193f94aca1cc312e3d7f8d49d95f2a758ea5ba3bed956470e2692a4d403283 +size 4871324 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b46d9ebb375f19fa9708e9f7d1bf26352b7f6627 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b873091e46213249e00276fa1c8a5a265a39a144f0c37e28ba7df489119ef05 +size 5701637 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb7574d6189fd78bdaef28364e48806a051a3f22 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519a14bd01102e3fce2cc32d4497911f35f7a75a9744923a2bb3831b42691bcd +size 6530404 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f48e6b0ee24bbb828421c9cbce83e07283bee55f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:44becb5e9415a6696f1dd8e99c79df1161ec4d1f5086835a5ec06ba778592bc9 +size 4681606 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00289f0821979f19d5a47981cacde63d15bbb22a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54e73ebda81174155dd0de35e41a809e693f5d4fe97153f212f9af25bf1da3a +size 3150751 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b12b6f04ca68d560699f03570e9f9281b5a00e72 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca5418d0acaafd81510523867feb305f1d9ae4ddf2fa064c61b0f012aadfb68 +size 3955391 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3878be3beb3cb8d9ecc9ff5b91aac8eb92363ef6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36413df00a46cbe62f919b8fe7acd70c90271b4b19d84e9d2d967aebea8e547 +size 4760538 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a77ee390cc9367aed59e58be297154a06a55300 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f123979dada5d1187f012c0459d0b20f73f5db153de5e40f55ef5f97cf9d5ea3 +size 5567094 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8de1581555ea4a3202123b958616d035e9dc9bea --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244a1611bc881ecc2b33dfe4f73038d8388369df1391246ca13c5dd1ebde584d +size 6371705 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99d47913351fbf6a57611a646780fd9f7fa01e81 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a6e23aaa9ceeae143b621323f4e1c7fdac18b39ec8cc66cec0dc790ff22e46b +size 250482 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fff598249a250cb97101a5b779e9f932d43675e3 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a69755bf1c978e17bbbb1c317708e29bd80984ad69f794cc2c8dd5704049fa81 +size 351271 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b932ad2fafd0b631705537002ffbd15fd09a5251 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc75952cc1978d6e6f05d8ac17b39de2284a1b227d8edc45535533e4ee9aaeb +size 449590 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e5b4a8a994541a0fd3aad9e032f7b257e3f9d56 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f53c65321a749e0ee8c8b63e86b9e69fbaef81e17f9630baf678fd8992f557 +size 552955 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bc28e1050afc1d2c2d864be365ad60a85c75461 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df947bd385ae8c871582bddb84502447cf4a251b91fba472cb51db432c2a43d5 +size 651577 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..700369a553306e75b1684bf684e8de6b9d636913 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b217efed6726a0bd832d70b5a7d2d2415e1c89e202103cd33039148cc7ca44cc +size 747492 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..815985cc7f2d40048253bb775ddf3127c6ef8b69 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f15332b3d3a3473e20508975ef011b7a77d29287a29f6f20aa949cfb72fb9d +size 292937 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8bc7ca3d2127459f0f50b1040d702c2fdd71f931 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ceecfcc81e7aa797a328bd9d082973c135f26a0df87131048f02ab36001ffbd +size 535369 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8bc5d63190e3d5bd053786179f202bae21a8bea --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6afeadfebad8dc71e2818b6b2abb28ec02490525e7a0f154316140b83d99182b +size 660370 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e9c07eda44961d872c3d515bb028a8b3b86c022 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b38a1a7daacfd7140719494b58d368ad27b2070be2061eb2a41379881b465a +size 780592 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a33b71ddc66190718ed9f5e4779973c29f07d30d --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053b27186457ec9fba649033a2ae63113a008cfe1b6f496f117553cb5f4f4f3c +size 898101 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f69cc4c4c78b43c2067526c125d72ed6680cdaa0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c7c425e546989b5fc17e07ad77f20784ae637ef77a1b8db807269037e4a931f +size 258304 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8cc26d17d55b7f015ed4c1832cdb6b99408b58c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a228b8087e9beb968a47a480366c004997a67744329b539eaf9d834b411ae4df +size 363043 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5136206570478566250b85e3f7a4b7a6a3400fda --- 
/dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:986643de2ce571b002cea7e2967d31764f4901bd280a5cb62d2efe8703ddbe54 +size 465272 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a719761caf17b11f059820ecb676fb556f82aad1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4be4c048f7e32085ee41a37884d57f80f91e17d42cbfa3e0ec1955ba3484f70 +size 572508 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60a922280f1956b2f820e9f1f72469d20147c1a4 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e778c012004a8249a088c8997c598c0ccd432c18a4893699bb32f673da5d3d0e +size 674997 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5813e84aca2e8697de517b30dada98a74df46203 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37e1bf879742176d00672899e4ace6a7d8d3fd85064acca9c185f767ba43afa6 +size 774773 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4888139b7ed04bc2be563f390115fd019c0bd8cd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4194614095abe8f41706264af8cd063268149df1a22de160a731fb9f39eb519c +size 261270 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fc630b7e0e501aec5141aaa59cbe4d2b447bc97 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e892bf6611c89748094bcd012b77b6f22896e0658909f4d8c7a1fd498d55bf97 +size 367751 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33a95f0dcd5b414cda7026004869557ff1e68c73 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d247fad43f3f056188e52372f4d0237c0c9f80171a6ad44bfea214bc408b76e +size 471933 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_3.jsonl 
b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62cf6aecfa65fc5479a03ad8c75c38a004152a4e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15f5b8f91126b077b4f3e0ccde7da4947e9b270bd5adbf08e74ae77d5b5ceeb1 +size 581111 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..555250fc4918fb362268a6cdd394701702c68bb6 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e35e3ad8af168ccb50af2d0db23545327cfe27d6c24cf69feb87999a73a06dca +size 685546 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a79e29455c5c9852b69bcf4168aba4b52ff363a1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b319e79597c43db5a70686e7e4188e7cf67fd3d2f53abd352c94d29b91d7e1ce +size 787249 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99e49948d25d727029117f4058280f4f2a5a3f10 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff5ba03b86f6e01dbb430b07a0bf6dcfba65900cab316fd96f58e28acee55408 +size 262205 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86901c7e8d17e657f481d5f9f37a7eab48a4fa88 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fae73e8a62a0b5cba04e700b5c0ddbfb54ced245ee2d5f721e04d8ce1f55772 +size 474706 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..460f98785f1bce70225c558f817355bdb545449e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480fb2537556ac09cf507b10e59a207e7bd69e9e5c0a2a209008423a69ed9335 +size 584727 diff 
--git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3a8402ecee08e584f89f59ef5058dc7df47b70a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669fb016607eef350aa31c201bf8ab64b1e5d2352116c7eca494ff4638dba1a3 +size 689994 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..403136c7dd145ef47a92eb061c15add0943799c9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b641e796604be5eada6ace76e821efd8ff4c0830594e1b49e8e5aff9f1e1d0 +size 792534 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4acb178fe5dae6fdba69498384c3c0e41ffbe2e --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49b9b96e799a1dfa4df3d99b1996fa8961b797c871293c153aa14e70ce638162 +size 2078506 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea3a2528f78efedca45f264b937bf9b5dd0e6173 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02697d63c785f8394e6d6a5fac4d3f22eefcb0405126eeac6ff89fef3e2d5090 +size 1300182 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87787313c2a50c9b35cb1f1e5ca27793627e901a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9494ccdfcc70dc05ef4105d0349fd947c9ebe02977c22d1158a343e449e43fc7 +size 1561293 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3050d2576069e20b1b206e4398d8f999d43707bb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:851b54bec336312ab3a222054cc8e254160ba2115ba1dbcd7110513f253c8d15 +size 1822761 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da19f2e4128bdc9c770387580c44f757f62cf2eb --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acea97296c68fc0c685c7d18be70c1e7e8af51596212c829d9552705d11e6a93 +size 2083001 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4daff477b74650f8ef70a773436851ae21ec45c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e19958c2d9845f609232d7bdbd491949618c57e913fdd8ee783a34a31842067 +size 2343699 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c49249bb5be58d61c08c43d18ea3df98d94badf1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be3ad4b4ba7f0d4bac46933baecc9e4fb54b9793fed9d76cb30e5eac451cfbd6 +size 1896222 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb0ecff7249854c61c4197cdd5e71bf0084f1d19 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb7240206be260c422bbadb8b625c8a0e15c055d2b9a0f310faa5ed4b347f4e4 +size 1180447 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d56281cae43b88347e57643024e377cb935f04f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c287558ffc663f63d0841f2f7d1e1e91fa3045ca8e1568f9c08aa83da8eb515e +size 1413042 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f8728dcaec4b69465d16b42cef8d9de328706d9 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4416809626f1a1a0b4b44b53876d34f5f4d2840327dad541c2bf9a21e7133468 +size 1645867 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1342bf3a55b037424e0c27a3fbf5b70e0333280 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:505207b75398039841c9fd93568dfdb9ac0d9a6f0b0c3a9dcadc53e5c32859c6 +size 1877765 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3aadf15e26ded0a39df43dda2652a135bdaa770 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17bdadb7e55b45ce402f3e11407913a4c32c770fd68567412cfad1acf8e023ca +size 2109867 diff --git 
a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3c41ac111152bc7cfe31c8725c98396a818fb56 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e59f4a52c632153b46233d58086337bae185a969327098119d2b3895be44e01 +size 2020102 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0cfdbabb1e37bf281893197a137b042f047ae74 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:622cca420e4240ae75b2880be113c31c1440f3bbf3b74055555c178593142914 +size 1243205 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abf81788e045776660610064c682bd5e5175f57a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e038c17ffd042379c1568e6efd3d01016bfccec6daea490691563bbbecef9c36 +size 1476355 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17f9a87b26f604bd19b0ed4acb523c63572342f0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975b5f827637f5d2ef7054b786bb66e4b42ea238ebab00202f506da0d2c0361f +size 1709943 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..027bdf12623a5d914c2bed8e0cb201e1b14fce58 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec5aeb364c8c78fdb9b2f4f97d1aaad6be88628afd044b1be8068c0d43aada8b +size 1942312 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6528e0320e987611f46536417cbae7be5a0522dc --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed86af5e97e37a4b2c09e07791135867223606d948e8abfa9b159b4028170291 +size 2175171 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..059f4758a9e8422be0b7bfbe882ba8cce436fed8 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2761e62d9c4c8765637fd0cd2ad7220f6d3c0f57c702a93cf6fdc49bf97c9e07 +size 1939224 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe356a444f21a9f8addbe4adf3a41eceaae47af7 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2abd0a91112d4db6c5eeff0b78338afe51ba96d7912c383c8110243657d2678c +size 1205109 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a253cfa595b763b744c12bcc67992e5e7f2505f --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772c6ee7ac3c7cd5e8c77a0d9a0d70b2191891ba8629b7005177d1b2571a3eaf +size 1440862 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2cf1100eb542d7fb95167f882515d2cc17acfc5 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c38a15756d85f110fec9e5943863edc75b62f3d04be10d18f49d1b8fd800304 +size 1677003 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..315ec48eed16a58ba1ddcbd41887773512d062fd --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc8ff79917b4c35994cf7c088722f694acb69c73a6006b196711ec94c8363064 +size 1911890 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f5692f611b8e1f30fe0b9610967311e874dbf55 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:250bee00493996b389d8c6b89213e0c6bce97adb25d0953433a1e57f14a356ca +size 2147266 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_0.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f6b34e51d2aa2527032cae771f6ffe8aebfe0a9a --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dc03553e1bcbba0a56b1d64112a91887bd62d5300ae5237944e9b92d87ec1b8 +size 2033024 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_1.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51c1b1e4b86de9a8364a33380d9ddb05e9423350 --- /dev/null +++ 
b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1147eda2bd9f8fed10109171490b8b22626d6498a0c1714de316b72757f54e97 +size 1257115 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_2.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6404639fc16935572cf69720730384c6ebe77ae0 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:376333fd150478ad1d31453628c6db3801a5046dfbc035c1e3e7c654e878f974 +size 1497928 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_3.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..077ebf85318204ce40122910a78e6df36e94b7d1 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd6e846e5c837f0eb75219e4725eb66f4abbdc1521504f8495604e19d55907f +size 1739162 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_4.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67cc256c4f13e62a250b14298def129383e7304c --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28c6c0041bda3007ed295f25122129db477494807c65d3dc09bacc1509ba41e +size 1979172 diff --git a/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_5.jsonl b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..488225514b247274116b65e85a463ae65f7d2e29 --- /dev/null +++ b/4b284b17bc4/eval/examples.4b284b17bc4_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f06c0711ab8b093dda4d26921b89fec59a3d2b4b2a5c39ae95c9a3440e29bd7 +size 2219570 diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d356d8b8910e1f821c49e61c322bb3a86bcc28d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4040857346605273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04358756352339084 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0759904796250538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019478615830651011 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3009878218567671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046586299284223885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11153940555452811, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002178375350508395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03649587266689502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001284173883599284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14683508450255534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032437032345681857 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05308201459552208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013729761880117914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07180766426825755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001761388989113738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.28987291523705844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045326872175802165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10594741371659425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019835178597520093 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07245769602542276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00184585307065297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2877191217238231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004439207690226351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10637513260264994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00204505600505615 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5d8a9e4740735e2983eedc4dcdcac5cddd876fda --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5179012826475189, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03546328546887922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07590692259956473, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015260502670476222 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3587176031003754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005304902318303979 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11715894355967386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019757967913343107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03541306461486918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009438407351314926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17490815925289047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003575953294927914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.054620931903283015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001228796960295478 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07063353254188104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013315866040551792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33926359580036936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004980489876121937 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10948455596662156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017556314356608658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07145722539365447, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014201666549177136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.338101035904291, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004861723234525834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11027827926137282, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018206662711825689 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..10a25febd638271752d42841e8eaf43a4ec01f92 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5353533242406296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03431413900192352 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07083033588676813, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012393618247087826 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3713979630102006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0052635629154557445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11225032910377221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001729127211467464 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.031078186918670876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007042214847925669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17869319720275487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037152494180892654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04972618028817665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00102253346010308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06478794864331923, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010868099908157105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.342386434900962, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004791381155468985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10286097604129013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001521136220014524 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0666711337015857, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011558366976764654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3495506191555596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004867024005105769 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10565101438548488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016072411783332626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8b5a7dd5035ef59b7e0b596e971f23b558f1fe32 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4473435878442557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.021920475877328035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.060184346315459554, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001087884777571127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3414484174381052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005172668914728109 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09638137782837077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014829064557002423 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.02365049876833611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005803565276567791 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14942490983015533, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003513250894148731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.038282775688304856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008484797038289617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.05398887767175152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009593908114878796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.306677112968288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004574894720404393 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08646072645548598, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013059843771721818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.05643326067698096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0010263660613837862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31950364177604385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0047753784913865055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09025064736984334, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013804266464441872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..61cead9dd3ec07a5744397205ead9a468876a29c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.42391682641977435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.025185202302157747 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.056211864304467056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010231526331645241 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3243551913604374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005025965693402745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09021138584698546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014520692052325743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.021734837271928865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000537789346158442 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1381516300501002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003431494708848335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.035274756528572794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008164665662699456 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.050592287065444816, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008807998394512943 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.29101119493777294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004368680842712586 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08124379060939273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00126253468227921 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.052801000471684074, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009509504143137485 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30284581980574177, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004598745476101752 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.08470874404935472, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001361092950551602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..217dcbffe07508c6753b10103a0866e60e43807f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4676760272424504, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02303026154350977 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.06321141539943463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001223200335826357 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.34518893665726086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005046837181813745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10005721103740961, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016082944292364137 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.027408388290100973, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007495147001664794 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15813941094305575, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034339025470937284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04337652772461485, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009940986270453964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.057943890660243344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011331942696761865 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31252933987652376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.004418336624345426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09146925689558233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014605690131820356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.059802685455499036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011813042691715043 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3231337567814217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004620411779037274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09443075155430865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001535767562485951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cebe30cd362e619aa56f4508c65a8fbed84e7a12 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.06635037756207988, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00222464644174656 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.14012624989204728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002036835675458236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.0649820340050219, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016774667522408912 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.003020102888519982, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00032547132764282983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.006611308173835243, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009951093670627143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.0031079038623638374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00034130095697030574 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.05754089923288528, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018114419198071445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.1332053573384668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018778409471417506 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.057959376823978205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013701772199315872 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.05219309540753949, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001901332766377738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.09949694242629789, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017781434541041129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.04917830665991281, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014301781368119162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.1350370966328972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.027107668307824653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..736f5c74c6d75392fc942b4fc4ae3e907abebb43 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.24515040330368046, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004054348224566517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.42594170823778976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005787897761386721 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.2770237126205559, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004038789167972144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rouge2_precision": 0.09175996157947801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002601492839334607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.1684955936702656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004064838991261767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.10582834218904164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0026426543643199916 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.18555197093693734, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003147342164119049 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.33280053513758845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045573071529510685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.20936546936617006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00297046089780952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.21610700955371467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0036569952801652067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3772047553428585, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005272234747783686 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.24390484665607676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036238123294501058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 2.052485569424793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12102973129799857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aec6d41bc39f93a57da2dc48bfb6e0274e81d767 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 
0.3566584079552788, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005545339920311794 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5400091585387286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005358867844689096 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.3738311083147904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004303874024559728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.1806299226100141, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038767177534562857 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.27810862336642045, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004344946134160203 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.18715015079380715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003209239165332672 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.2774912289584729, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00456955894576886 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.43176983982503986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046883692841029135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.291139847215283, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003491774771956361 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.3096504505654974, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004863905721617155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4751007623048103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004978268403772034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3252458107512585, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037896518417690843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 3.5977911950601484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12923912317137534 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fa0e66ef8b569b07d986027d10fbf89c144eccb1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.3507235841349617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006063184051971503 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5514350155602691, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005182852642362103 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.362382378974461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004512618110179221 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.18695632263623327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004276682318235075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2924425597856253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004413282563913562 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.18921046750479484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0033743653265960907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.2760401929409245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005081600899523718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.44615489211207415, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004731816271399687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.2847450223443878, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037300768332280904 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.3065083273455118, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005390003916850601 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4874589762506608, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0048935064804825285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.31660483791521354, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004003089154824215 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 3.364926224666278, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.15045294250595048 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5dd716d20ae85079f46bd9e82428b43cb2d1f606 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.33093996934970976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005991487675562594 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5449792771957667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005139692154283905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.347569335324534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004544187677600413 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.1780670412943538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00422080166488958 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2914835377835669, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0044150558699339055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.1826637134726461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003379682428755349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.2619538812269815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0050299249467279095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4421088823491754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004705889366270654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.2743549921011115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 
0.0037902551916263946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.2899719423937745, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00530779434079022 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4827220286166995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004857508825913098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.30488899444485074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00406398308359489 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 3.3107172893336636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10227637753147577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..10535bcb97ce2fb9e17dc1605213706f1f091352 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.3193273080501676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005974116499113547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5422038885918751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005190435741567349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.33762053843234585, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004523525986261017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.17378010156605053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004277219617652875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.29377091837736397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004471597574308531 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.1794175550575056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003344405579036441 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.2557225993181413, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005171558754142887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4418326332600047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004753839209134025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.2687968014233197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038430798579204405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.2829548142815575, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005414729183997336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.48224704322769923, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004901420700466331 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.298462025989663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004099301415037842 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 3.3115358177828296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08566639180105731 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a7740e22f3992899beb6a63c45177be0e258a8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.12648828841845464, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02172247828129552 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.04385963495568408, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016141799634558302 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.20910265806376718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00260407682272209 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.05185506267199114, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010693704857640345 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.003657728710184829, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00040744794041285926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.017381693068654103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011150256177358017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.004093823034187833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003237805033460212 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.03973081953475384, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013217691489049088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.20624804575610606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026094299138881873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.048929691951989984, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008927940829166845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.034850412608671714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014877807289763565 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.149019716230595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002039697594288268 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.03823939846444661, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009439507711198298 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..730bf61f6ee34aaa0300dc49b24f4b559c4bb949 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.9717259245306719, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "bleu_stderr": 0.05184305667078613 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.07728525564656695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016345680555964624 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.47912345654951694, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004974585856018974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.1251494842748045, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002228010113423068 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.030217086264151687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008968155132881358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.19718798013757022, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0041103934021723375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.048933388223398, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012848999460203356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.06575830202193088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012184254155840335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.43631558492945605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004551703262770418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.10781056512383318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016643371248226597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.06685866773660962, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00149258308259344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.41789578588003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004754874400535324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.10816019192402117, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020352482624991708 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, 
+ "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3bcab88dd08bd3b2d7cdccfc86a572fc4a3f1970 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.5553302383124015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04732740194035888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.08832852532934997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017907725492439333 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5699592863851318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004714536643229723 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.14300003134712513, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00221162536158628 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.040843244255482615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010916245640850226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29220518033840015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00429630894545052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.0664535256463577, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001390545014519469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.0752494824229621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013331527745553307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.5158723539376079, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004556028773885754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.12321433839865718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016493743934460554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.07693161532405232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001600850652295148 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5035674346293266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004594034425007832 
+ }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.1245303464212252, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020005666890666714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d7529524f05a40d4377da5096c3d13cdc313ae94 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.6393485578889786, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05502650807288692 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.09384501026064822, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020410705936949164 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5594718362988982, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004835901700325949 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.14778204636533104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021535294935283475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.04531974070359654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012592530156795225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2943813172867371, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004307507870651324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.07150806038871549, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013846757650982076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.07902797606364308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015876424977483495 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.49738396026840304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004670214814101292 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rougeL_fmeasure": 0.12572390453967686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016646524639872145 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.08205420153622084, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017980621481149767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.49588185251631867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004634355511167163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.12939899168462599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019528328894905256 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0d4b04cd927485cf62e095745416dfda0e549fdd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.7027172131809374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05464254396711413 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.10010463739366897, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002451958960487023 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5497072816752133, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005022860494341607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.15265161943963146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023054471134340703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.04962506210717931, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001556539762450491 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29783112596150063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0044507361953258365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.0756101747685517, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0015266803509634318 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.08430872709932842, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020229841753327373 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4865738026311325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004864018248102004 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.12964486472277573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018553204638499313 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.08814511726089463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021838335215906522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4895059624812954, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0047832687395796825 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.13454630567628098, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021043161354972286 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9fa8d750b746bb9899c44e93aa3d9445b24c88b2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.6824798233982887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04389086131276499 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.10174735745648503, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024884464079584444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5382688832041609, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004925292282078669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.15505433986152437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, 
+ "rouge1_fmeasure_stderr": 0.002379813501347791 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.05132100913201794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016726572160033631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2917616762862299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004373173732071418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.07734137021121697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001591701564073588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.08461663803530442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020159579534064975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4721845695274402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004823954126413361 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.1300997256412491, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001870157247818775 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.08987573937521823, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002214880051358518 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4806380167892775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004741206948484179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.13698687887451672, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002131522469829751 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0e0dbf88e8b1dabad366acdc0422f9bd9ad1e90b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.02706027614634666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0007199959223496352 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.21871619349540652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003017122387430669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.046262195996473646, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010671021290240828 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.005489404852576963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00037168133237378876 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.04039720237090506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0018373898311850336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.0091292497595853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005586233596109211 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.02627865667490291, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006333002013249642 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.21539356134726143, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0028718466577210507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.04506442218172945, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000951352027550179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.023947674613616042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0006311347984113742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.19592189387116501, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002541067542265314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.040980292597500946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009311721807975657 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.15649947819833196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0164476915271142 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb3119b5efb197f7398cb9d60518ab3cda7dacd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.13566963010336247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001901900628554032 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6825350408773228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003845047297379057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.21539745536997254, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023973358203774717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.05837651126421021, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011430214532306153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.32049306904101077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003907206521752479 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.09345632716516253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001519666475238801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.10694697019167072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014098708843255005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5679246988503801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.003975927481974845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1711923400170551, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017590691181499992 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.11553086397885941, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001655229291094829 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5900036748532257, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0036908035987317585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.1836183182793589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021016203672280464 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 
1.6599294185467146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06211968556668088 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..da20cf94e3f16e7b47c192a168e46184abf30961 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.1287133172816397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016808581165882275 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.700127439476741, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0037713353423981066 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2076581040011104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022583455874752344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.057317013209503706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000967554778182446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3482915132800189, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00417921721931668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.09362913479276391, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014080197389295243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.10016020267379452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011971639013113444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5771110494449467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004193435707875633 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.16305870413789514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016446312009551617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.11082374580358195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_precision_stderr": 0.001491597958210396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6106710174543206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003704970101429629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.17891433463701623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002014498326601367 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.016001208228411, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06460792855397608 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f8ef0a83d71f5b01774fdfa9f8797eb9c5af2d3d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.11266596272845417, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014964276623119837 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6895629319342107, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0039035968599018256 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.18531216357578198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020715765879598448 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.0490979915579544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008487279552315122 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.33860846873242856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004229418238586871 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.08163089196343765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012508375480738728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.08742234681738852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010302549899947866 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "non-explicit-description", + "rougeL_recall": 0.573502299945498, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004398266028999487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1452630444217927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001454000064654399 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.09661481768181748, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013130442610157457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6010245565717771, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003825525802603016 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.15908058820614102, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001825274022822222 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 1.905820614761778, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05980040525962388 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e64a873ef7ead8b933fc222063f19472878d82ed --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.11665256703134419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001621644318040768 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6784675206475869, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004050953554566147 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.18934839167613163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00214187386551334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.05132787287975547, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000890879748268458 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3387961329017755, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004335693857367359 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.0843786630967621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00127055713033487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.08975734228271094, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011007534157180517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5600296137109959, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004483398306149866 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.14739292124783474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015062281210773386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.10024251913081043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014111520702234548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5919031167415147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0038677271600221276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.16288936969969903, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001883263572481922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 1.9502927707808198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.052020548700027196 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f173de632529260c553f9bf275e4de93598edeac --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.13927878202099392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020669175591838764 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.685945936711341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 
0.0039463810264872105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.21838667940415663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002647140626260128 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.06409430762337304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011913214183825302 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.35060843893599064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004265349732243715 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.10182499744322755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001652327275662303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.10609511200918728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014594305881602467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5591004118426898, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004296337590666952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.16835209567162987, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019166110953087441 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.11992638153693579, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018004459291189343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6004050096251264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0038569357942678053 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.18842254040460846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002322513232328268 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.226426427718422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07967413364321198 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..623de536114e332c376540fac28cc0763e19daca --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.022502146837926656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00039190309284914334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.19344726390485775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00254983973899585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.03898895789773015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0006319532341075637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.0023435938414272757, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00010979866573243933 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.02355849492139947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010345098847200228 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.004145988607839967, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0001880250546353047 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.021854072209224234, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0003652930614699315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.18931671006162032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024402267670285537 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.037891534322081845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005877375532899554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.019010396571724965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00031345492195533365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.16545014623836463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002138025338260489 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.03294863446012075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005052256590344441 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.06125988659412747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.006732229818369253 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..99f18e9daf6a24a8d6649d1338654de5fde1506d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.05975795472897865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013335748577968784 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.42130399574332783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.006021456162891838 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.09987693823507093, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019433674289780272 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.01988607904073932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007117135743174414 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.1562830460937769, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004166853394878266 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.03326346351515616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010503739288322067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.05146250597168572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010354352926742175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.3777395630694112, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005232167400098075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.08653005164164256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015090364266964335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.05159147675400835, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011924678247232138 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + 
"rougeLsum_recall": 0.3715339371365095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005597099196018005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.08634306012268002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017368588952515506 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.6395614972095213, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03517585023209597 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9f0e805c59a542c7f4497c402e244f077140159f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.0810153812005263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012511509455719108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5539166378964278, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0047292784942071205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.13518295850423986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001797688945586576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.030941146425207795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006330818483411955 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.24102157803692914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0040746001964615045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.05216469490783778, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000960104338852003 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.06909862720510897, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009250350731313912 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4944830893295965, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "rougeL_recall_stderr": 0.0043942354024226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.11606876587008064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013292695114441179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.07020253091326639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011333955676522578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4865962462473607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044821465897910496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.11717633383150924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001626596491079524 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.121353825084264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037725280779123294 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b48274e1469fbb712a4f1290b4099ed445c2dd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08857774552753468, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013096992843302045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6136927754214567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004518995907883919 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.14844541358150928, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001876503606273822 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.036937522767553584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007034293513284538 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2927059910268213, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004314941715310091 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.06259164725081237, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010634444975241323 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.07355997157054911, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009262242992983204 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5376590137054139, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004237557928355675 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.12422704982040522, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013236189729225278 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.07743844850036462, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011897972880537623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.5449619727275773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004481006887033573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.12987847358588697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017109839553052014 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.4361174109813004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04805971189618587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fd5b8e1b83f934b7140b049fb0fd99fd61e0e456 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.1061802370663775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001463735554331979 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6843250436064637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003920444970220284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + 
"rouge1_fmeasure": 0.17657657718723166, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020678023491144965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.047497391617128334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000803679569171621 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.34713674890385776, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004140778426080258 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.07984816777629551, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011943148273671116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.08171760415769097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009627992963593435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5631982903026473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.003904405962830157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.13715386462196175, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013567926683826159 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.09467046528011347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013521830799601206 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6186687061302965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004013369252904057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.15756804588459467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019178471738246978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.7466592730628139, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06692844403557563 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e40610b93c6ed62672d38bc5f7895d2a2d3afd2e --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.1107893274792466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015002868666159005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.682687883229354, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003949108407059522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.18316360410296145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021020976278752416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.05017272336825218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008414952830314042 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.3508191888121574, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00424171066823648 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.08392662509619025, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012407470721852972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.08485730190536447, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009914986606501136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5574721443850629, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.003989848256957862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.14161914296738579, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013917592431469871 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.09855924666321038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013739812469034743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6138525490419964, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004009808040489546 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.16308062545421093, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019353462436310985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.8970740397337433, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06599212740506186 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..985e0c68b80c9921c4a01cea2e73df11190e82ae --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17950975675556238, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001914110077898584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.3173563571239822, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026609188157218433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.21366320551854953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018302570738398985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03857602767598888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008009614211947492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07079824622444279, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015441980491765989 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.046175339585206684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009130716302254005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12327264643230568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011861060865238978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2279772883295184, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002052566227453621 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1487525060765686, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011699273254385182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16610983563072396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017654810205492105 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.29490158184592447, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024968605272683683 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19795971817373118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016914247514331865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.055966635720875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08686542114749111 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..484ed665cadcfa66aeddcf2e94acf637ddfc3be3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1645016598415173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020439988941701897 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2781090142745825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00283011336228724 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.1917572377223911, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001964109694244107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03451158809220734, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000862899520574188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06057288990931606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015023541556298495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.040168134051973815, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008924396157130113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12025147568699371, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013845922051473774 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21167965788899928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.0022235680991272095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1418918052948833, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001334398810370599 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.15192216506378955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018955299167400376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2574177460768254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002636255512192128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.17718406703037073, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018199571065040082 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.8472449992080195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05972184050829182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8b87c5b40b7b37c00eac8954b87009c0748fc54d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17461621417538908, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002031440517917525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2964362262793112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002726714099481616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.20405321174077107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019056600496191807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03893762449906625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008525122868332632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06838081813859713, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015611393255752124 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04567156337896173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000932894552583276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12669938791067256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013586831428520094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2241091285172054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002190856032138392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14991267136518538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012964821918338744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16135042684996723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018812047542648442 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2751530306927472, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002569597423808091 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.188778720271424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017684778684466625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.1250913371882443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08082994225847336 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76eaf81d9b3d7c191bc75b47e54726c84603e72e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.15230744068697982, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022959707383516507 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.24820187689883588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032867277020760042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 
0.17160204720456523, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022197524833034343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03434424273820165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009243231784645324 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.058823449262171576, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015444564017563487 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.03899411935423918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009187136392638074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.11092147540747452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016240093919999271 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.18694301859386991, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025880219875020465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.12583346122758107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001558801054784864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.14135092515979492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021286008140915364 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.23101489053303867, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003064522846659888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1593908947935989, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002057263273890102 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.198529629762632, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.065300185227075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c548e9cedc40adb08330d6f03d13c1a8f85091f8 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.05083357669029093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019357918297942475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.08244342719669248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028813872714789964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.054987533485356926, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001890886367853537 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.01117210874945532, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005995255967837975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0202786766574289, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00114540984571034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.012617379158558721, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006313655807193013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.038366665167606055, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014542518656553282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.06396990192776628, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022687929818678404 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.04157458484226149, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014009278175922768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.0469155306552092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017939164451876647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.07620716657990878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002680492989639914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.0506618129825057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017421779361228911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.520053083116806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.039623345914897065 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8d9cedd17f1f6aa5a50ba000bc3e86b2677b6f31 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.00861372538113473, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001009916194724285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.01235013958324705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001203646367559825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.008446934607939634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000836413601735138 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.001811642633420617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00027281114466191735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0027137133448325683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003698010920981975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0019124752375349172, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00025472485058213127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.006548532654933444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008249280853205021 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.009381116428004776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009128471950554741 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.00624239280071284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006013075457035342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.00793806088766393, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009498773132576785 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 
0.011369439233893231, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001105032966952515 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.007713721590588463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007579891955786283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 5.517328369410883e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.0386230327964954e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eb2dc6baf0dab1de285146e529f60be4992144a4 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.07582764050130288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014190110762892887 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1277933977634297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0022684792145317057 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.08828349309152168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015115974554693625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.009815067973854086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004155165586017958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.01808910255093881, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008644066184478445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.011691902088528949, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004869755165002716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06629364537173689, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011431141940482985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.11462267518707464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020002236200532 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07791270770462665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012475871998712364 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.0708523421701339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013109444182449496 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.11963990635370186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021050935160865825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.08254932336997271, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00139813861005186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.5075417133761574, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03895909978746295 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..faf4041931c385b11aa4d9cbbc16ae76d0689926 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.12719095564056437, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00177458933509712 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.21244487177301377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002669065247115643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.14743483455043024, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018102937951181683 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.019336079923150524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006341515576063969 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.03414861395030515, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011732733204742757 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.022783938421878452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_fmeasure_stderr": 0.0007214151081414727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.08932246378264151, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011470443498572222 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.15500596932030242, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001969158666967818 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.1046400973633858, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001187021079229482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.11886530728516936, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016395583632656235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.19926746759843972, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002492327836587634 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.13791245377427, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016717888556597065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.2257876930057892, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05098211417674737 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1aef7eb2763fc22f44b6b48568fa624d04584db --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.16106457000008795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002292064983729894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.2557520463315039, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002954728954094822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.18051522966442277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002061705179353923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 
0.03652753511175213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000971777928411179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.05873911371100809, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015215967074205675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.040384981669924526, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009583825110008918 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.12029197082375237, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016567061948962215 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1961569067857406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00230187114523418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.13548549449383468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014439794973907486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.14978318904580395, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021402940102766984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.23874291009412155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002780798318742396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.16793091858839554, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019139229473260492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.2113811064047963, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08314019884439218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bbc2eadae5e5a09c1dfeaf48dfcac0ec46430310 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.1533759665863088, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025577920564815457 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.2259942077891069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033053436745626315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.16139635448455195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002249446575446669 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.03725286699889529, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012332919944079443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.05566813880650291, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016306556541296991 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.03839113878587267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009971143418407548 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.11690535506105171, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019433797646747934 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.17591155413008638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026478755663225566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.12314296805478218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016547329508161137 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.14302393847897105, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002412541062432112 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.21074872905474742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003105762840068013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.15022458367211336, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020979925530018205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.4654367545222904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07024206603649888 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.json 
b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0b643e627da319b9a5a53b8d90ab4bc42d006c83 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.05028568853458266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00208707409282097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.07186995080563745, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002683339801036182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.05011197616686206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018346039701914467 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.01253202087368826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007848832756426618 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.017990065250321798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001031089985634214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.012281079448761325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000653869447400363 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.039231023055013595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016071013078014019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.05718233871131803, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021546633328687506 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.03907293229046315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013997352977999286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.04676947454126935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019385598696383512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.0667844061216085, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025123745789632553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.04654151233042153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017091787582805187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.4508825944565298, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03559978337994854 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..14667f4d361ec9d375c7da97996c314ce080c8d6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.007347042331656005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.000855877168700121 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.010778250545529063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011521636177528214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.0073426086769494935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007675421686666748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.001710980048638282, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002755932507822362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.002533909675938538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003864838960639161 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0017130160893721097, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00024830222441035233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.005509333596115347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006321643878647678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.00833978450728592, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009079115070139992 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.005567943512045762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005784419503900142 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.006862990886278549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008064303620221591 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.009971153288718336, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010652484370260787 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.006800973372764665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007105353563633134 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 5.6409965833073724e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.1870470936219164e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a75b82dfbd97a4025273cf8c62aa59fde6f7f9ad --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.15985049689784436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003621011460260559 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.13728275328074455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002052435724642858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.11036144862640505, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001530599636812561 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.027302086171180434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018954157009048764 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.016538074725194387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008215165441681659 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.013469176886654546, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005757882354899623 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.14013802103333775, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032342892555596208 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.12221875356209057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017527389452773581 + }, + 
{ + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.09678143768792878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012407236257212446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.15058787526521, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0034756128206833727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.12914383726578268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019154210955539695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.10348080739372295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014111115580161292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.4796152263707372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04020140828753968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db2601891936580037ffa64859e866d6900343b7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.14983169230821397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001875244981058499 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2495117966397059, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025670277663872557 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.17228713075884045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017266755452744172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.026135894449625118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000877367958825746 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.04414741214905309, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013248833452923223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + 
"rouge2_fmeasure": 0.029442252351593513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007924688244414296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10567365961028638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013254739675144156 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.18141239888145783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019407502811784976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12211374752033236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011442930963585947 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.14040216161795577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017456836256513886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.23471174419778962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002419985972720382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.16158045703046053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001603451683480107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.466848818979777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05122046143290242 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b5e9abe4da2564b17d4d66f93c658bbd14cfe581 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.18601116127457196, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002674455746095039 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2697340694293458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027730262294329102 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.19387864929813403, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019373404999633163 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.04394534305996648, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014119764995516235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.05981581118058386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001480173209074023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.042503871652830684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009697312508853417 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.14114910909948458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00212420941260633 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.2082056428265491, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022135365734890456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.14644335466030722, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013693635698585327 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.17326036096446687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0025172862394438143 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.2520550560532216, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002612078427237868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.18052324198395442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018002340718791736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.1241997955779706, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07242902905742744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9991a673108398d08e2777681dafd0341d4470 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + 
"results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.17904260819617532, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0034025414878592264 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.22026524786669358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032630989926899674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.16464113343789846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002337449087567628 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.04756210696143349, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019178497340389886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.05249440732325283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001516559702111135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03902103124926395, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010593881502234943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.13877691598479244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002822308452152884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.17071662509195945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025830517349758665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1254589966504649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017313167474427463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.16705087756637185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003249700349640652 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.20445349405368257, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003053057869293393 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.15262792584886165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002174966680287157 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.452893619605998, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07535908896046421 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b575463c73c0c28fd2e1736a35c6f88849404da0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.05505736228678805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024837468725238195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.06470336830124988, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002537880582413385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.04778674517582264, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018365614759514285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.014779446264054693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012182175752722738 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.014759184580045196, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009302529142526923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.01084532274916111, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006463021300909807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.04401215074366241, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020765428405668932 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.05148347306396624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020437326538755756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.03728476607430857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001410339676510439 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.051354143409556074, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023530881508590775 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.05990804305583379, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023627429149512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.044201424272258295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017014625957187778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.2501992622083412, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.029972656605070418 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..772132f5e532aa2515cb6d4690c50237ba7c4b6f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.007051967760789057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.000875522796789484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.008622048628992848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.000979421104357186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006582381210133649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000736046358117549 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0017517411283309776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00042676011787971466 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0017169194523458937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002723182367088392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0013869187743711499, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00023793902364453525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.005694743781016978, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007231881422995928 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.007034117275652774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.0008076270617686096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005255733664967675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005774560163471757 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.006656694900838437, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008364652583978511 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.008085611227571524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009219645931116729 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.0061682356318462596, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006919242043320082 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 5.897323116804045e-10, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.7226783506844502e-09 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..745c5800090029b2f47a3ac33e5ec6c315e1e1a7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1498248405216555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018423908624139891 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25526647462429297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002588155452951415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1751592790344999, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017874794849447823 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02866254588676867, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007271444111163778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05162759513352221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014037908842865225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 
0.033925070200158246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008240877531204439 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11555678082540916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012586001477032337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20559491764149926, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002138172468666332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13711284339871613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001264815697203119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.13771072102711007, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016874302363856556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23587046561796698, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024016834853047104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.16124043802649227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016384576957041044 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.484894171357125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04279862989387049 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a84b8a2a873fe8c04e12b1aec3258df721efab --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19820460200268925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002420333361877177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3126809375051542, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002841672003398339 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2199694812178742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002010819735875878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rouge2_precision": 0.05016728772414271, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001264223371544086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07815311881303431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017034183791022172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.053937307211284244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001075193073237261 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13997435500321276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017712606376717578 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.22624387436643148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022018166295221108 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15551270440786846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013566776773220371 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18565508806059028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002281622879542436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.29360401456830726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026791632190064567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20618836045677164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018846829640657573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.6577106650236018, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07159113573701131 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c38ff4ca4d258067dfd65107bafc1147d05dbee9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2156784752655553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002732677109431224 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge1_recall": 0.31237685950514665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027963244796416183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22435918885551207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018967794998520616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05738474692529928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015267876059967937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07970531997370003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016700905255101184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05625290668830642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010633219356063186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.15451467159603913, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002139183021303883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.225376744790094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021771714204925977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15917844337997755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013165721234956872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.20314308377906012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002612094861988029 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.29423268640214134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002642162937712342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2111183297537402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017924385498513864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8678009037418817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037508767497823454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 
100644 index 0000000000000000000000000000000000000000..a371d5c749d70a7ea892cf7925f0e89fffd4e139 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.18405392984121038, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029940021228191355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25514944671406053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033459375347498267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18429548803283216, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002297153333126352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04819840246533481, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015092856540081493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06631560044043883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016628335172097389 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04688317854067561, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001094346737838606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13375051095662321, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002380892659084607 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1853146721328658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025854840001956502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13173075629033756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016475788902520988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1733338698061136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028479993519003807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2401891556702282, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003169734302090452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1733778342320142, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021681294491408274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.9935371246792863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08389052572374653 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c8569f6bcc8276eafbdabdfbe150d5bfc6bfb16 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.056788240211910736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022506997974891542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08339625174295712, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029290040087097207 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05659885761358278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001930223829324733 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.014380667558554284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000977357379524512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02101316087357165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011619283931715888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.013810868807903593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006930003786904605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04257610439946902, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017464834049321358 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.06302882516092573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022681193452949904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.042007409754473535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014258890197756744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05318358596123625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021166691321526723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07838773551352705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002768443855799422 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05299986098950862, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018068900698462816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.5461443377554994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.044799916123802616 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..86de4c906a76b5963e84d7085c0f330c0b202b30 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.008911405879685974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009422475593785767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.014063917251247313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001396048114694196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009231042835921754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008787834704319035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0023383395560896076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003211681514690954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.004206929070053043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006053899081506379 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0024939038248004536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00031409966961336354 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.006838517996278174, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007309899843193323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010882093390231358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011052580830821516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.007056657402327181, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006761968053255695 + 
}, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.008357520892672053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008835066902880334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.013172143042073951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0013147147386398047 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008623638497104687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008181729546128778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.0112557087204399e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.4150938941127893e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f47a55ebb1c4394200de2d636bdad8c1df6f9b3b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.05237091241441722, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009605690159925948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.08326091502282784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001307194264865998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.05960995824024046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009498209060065967 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0011869125521613468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00014357479408292278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0016283614649422807, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00016925819879762434 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0012597496537706083, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00013199105164764973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 
0.05023321535529295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008739713332714877 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.08083866411125382, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012488146734351867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.05747907925182631, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008796408111632294 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.04836041522625723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008749326026700274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.07770833418809467, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012165480160224367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.05521751020238309, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008638920844959852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.07299605737537797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0185742385092221 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c66a98444ed904728363263efa24ff19099ca0f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.1343039642172445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016732214768695325 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.21562840393454535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023741026821102597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1528764985075287, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016363848989828594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.018046982061484945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rouge2_precision_stderr": 0.0005964805823451741 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.030411699104941473, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011304776755259463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.02069013916427929, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006612995194022327 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09295669984502772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010567113830758765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.15447310700799974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017294150710180575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10660240871433446, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010219647383725968 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.12687184785116162, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015609172625269986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.20412846443023683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0022261282748651857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.1444397478005342, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015191037848864151 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.1223009951089067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05836654112265766 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d1f76fc72bfb5ea66a7e8a203aeb65407df0c8d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.12697080832224822, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018201089851012445 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.20109011349921127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025569830771983836 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.14316491558968603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017639356738778811 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.018246718615969375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006418388179326544 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.029736909566824603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011083029361886995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.020465708060419606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000671449947746698 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09139918661616583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012236115566095836 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.14881163413712634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018937511898562988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10367762364897098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011709222919166911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11904750892035158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017004264999703578 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1886874000288314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023745592779610444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.1341766534995056, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016336963572172586 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.1180377312448153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06708589951037792 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f1aff51214ee1e46705986543b170fa059fa5d36 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.09887369502675063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002043227921916099 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.14678993430035586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028179673635622318 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.10634798502269208, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019791669918603184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.01567989332792064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006694052515306127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.024427777089035076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010678311796068322 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.01711652805678942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006882433105651278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.07436580031645514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001457445017450415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.11271151612770153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021453041661070715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.07999450335755226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013938218605260453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09222600214130718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018861337709654303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.13747754955861127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002634806979634438 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.09936266385012239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018384025373419349 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"bleu": 1.2836466923526264, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07070516293414274 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e770bac89bcbfb8855531cdb9145ceacb4d1157c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.025095421938052895, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012461801615902318 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.039170985603147426, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0019196761418163904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.02713454046170108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012975310299864612 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0040053841270205085, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003368144516166286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.006847830102416033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005830833698761628 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0045440484711843825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003623167516261232 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.01906700487359207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009287450742508868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.029866238646993826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014553209547808461 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.020373823655558135, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009374343240357042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.023282946639617354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rougeLsum_precision_stderr": 0.001159003398883688 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.036183810616452226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017667839555253213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.02507998631189558, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011959812138736497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.0870839562978014, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.006785360071011486 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..98294b5f410b27ee4486210839039b2e5516532c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.0021405897631643946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0003541239789070288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.0038253034844642347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0005823536138290148 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.00249534987383561, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0003873473209623069 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0003169846199202593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 8.619486969926879e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0005250548080533436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00015216407757586244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.00036639163657650276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00010049770707316975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.001603458706659755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0002602089443308959 + }, + 
{ + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.002861894235467422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0004267622874298756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0018743787593694622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0002863935366559193 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.001980619561518416, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0003290309068301346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.0035607312643202195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0005431230446018559 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0023091312121054073, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0003592857430542886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 8.093999898733972e-15, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.0779878610297242e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e583b7f269d8582faae78ec72677a4d0929fad10 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732958 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7591710a4fd92e4a7866dcde37ea8ed64bbba285 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01480686473373886 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e7cdba57ee9768abc92d4a4c730e4f6bdb9f7fb --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.352, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01511040450564866 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015195720118175117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..92a1d8b524ccd46231279ee50d9a43fbb30f6865 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356951 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015195720118175113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..07c69f213948f72a24e86e073f4d4dc02fe16f6c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014965960710224472 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014965960710224475 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9bbd09fd8dc95dac9425250b50ecdf7bb39de55d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01507060460376841 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f47d21646efe8b087f5e92fb3eddb9be43d177f1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014955087918653598 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01488827258820394 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..10e944f8a1086e0978f6111d8702a0ccf71d3222 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811485 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732961 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..825899b38ca00380c67ca090cb93940f51b51274 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014955087918653595 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d3791e8c7cba4b4e5c3098cdffc5a5414976a05d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.352, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015110404505648658 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015090650341444233 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..25c30b6517fbf999261efc686369291b93dbaccf --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402707 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01512017260548369 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4fb8e4ba36f6a00483d350676c58f52e450fcdab --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.356, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01514904265930662 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015100563798316405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e649faa86146f4472a396fa03ce4ad4d9cf4d1bb --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932573 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014976758771620347 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b601e7495ac87ecb35fa62b838f2e105478f95af --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bcbd9334d80677eb25cb1ae436c30be060a6bee8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055235 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015008706182121731 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a17fdb223032a47f3f3e35234f6b3b61390a1480 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.359, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015177264224798597 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.0150806639915631 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..697af564cc5dc942cad979428bc499d8a28ac180 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01495508791865359 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fae85e8c27c63c9594f8e2010b81a2c1d21dee1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0149550879186536 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055237 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fcb8ff3bf54c5dd3079126679077a7a97f79706 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015080663991563098 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2568779d93d84780aa41b0972e18f271e4ee1660 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..562a01e6365c0230ab4cd201b51d95a56ba4c84e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456732 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055235 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fc2167fda686a01dec56e93a23655620746c9e5b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928357 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab3f967e5d9eb0538a4b0cdada706b6c5694e146 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203943 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928359 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ea995f2bdea705f907ce07da720f640dace99037 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229857 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..abd42f67e09eb9e36998cfcb25305ba1bdae215d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014976758771620347 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928359 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a0ca1069799f0656e87224f81c5748a1a294636 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811485 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811487 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c4189b0c9c302ccdf75a7d3e0e593d4c902e813 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.356, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015149042659306625 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce8b59628086ed9ea629d29c37af0da316fad63a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015090650341444236 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055235 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..426a8b0583b7411a188d7a6cc6565e3daa98376e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.328, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.014853842487270334 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7835928e9949bd2e644a860d40ea71cec96a8252 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014842213153411237 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01495508791865359 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..db9afe318a58d8a863ac85697edd18900abab8df --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015090650341444233 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..3e0b0c99253d7a04fd5d27246b528f09d62d2a01 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087971 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934645 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..987808556fdcc1934c88e0e638f2f89fdea59d2c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..afaa18e435c20c9fb5b534d08bf744be2c7a4e7c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.0149550879186536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 
+ } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..11bd6a8e31cf5c26be539b43a15c6f96d106970b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014645596385722692 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014553205687950453 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d67aab009533a1829c45e8666ce0796f4ea5123a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014671272822977885 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a25ed906a6f0c294570ddb7a79d5c181bbf20997 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014806864733738854 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8f607075a21dd8db99359abd9f6468cd6eaa4e89 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473479 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01470919305605713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..85b5f942f30fdd530eef33e81db801c222920155 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014976758771620335 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203926 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..21d74c574d06134f237b83283e12448a4f9bfc75 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 
0.014683991951087973 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a353ac1989eb23b6a2d1240b7cd5ee7f1b6168bd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.303, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014539683710535265 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014632638658632903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5c4901c4a2e5b4b63f9da31fb8eaad852882f5e8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014566646394664397 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.0146966319607925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cdb38b0f07ecd42ef296a9384b040a378315d90b --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01489959724281148 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014922019523732963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a535b214fe1d1906be461495b7c8b14eef91373 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1d444939db4c7ed89c96e4d97b5f741e570cfc7c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348628 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..17b63adb2f5a47c423fd921247915467b76a1d62 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014770821817934654 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473479 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8c263313cd896c1370647a5533c3c25f77af5d9b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811492 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014632638658632896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..120ea9125703c3691da45eac477ced2c7950b690 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541035 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057127 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9b2a4158380187cf4ad10c80bd272003b0f81685 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014645596385722692 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014922019523732972 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c9ab567599d1c61f476b7f3245c10dd07d5e8ae --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6e4a4f24f849edcac4deddaff8469382276c2c65 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932575 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9774e92fb9a4b73f59fb854c1b83bdea78ff154e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014899597242811485 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c931984d50bd14324ed03ad994b41a762346c2e0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229868 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.0149550879186536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e2e2d51e8f9586636476596045c35ef79f42b4d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01491084616422987 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014944140233795018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..612086bb5e3770712adfb19174b048be84d75ddb --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229859 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e06d4f28a8caf2a3a400430571d27f7f531e6c16 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574888 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014721675438880215 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e4dc4ad4710b6542d7dc4e75cda9deb757c5794 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014782913600996686 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5f8a4e58296c5391133c427eabadc1e2979fef9c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014782913600996686 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..48fc3d4a2f3d346efdfe4fabfa46e0858ca99b42 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.331, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 2, + "acc_stderr": 0.014888272588203928 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014632638658632896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..94ef9d967720f11cc833c0ce622632ee099fd07d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473475 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01478291360099667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4411164665324c424f503e018dbe0d2fb62aee5f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013622434813136774 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013613950010225598 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..8369019da6a4e6a93621b8fa74fea7937b485d77 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251947 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f071fe244b5f121e9ec8f99db1de216cb9ebd488 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013579531277800917 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d8a332318658ce658c78af327e527c8787043452 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821474 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013751753243291854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d785946c62f2d27281dbb9b4c458634f4b43b0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.31666666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013434078660827388 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013471620929769139 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1703c111262bf5714586a6d7fe8c3870d2a3edb7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070708992 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932879 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0a3ee521f143974ca009fa5ace9f0eef0a7462c2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01362243481313677 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881645 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3849388a59ddf8cd660e61d49614da7b46d821fc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225606 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac4588135cb30877f0fcba290efe13eb6fc6800 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01342456883035645 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013296358936471103 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..37b02943bac3cd33e2d969259b7cb00677cd78a7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3333333333333333, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225605 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc7a8cd49097b82c41f296ab0a223b6ed1542a3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31666666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013434078660827384 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013443538681348056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fa00634da6c5653d87c16479a276817af85a1a09 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013415009084004868 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851548 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..b6723d400cccec3edcf526088f801f9d8de81e55 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003665 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6141136ba64b05a3d6dc45dac9d21c209dcbf050 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013664144006618265 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c7f40f291aa4947349e351d293582dc0bdf8e0f7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013376268790982096 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3125, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013386029277441229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..04ec5a66a91404f034b8244e653b262e5762a1e8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225605 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013490095282989521 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..311140ca37467d68d08a86bb1b8dd02a98d74211 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013462309712005136 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932879 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..12971a7f75541fe92e6dcb27f164c943dc73dc8a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013499258621103245 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013471620929769135 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..69f519223da0d310104f5c3ef77ee8137e0e5de8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013424568830356452 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6b18a6adae11512fec3e276d81ae2b4a9b8a42ca --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e524c5977e048315ba87777b151190ad18325af --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01359683672948517 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac4c3fc5525594c4a3fb23f9bc63158adfc6865 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013570806258433633 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f74b7afa183de42bb94c0de6923721d68d82ea9f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3425, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013704669762934727 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01364760294240639 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5f7382a04c48afb08f1182f5259838c1c2f6caea --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013622434813136774 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c7659fec24552d11322b207fd1d85c0bfbc20301 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932882 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb91b1805ee3dc57b94341c2cecb5114c0523e4d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + 
"prompt_name": "justified in saying", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013622434813136774 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..24672af39aa0820b2caf13d43abc9261fa995ffd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013490095282989521 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013452948996996303 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cc440a65ef253bea8af5162a05ad8f128ded300e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618268 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103244 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b152e293402cc88b5997cba6c8a1314d902c5bc6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251947 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013605417345710528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ef3847d3255cc23853399c07812883d65dea7f95 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013452948996996296 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01357080625843362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..83aaf58427f560cf98dd263a3732778e8711a6fd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012352507042617405 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012352507042617405 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71f053d67e031f9be6bb2a51059c24fb4afcf4bd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948856 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f92878ee5ebad0f2c60f67224d271776389555e4 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22013651877133106, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012108124883460978 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22013651877133106, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012108124883460978 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c0479a5a378e641acd20b882a3391aaf1cd7df --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + 
{ + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012207839995407305 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012207839995407305 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4c1c441f8cb252712c41bce76f3ed51bd01bf72b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012624912868089765 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012624912868089765 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..df5ea1488281b6905837afb7c97b353e2c3669b1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948854 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..54bbbc010d8bb97302f09e23ab33ae601ffc8ca1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26706484641638223, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012928933196496344 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2901023890784983, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01326157367752078 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a29e6cbcf5ca822185ffab1a4fd4e86f2ebb32c9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2790102389078498, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013106784883601346 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2858361774744027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013203196088537367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9da606df4c1be09fffd7dd0f4093d6326c14e23e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25426621160409557, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01272499994515773 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27474402730375425, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..54ee8e49d92c8d8f6a6cf33a81df4f56fe6e008d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01266819862131543 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27474402730375425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d861204f4f9b3f9c1594cb34886a4f46cb3618f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01279455375428868 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012980954547659556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..2f5c4263e5e5a26c0da5efecf82b52ca64349cdc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926433 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2815699658703072, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013143376735009019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e8e3ecc534fe46855f30dba1ea7ee4afa3887d1c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012445770028026208 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013094469919538793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..656048333e2a44c707696efea26511c26eb7d753 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23890784982935154, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012461071376316614 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012780770562768409 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..698a529a1ac263ecfe5136beb997d687b04151fc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587094 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.25341296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012710896778378606 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..018ec40245a8c30b48fd00f98e63e01258668a60 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24658703071672355, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012595726268790127 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012696728980207706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1eb9b7d7dd6121cc3c9e3081fee1e3ad86defd9e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { 
+ "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24658703071672355, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012595726268790127 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27559726962457337, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01305716965576184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a7bfacbddaae18c622ff1d143c397d4d7332bfe8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739432 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012780770562768405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d46ac6deb0a5973d07fc00b7c0ab639766f4861 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012288926760890797 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012288926760890797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..82ba1aeedd4d2dbaed3eb0eaa72417948f272954 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22440273037542663, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01219140493860383 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22440273037542663, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01219140493860383 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6c6aa63658a7368a25bb8aa98da154c3c70d766b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.21928327645051193, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012091245787615728 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.21928327645051193, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012091245787615728 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e9533af0fdf8654d1a0aae5c21a575182f58f6b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.21928327645051193, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012091245787615739 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.21928327645051193, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012091245787615739 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ea5d5f1955b024264ac67e21213aedf981a7f0b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004748 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012399451855004748 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59c7dfcdcc6e199da335fca24b38bf5009aea676 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012368225378507148 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012368225378507148 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e18b828d3b05934020f3ccbb356687ee1662b5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313969 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2815699658703072, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01314337673500901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d34b6ba4ca0d59a834c526bafa5c3afd872f9913 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01291577478152322 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2960750853242321, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013340916085246263 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..513a828f4138245feb60eebc29a98e8ec6b82fd7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012942030195136416 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e99f5147f9f6de7105fa6c9174bb6a67f969212a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012821930225112552 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.27559726962457337, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01305716965576184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0b1e420871a208e114bf69cef7c2a5edbd985dc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.27303754266211605, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013019332762635732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60b29d674fb219036948794936935fb400053b3a --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042967 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012942030195136414 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..227d3549c97dabc39e1441ce9e6d3775abf8f5e7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23274410774410775, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.0086711691205793 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23274410774410775, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.0086711691205793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1e31c9055fbec274e98eefefde3b5a19114b14e0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23274410774410775, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008671169120579301 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23274410774410775, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008671169120579301 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..68a00db673e0925752335ee319374eea4da7d711 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008757032594354026 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008757032594354026 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d35b7cafd3ccfc2353a4c5359455ac76150551ce --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008809171744720559 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008809171744720559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..25f8af15f7ad86858c85e191ce5e558c634099cf --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23863636363636365, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008746465140706126 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23863636363636365, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008746465140706126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..26f1c6b9bc7345ab39344d7479736c083b81f9ad --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23737373737373738, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008730525906362441 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23737373737373738, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008730525906362441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2150c86b3f5c77a4ec64d3110b5a964fcfd62db8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.35563973063973064, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009822854395535487 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.31186868686868685, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009505823345817654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..818feaadd5516e95b0837cf62d21ccba375b2f48 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3442760942760943, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009749495321590819 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.31397306397306396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00952324533521551 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..70cd6d6bb12655ace15c03c7b4a1ce8c385c8838 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.33207070707070707, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009663817543072694 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3085016835016835, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00947747234297813 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..78fc262ae065c3b3f98a709bbea125db51e90c74 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3345959595959596, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009682137724327905 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30765993265993263, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00947029257583118 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7dac0020d08044e786ad8aa37449858a8995e625 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3400673400673401, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009720765494805281 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.32365319865319864, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009600478182273787 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..01c80d931c19e1a8698829e37b713337b2290289 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.335016835016835, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00968516076593236 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3181818181818182, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009557408782506374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..06009536d178714fcfe56a7bf1d8d26601a94464 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2958754208754209, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009365854134140067 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.28114478114478114, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009224735470287007 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..556212913e859cbf6f40cee8aba4a7228dd14739 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.30303030303030304, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009430140669278959 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.31565656565656564, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009537019245566084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..665397220606357fd9df30808902c0bfc8b3adf6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3181818181818182, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009557408782506374 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.32954545454545453, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009645184190953861 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a0c2921d69c29b62416289471c83850c953c61 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3202861952861953, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00957415266873942 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3333333333333333, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009673016668133383 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab0830b6905b881c9cc48a3663c09770aa75cc94 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.30134680134680136, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009415259879351623 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.31607744107744107, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009540440071928289 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..274326aa589fa6b9856e57360fd4a7ea5aa0dcee --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.29419191919191917, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009350328648861737 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.30597643097643096, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00945582203642662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..530de395764eeeaf7485074742a637f6dd6625ad --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24242424242424243, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00879365151648508 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24242424242424243, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00879365151648508 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0acd5c7a73521c10d8e732d250cd0614df297df --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2478956228956229, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008860162361464027 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2478956228956229, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008860162361464027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..208cb2ba25bb8d3026d002e0234613de8066b03a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + 
"acc_stderr": 0.008757032594354026 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008757032594354026 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1bf81c34d01cc043e4fa467e86cf18b4b7502abc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24494949494949494, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00882458861121908 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24494949494949494, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00882458861121908 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a61bea0fdd76b183849b793c66a322069d1b0d1f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008900141191221641 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008900141191221641 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6075e7b65706c7313b29c896a83592ac40ad35a9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008885233166386385 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008885233166386385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e795c0bc3e841192df77db10d97d68e7cc9821f6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.35269360269360267, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009804420599378657 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.31986531986531985, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00957082182057359 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b03e41e50ad64448cfb7918f13e7ba3ccc587842 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3400673400673401, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009720765494805283 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3063973063973064, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00945945357339833 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58d8884b038baf149029ed71d537c046e3f9e93a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3409090909090909, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00972657959342402 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3122895622895623, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009509325983631458 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..01b663fc712d733fae7eb3ee16f858a3099329ed --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.32575757575757575, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009616642976885971 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30513468013468015, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00944853109416391 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dd2451f18d8702b02aaaadb31d34235be9701ad3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3287037037037037, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00963890316702217 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3042929292929293, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009441202922359185 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..12d60cf8943559013e2a1336205921e93e82c8aa --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3282828282828283, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009635749509262166 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3068181818181818, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009463075835198943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d64fa30b4d15b7fa76b08ca5fe8ece4d34e25077 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5496666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009085074954912698 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.625, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008840308272346428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_1.json new file mode 100644 
index 0000000000000000000000000000000000000000..a1d2af06deefa8a6279255b0cadc65cbafcabd8b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00906425508467605 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6123333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008896822947561608 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..906d19bd35b6f7319863c2bf25d03141c8a2b8c9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009012606487132143 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.623, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008849657553427542 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3e6302215ed9f9a0fa413e685443b0346bd74d31 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5823333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009005596833757831 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.621, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.0088588464102222 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a3d2b55c34584a2c7b826426f2dc61dd742ab04 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.587, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008990955404907169 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.624, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008845002997512763 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d524e58cb36349610ba08042cca4d557b6a5c4cc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5886666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008985524690229495 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6206666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00886036232472252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5585810c2a9bdefa752648566c6ceff1e2600e0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884811049411477 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5423333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009097447488896784 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..343f2da7a3bf8c1570f91b8c5578adc9fa4c2a3a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5856666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008995223478188034 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5773333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009020364414843638 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2016fd1ef2c2f631e8c7529df1a603135be68df --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6053333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008925330066832188 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6023333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008936959925716907 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..63c20ec282611950937ec6cd5d0de313505e7687 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6026666666666667, + "dataset_path": "super_glue", + 
"dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008935685051576502 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5963333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00895916952266258 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..48210fc77acb01934cefdf62f72640762214e7f7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6043333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008929245712536294 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5946666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008965091467970754 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a904437e3dd39d54d5b82dfed2316002dcf09c69 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.604, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008930542249025198 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.596, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008960362494453694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..14559c2b48776b8383b73fcf158482c267b55f7d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.603, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00893440584870012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1aef6215b2db2f510198e64fb10f2ec4f9724c91 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5576666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009069303681923062 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5476666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009088646624339615 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ff7a193ccbd8e41ec1026e03e918c58ed5b254c6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5663333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009049526374650797 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.554, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009076827433934433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..da620115be3e219c77b26e29071ca0cc635146c5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5706666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009038582451449425 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.546, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00909150987738652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..23deffa50efb236263c6f08aadbc7263b39719b2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5726666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009033293159951224 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5613333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009061278956794627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..abaa0a66218b598258d30f1d84bbef633c0b3d9e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5673333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00904706345689798 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.553, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009078792586293545 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28af009ae03f6e31e5af39f5773c38c1eb40d106 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.611, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008902401412932075 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5006666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009130223008005275 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..81d13d762003270cd83165593c255234822cea40 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6203333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008861873799148995 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00884811049411477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..15eba88fbcea0f137492e6788476a02f2b68486a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.623, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + 
"acc_stderr": 0.00884965755342756 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6213333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008857326053368308 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..492edcfe3cabfef1403ed41239b2fb21aa82c906 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884811049411477 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6226666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00885120015653439 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c1070ca07a209958983a549379d0be754b31433b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.621, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008858846410222197 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6196666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008864883436857793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..081d403b823223a44402be755849c82bb54d7adc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6223333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00885273830576469 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6196666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008864883436857793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b92c8348996e8dafa207ffa1b8170fbb6e29ddca --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.606, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008922697920438163 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b2967a04172315c7504ed426af5cbe86494c16ef --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5746666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009027853030468712 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6173333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008875277637761275 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0f7ba200587802e4c2c3122bdbb10a0badb6ffea --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5926666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008972056373066367 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6173333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008875277637761277 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..43b167a45fc92c5b9bfe8fafcc9c88f198826e8c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.595, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008963915658236387 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6123333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008896822947561611 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c78402de27117bcb6ea900fc70cf4b08106d3e88 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.577, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009021315205815771 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6136666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + 
"acc_norm_stderr": 0.00889117431069549 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d8f059d7c78829cf1524f7f0e7d7d969cd05ae2d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5723333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009034185176145654 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6086666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008911995272576807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..43e6a2a734b08ca66a28aaa7bbca589e450bf28c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..088a5e556ba8c79e9a820d2c705c7ec93e3ad4ba --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + 
"dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.22169059011164274, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd9e5908d3a7d14ed2b644621e318067c2c3b7c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0663363415035954 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2593837535014005, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..41c58c89252ca5cee66707694621ea06c6780b3e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.24839948783610755, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8cc2fb9fa948dc8a3956913541e330e175ac5a68 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + 
"task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2198067632850241, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..19c11520691e7888e4f4e68fca10035c72be2f1b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2183052617835226, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..69c600681ee5ffbb0ceedf593b7348fa71a53b56 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813057 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2850877192982456, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..8373e3698e7bf3cb4e296cc2e9148350635fbb0f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4795a1d0fd7968558dadd7ccaaca64e97280d634 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.25438596491228077, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..55d98526adcd1bf3fea5930aa5db299129bfdc3f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930824 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.20028724376550458, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end 
of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4d44ee525fb59959fdd97705782c8c1a0b6e7c4 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2183052617835226, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b7813b05c5ccf9ea4594b5361db0064ce5141b2d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0663363415035954 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2641898864809082, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d5016d0e42178346396b67be942e944cb12e527 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2580185317177477, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1254ca72ec2a1000591d9fa3877031bc3d70845f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..948ea6cadd19e1889fe75df930bf90a0a031ab88 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.27619047619047615, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8ed5848ec33a4bf2ff3f0bec77d3dab5acaacc6f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.33283950617283947, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..64e24fe128edf8437525812648a1283a40f64b3b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3328395061728395, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae59e0f8377884d5e44128212248a8323690e94c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.35176007116533237, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..104b19e1626db4a0d651f80da9b04c461c7244a0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + 
"prompt_name": "guaranteed/possible/impossible", + "f1": 0.336846728151076, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1709af40a5e4aeaed4d376a3cffbbe62e8e49dee --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.27441920164292133, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3a83c4a27a6a510b8bbee8d47d3ab1d9d41a2ece --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.16071428571428573, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.04952230059306299 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.15573630249667678, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..a64bfe20a6a078ee1eabb02e802f062e6a3a6ab0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.16071428571428573, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.049522300593062986 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.14387464387464388, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bd07ff2b18195227862a0acf494e48c57dc36a86 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.14285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.04718416136255829 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.12557319223985888, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..081968aa7c5c0307188209460363ee0c9619ba93 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.16071428571428573, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.04952230059306299 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.14033189033189034, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1487468af636bceb3376967697de3f876b28b5b5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3076923076923077, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..529daeb7a04d14b52b33a7a231bef8d3e6b32efd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.26652142338416845, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8b6e6396b36d94512d34a75e6e1946933783a90d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + 
"task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.32269503546099293, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e79c4ebe30abafb29b0f573b0e7f4054dc4c2233 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2976100628930818, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3c699c01101de57779d780de6d03e78d3b17d9c8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3461728395061729, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d77f74288b4ff65ec9d58f41990940c0c988e1d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ 
+ "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2976100628930818, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e007cf7f7fd6b293dc2ec3ad608732a13743079c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049999999999999996 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..100f99c3cb36dc059f1fb586b57e4de3d9d0b6f6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_2.json new 
file mode 100644 index 0000000000000000000000000000000000000000..ae5af83ef298d19080f770d5f9b832a6653fe226 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.36, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048241815132442176 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2d2d5db864001fc412b7a0a5f46b19a75e9caf66 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..35425b660191ea1908db8a9bf1e31f26a44c33c1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fcc5d4599381e974bf06ad533efdf8bb69bbaa7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4adec88994b3315d8be970ffa3f0da89ad7c73b5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5fefa8fe3c2e21d61a90dff65c85ec1f3080c0b6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c86a567c920be5d8c939206b8e085d440d29a6d6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70ba32cdd22abcc4108e2b342227c44dc51b2190 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0558d50a4c7150e69427efd37a8ef866f38a8fcf --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + 
"prompt_name": "cause_effect", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..255918f4723931dddc6a6f4ee1a209fff0a4c555 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a71bab3b943b19d2f12b6598d4ec0c57ec4e20f3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562427 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0bce390c677bdfbf4fa040b957471deec33c157 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + 
"prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048783173121456316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d1362264e3cb3b17ffacb943fbb689e839898253 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.36, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04824181513244218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a17eec639b9d201f5ab2ba5add9fc51b3efbd75 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8ba76ce5c77809f63c1c2b48b3211d0f62d76305 
--- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb20c7269028b582e95523184b8abce39e640a3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2b297441986fc7f8de0af9da05734b24c63b1719 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fc06166ee42107b8db9c81f34f2a790805f0cb00 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f53ed365593feba2b464a86da2960f2966ff7c3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6a9b0da2241b1b797b55b5414e9de2cfdeaecff --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..00133d56635e8748414c301c4af7b107d3286acc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e304917f21f48f3e76d4438e5e14b38857acb3b7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..05204c89bef16394f03fedb3a6af28bfcb296642 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.04975698519562427 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..91d5032397e17d163de4592d1777e80d2e84c794 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237101 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.35, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.047937248544110196 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c51cdfbfd10e6faa27e8ef6e0eefc7fb1e08c62c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_3.json new file mode 100644 
index 0000000000000000000000000000000000000000..099437b74ec0ecd319a161b07ec2c6e437e6894f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b2468140f49c162ebd8dc9ba6e329ffb00bbdcab --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dbd7173b3adad056925185551946e282947f6325 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049999999999999996 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5eac8efa00f0062679546b4e1b85f1bc4254af53 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 3.872234562531536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06666687313033742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.19449892872154664, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016649751999105959 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.3385859719691564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026943011480723986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.24189630901934833, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019324120985675024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.07891999672740428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0009533683551406194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.14092640846495855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017170890317364676 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.09892905722529392, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001163722813056667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.15770316249517122, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012736741855482734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.27888219312603624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022960014811976687 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.19713523690512094, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015249596621099413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.17297504164263205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015157865911726413 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.30157958831453335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024874143749410256 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.21518082639820849, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017666395173976037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5bd63a41a9401d3cb722332726f3bf1949011013 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 5.618664355824302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.044804380544830956 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.29980096617944, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018479476291498587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5510960644199242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002746177251645845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.3790777063050893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019180894806166623 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.12320524930026791, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011603794001722478 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23411042101520732, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021839181570230963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1571930586851638, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013851703119846389 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.21150876369324337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012091371590184875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3973772754817481, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_recall_stderr": 0.002348890772602287 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2692553660223294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013312984394904634 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.24763246470479972, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016397591465259363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.45603718052824205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025930969811509105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3132026852747841, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017579190044477355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7f106f6fd7e44fb588e3aec9ec532445f7a66a91 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.655264751599302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10851166271873992 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.2956786617020848, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018018971024399673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5505653372006829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026480715686419227 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.37482466732781167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001820505134270105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.129469707240088, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00121806826731611 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.24899455762459116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002246273347531876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.16543669174208636, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001418641213100885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2175954785088815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012481117214859327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.41307647476574705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002319703375666559 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.27750839928052384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013211747767623314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2472195816112961, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016439382219917379 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.46060025676338434, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002548116733582456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3133486537665277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017193781609667741 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..64289dc90f28ba2ac960e13865c0ad1d2928f822 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.948031260618786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07024191698575197 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3028962873149275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018453669075132997 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5503482491585948, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026466497124214964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.37972552437025653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018092314859358045 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + 
"rouge2_precision": 0.13494336292795747, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012826406801381255 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2522143776432092, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022660646785606113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1703052547809578, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014604429313616653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22176702009033705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012999603001234818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.41046709529612835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023382620871218933 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2796335792236481, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001342957090112707 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2546972130694942, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001706082034205406 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.46258844899711243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025482851759882335 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3191175618704876, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017351084798555157 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..943f132d4e982215566492b4293cea5f6b391f1b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.11987499239522, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12056984540140626 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.30702881034304425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018940787804765146 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5510843355772259, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026497395614324347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.38334638913624297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018564450302232401 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.1383407701093842, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013023724651524862 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2550398171391987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022789288990834686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1738253944796353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014854824402731792 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2225382181098862, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001346023397016377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.40621675306005106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023173016257193608 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2793510383067617, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013892803599562522 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2605328118487642, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017504023543387641 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4677978217704759, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002582278022445715 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.32528129826527175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017876332019525448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8abefc2537c0cd1e43259e0b7f89b0f3cb982c70 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.021064993034315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11474627928215739 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.30826387603129907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018679719801617597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5491523180108668, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026220387381628244 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.38381893127843775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018139236221588497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.1382607118725467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013102373270393998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25338957755007935, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023051813874414608 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.17329021394802077, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014896565428026973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22310499125513847, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0013603626862817944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.4041141573216587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023508428676916305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2791853032877538, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013976915078093702 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.26191481097823616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017583830712396412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4668488083352266, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025930890163178466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3260232619607743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017795108045555923 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec9375ab8ee42ac72918cfd3318058f41add78d9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 1.2087041243674348, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06236227717440753 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.11495488513218448, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014917009813178457 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.2228480616093966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0020952477514755564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.1485373299435639, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016885326851467698 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.01961510433394365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0006727987776785344 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.03657357239352748, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0011960431470597172 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.02500994962430241, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0008315526871574698 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.09374319199704093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0010902607092754888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.18550669125605201, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0017172608070746052 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.12205181393918246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012795921662070245 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.10182213242486479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": 
null, + "rougeLsum_precision_stderr": 0.0012349205591824877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.19958826927621215, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0017982055619464277 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.13207144996991402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014103766498503702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5433b97642f86d96aab7cc2ed39009a76fe3b4d7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 5.719467204595487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05467102028072658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30689525431041903, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016979120981340232 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5607471544318147, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002541857266880532 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3880684705597522, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017293341079469464 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.12815271668435907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011402205486632784 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.24170118147382424, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021562937828079123 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.16340881202208163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013545945899839753 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.20611685586596215, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001198478500170298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rougeL_recall": 0.38367500882495165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023144118800144126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2620939439124173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013281943135113716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2540076031390351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015610224671734501 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.46438460329961756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002459731462210942 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32115446566223654, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016529379356976753 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..338e73c01e833d28b270e96ed63e1a58cf332eec --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.481462554213176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0637597415756679 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.31304294257033033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001675584026068997 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5791447618132454, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002476899866394818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.397978553188765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017070834320949354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1361700393455817, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011628001288050507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2603882016032195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_recall_stderr": 0.002228895122171891 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17467867245016275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013955055510223417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21315768275684321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011907007297547669 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4014777565448129, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023252703119155016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.27248947489993885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013417737031681963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2617186737971144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015398800543719527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.48463177955108755, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002439535597275367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3327776090849803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016458286188702437 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d257a2b12a39a709d39fae14ef1f7f898b19cae8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.612323545023084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05976238792927671 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3112721062235333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016625710557320614 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5797972107422691, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024927181964562407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rouge1_fmeasure": 0.3967724891293664, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017005575062401018 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1371955035707705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011750480699642973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2640373903872155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022501962149717035 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17648552604551038, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014153455920465827 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21252206793017903, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011987552894719596 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4019456170099069, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023015152807916543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.27221279373131524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013520964327518468 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2596822226642286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015551841205914923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.48377583683476527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002472550681565558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3309974761576321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016730505404799294 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d71d52e1c822f7a098559e5f710571a3cf74df82 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.691991628747666, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.08350250197448322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3071216799151067, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016645873768075543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5734199732733007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002497421870329201 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3918497625319794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017118070244260361 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13621235371774604, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011845432721289762 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2624365602666334, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022614408095196375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17525069265082474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001423044315562278 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2100790609152195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012057172066683016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3985723300660603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023232420666500823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26936703256276495, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013634469715740345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2581428748840792, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001562414889046062 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4817784670028705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024626011315168057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32924292332328525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016762151102916476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e43cf92bf1a0e03b9fa008f97a04d5064dac196b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.645972521435882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09437494975947229 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3064735649925744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016464375750162158 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5729264801132828, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024516925929830738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.39134978775086127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016911013819947294 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13656095219775433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011845029911870509 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2634948500997642, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022812481290132693 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17585670830781294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001429411829744073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21048241965429831, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011953684611401595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.40033920081936836, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023247277061194146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2702295976370569, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013593970744284988 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25867956586812857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015561827508017755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4833218028427562, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002451881437295078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rougeLsum_fmeasure": 0.3301539965432325, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016676064941407525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b11bf3d396626ffd3e63d6e7dd0eda6f2e1264d9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.0028333333333333335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0009561880076163621 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.00021398046398046395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 7.182900764312899e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.0003977278759887456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00013354742540894128 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.0028333333333333335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0009561880076163621 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.00021398046398046395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 7.182900764312899e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 
0.0003977278759887456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00013354742540894128 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.0028333333333333335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0009561880076163621 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.00021398046398046395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 7.182900764312899e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.0003977278759887456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00013354742540894128 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47c6d151069495f80fea720201736e3c3fb9217f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.7785930657042605, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07371286925187409 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.09307418766156685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003831133208744367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.09684372086710462, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0037674098990848615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.07707231946910663, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029012639839078266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.026646794320064136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011779628965901102 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.040281858969099534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017553221623290288 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.030860375813935463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013215709327802443 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.07194869702103061, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0033904984440571335 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.06646442818559543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002627445124677514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.05293870853728192, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0019913751107059636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.08237065814087174, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003597616882105883 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.08104209712761277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003180633123309235 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.06475443481713601, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024499768948500517 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..68d17f08c6bf113e847ba3c4f0eec30ce040fd24 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 4.024301729421313, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1532981962559809 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.1602164847749438, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004262488194816882 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.1893467230974042, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "rouge1_recall_stderr": 0.0047311550193664745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.15080808821145256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036526531895815233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.053157839459133875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001489446683458996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.08110347423194934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023151112057970876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.062337691922640125, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017277056488068175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.11906005873487642, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003676903873427953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.13037070564313896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0033405273229848032 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.10356717573719904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002527891994310027 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.1386315725589492, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003950043680135484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.15770240014439066, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0039943713450391076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.12582494031130287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0030784867770067177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..d5dcecd62fbca09cd658d63c6800edfe1654f07c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 6.2616432240038, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17231457870322375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.20703579454147295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004332835339427102 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.2544471431249528, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004998201805906469 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.20303028288911848, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003846560801683136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0728023032757386, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016348952484523923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.11015181488739925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002540705745285619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.08500284986690841, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018852577618806415 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.15147184243295325, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003718090407697094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.1755748249833486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0035736277120166807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.13949830107514838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002682661526674531 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.17716302997626837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0039987377595068554 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.21119592100785614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004224651124239091 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 
0.16864868735595545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032412006854200303 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..384cf0a5c0d0ead9b4b41bc1162203b4871b7ce8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 7.796909762932712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1669092241005445 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.23562398578259758, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0043331973616364825 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.2918086256369194, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.005035638323372833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.2339004008254913, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003879962429844055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0868161625025297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0017252832094021909 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1301381299164276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0026742073604647753 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.1009343907225879, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019833361884621687 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.1717453473169215, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0037266451879112016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.20219125520831865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003636988165748045 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.16128440664165175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027278295370788417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.20100861293322395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004005483075655645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.24237757228826143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004281416671478345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.19430984547637892, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00328399177028088 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0026e2d83fbc93757fc3f99165f354c45d1f1e65 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 8.504901044297757, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1823982585890325 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.24905192651947664, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004201658035798914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.31491991748172854, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004981037624003543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.2534732144080346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0038630129775201146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.09568433998600458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0017454509782120025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.14154088138404378, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_recall_stderr": 0.002652205184771609 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.11077316594795349, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019906677332412748 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.1812665699547391, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003574420490219075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.22010316835835358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003620300580738604 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.17650040533909794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027472261983493805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.21189846325175168, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0038691558037997923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.26224286165281196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00426905910632388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.211084813103085, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003295971549920327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47d2f9752e9e467a317becd201a91cdd5183a550 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.3138256134465956, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.042478650177872466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.09054875368611116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001766981005220654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.09899686345340121, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, 
+ "subset": null, + "rouge1_recall_stderr": 0.0016339321555193618 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.08626819478230045, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0014217358109946006 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.004635097118571576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0004584060579508488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.006688039413188141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0004649040639428354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.004707141554710639, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00037478881160494376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.08868499326830791, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016895151305050993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.09761537714635501, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0015968584648120006 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.08482739955206746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001374786595816378 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.0697631127415905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014027633376290944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.07822809105458635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.001337355132253774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.06674662648443055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011005680176848465 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9255dd8453f0c24fe4bd28210cdf2203f85fa34c --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.380638446426456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09557609009378211 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3328975043204133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003509121285453912 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45270988550034613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003062231284181297 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3400267632274026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020403696620665606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1499329665878703, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002263691832530101 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1983614972096706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020825028564397568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1486663277769484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015136508138340347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2592088269282877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002698465205645672 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.36228982914822633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026355915533943 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2676344730573564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001562435233832024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.27037933017473875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003062674460499446 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3663561677071641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027976045204321166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2753891028069307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019025285983907253 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac97505b9da1dc4488e5b581ed55918787306fa --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.3634920937473884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08616925114576995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2954297857860177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029411840632251563 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48120558575759376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002844478586577598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.33740317942041553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019381935089587043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.13374093712808155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018836984171556573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2183048706264126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002121311703914965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1507673483604289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014399337503822303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.23342217539614568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002214559299952568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3907575640409568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002505134070265779 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.26968155815162015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014592699636009493 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 
0.23886641857979662, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0025627121020893886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3892155963722792, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002698506480779706 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.272599814561145, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00182733683050086 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1c8ba5f86476342e579708973e9e62bcb50656e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.870066226903575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09429297050700895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2995136417218204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002877943933866991 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4870699752407154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028159974008536274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3444162792721549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019646659016549933 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.13989458331193982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019259857294848428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2283186786335389, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002178927464111703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.158839720125521, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014985646156960436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.24052403808712888, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0022515901969111986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.4008413679522952, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025096595704963047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2794048970004934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015380955393133962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.24501488471812358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002592614810126833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39799000178268285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002715969584528833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2812645652371936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00189815704910076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..df03d22a9206dcbbcf096e60a2ba337bea2e6a34 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.017381678234455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09166907850331675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.30143650107181685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029314795240309452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4898031820273571, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027687333603014837 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3465967840398284, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019574393578079014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.14062404051839908, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.0019112056232477572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2300710505484132, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021943432548559823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.16013598883167798, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015096765844560063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2422982418880056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0022053611865584053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.4046980533581673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024846126430562907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2820802687702902, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015095048991938657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.2479193537232055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0025849971540480303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40376209522673123, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002721581361466135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.28522597669249417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018960405914839836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..39f5e9e57ef31a184660257cd89d7427bf4e4cb3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.260211722864896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12365349150703955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3111425574072635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002912308644972262 + 
}, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.486978690036314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002680144522191287 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3539789382700311, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002014807799247425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.14647797107879432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001938088224804308 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2296020100709576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021177366913279795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1648812739511937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015410229627803459 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.24849239666244133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002231342628042193 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.39930639747976754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002405155865675547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2855363745230388, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015231421878303527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.25960917213172013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0026365144213955465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4057201628355578, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002662489705069069 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.29494158509832297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019724923487253014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..9ab2795465909723763ee5ab017b912e863026a6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 4.369170070060735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07204262900742721 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.19059607371932916, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0015976396764760467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.41030667797285486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002620865522372885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.2540920126266791, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001743002313846714 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.07958464128539072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0008512115391560173 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.17899027774981593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018510907312350095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.1073793884770636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001057524513675077 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.1612888816485327, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012295381787847557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3522789855005096, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002286155396163342 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.2161433172109618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013779255059734295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.16374077920845426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014355224351735455 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.35431534893364613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002559304615115808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.21871213194709838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001635496810687772 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9b998e8b2452df71fb2c4b7f9c2808032c4e2376 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.211882508400509, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06087984691073597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.36654808520126797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021595231400698836 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5825981312150091, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002574992415760553 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4396792572537879, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020547864942206586 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.1645678333861915, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001432238933148995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2677310092553899, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021604481761519365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19855266031915028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015519283833039233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2570869498638546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001512490055714042 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.41635046992448654, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002345198987228168 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3103214994230373, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015536997171868116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3030665303851319, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019368940114554841 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.48277562928021334, + "dataset_path": "e2e_nlg_cleaned", 
+ "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002499697912373815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3637273198283416, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019205698683061002 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..59e2706777e82d92b3dbb4e9731c20132d428120 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.618865785291484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09097368527602766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3662955696861521, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020996329194513177 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5898222196381344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002594156432810366 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4418673874038735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020138555153139262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16735791685138235, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014286796631250598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27701539052345003, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022736180715884347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20344292743727435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015925311127849093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25828930577862486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001478479770624807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4238497421384899, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002419962083852209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3135424508873694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.0015516029637538471 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3055329010748124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001915478709605836 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.49320303301582513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025853778948590807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.36881863339193455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001931324334873031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..819f7d006127e9b0ca71bc88de3b14468a2613c7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.838654430562631, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08770664641031416 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3653208109803443, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002089278039587839 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.592412984015862, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025780992158462137 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.44203444996828867, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020077674797794494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16785818992713525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001444636379498952 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27957982192927644, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002322764141427059 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20463135769763866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001622776743332051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.256612100970035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014662808098234782 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "text", + "rougeL_recall": 0.42390466259959425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002389221135189317 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.31244127951805134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015421579875842658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3065499840629947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001929720013741525 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4978186288299029, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002597275043020314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3710630525199719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019551761163624595 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4c5c3bda3b2f5e7cddc371827720602739b99834 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.664692810007541, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09761641585182086 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3575562551113939, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021077592166897385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5827020964603545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025902702211649093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4334753650888552, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020383137826383165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.1624053149694478, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014639579406940794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27102562833445276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022990242634122787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19812463968549573, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00163502903837983 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2508887474422941, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014913740208745742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4154733525023283, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023523244972575128 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.30580137424286685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001562672024634994 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.29999696611259524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019295094756439808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4897297628793384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002565917788303965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3638439110752512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001949294337253243 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2a2e1dcaa616d57664c9118d59c9ae747f6a79c5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.621008002795477, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10561546867270089 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.35130413207029, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020584837208460355 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5752142038046015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025882386282357122 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.42664757166768275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001997906445835858 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15915263498125684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, 
+ "rouge2_precision_stderr": 0.0014446749684239855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.26695747744909765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022973676919300907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.1945014895681582, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016230097540462368 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.24911377312443383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014837091652911921 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.41458252977023213, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002378711814640577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.30416823372321367, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015597528882425847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2956310081550243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001909657845289513 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4844948513886233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025773079411811253 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.35902864736200923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019327436427338909 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..894b4c8cce5df6ae3d52cac86db5d53ab5659694 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.09973135507547863, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015973080444745867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2483551581802608, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036884551976866927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.14045583841997622, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0021460672635315163 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.013401782513852779, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006463045667296502 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.034653579743229065, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001719993783733488 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0190566029197429, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009165165117914078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07968673916612877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011529481491568863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.20035694919481709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028106923696229165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11250509517032704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015602744886009001 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.07993531686685008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012877020733687047 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.20117384720414933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031333624545092204 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11288459242315821, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017514781321482647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7010026542087479, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05557886575567746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1774a683237e8b7fc26fddc9766c5267aa17ed4 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ 
-0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11279192477107143, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016032349108474262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2766207479271877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036066236500548418 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15840047240909824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00214692641322964 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.013486589799660116, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006573025385863716 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.033834009894690674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016640769892439025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.01908469081931983, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.000926119882035849 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07903088461894958, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001042052923479771 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.195968848936739, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002491056986604568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11129001725840247, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0014098818626850234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09067322769906408, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001252995528243287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22440113001075607, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029610037715897537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1276356932508086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0016950763004042622 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.6799429561855435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04879251393094118 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58103b05de70bddaf9cc41814532a79576f63c9f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11759195430339829, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017689078804778012 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2889675810942405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004062189732858682 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1651965564409125, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023832852201559826 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.018563434665803503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008384469201564846 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.047169939804170904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002175973099128802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.026322118720045605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011845918036721787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08578950235723362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012213960229954062 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.21259477004430985, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002927850647938266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12076659101965195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016553933643729203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09390304577868243, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014156962162275915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.23292557695865002, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034258792471330533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.13222073865796682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019276011969453877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.0018438511511791, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04226334665257509 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5f25d5755e6c247fff95b067bab26cddba312f7a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.12268247063515418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002004429738071348 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2920661828398265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004556228488458248 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16959674585893536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002631406491027894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.022908311173769465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009443759203422804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.057536503485596614, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024061542462623067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.03227615271942288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013255534293243935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.09180476596307438, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001465280362062083 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.22042631379574948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.0034317489548799755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12716380123453083, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001928363853274409 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09794481566650097, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016340921798746898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.23499521663243025, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038347753688523496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.13571148483109557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021714373357817397 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.3657013486015295, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08729701107016043 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..522d72da907bbf9dc66993795c78f979d8ef4408 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0353729182277329, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002120724556038934 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.06963538400891243, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004245614128488828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.0434980036557313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002547351830150303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.006434256872019694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006047069516458604 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.015043103963332192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014409503521463944 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.00873416530365632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008131278526907818 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.02693683498375084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016356740431801658 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0526495618772081, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003212351040820403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03275624297934461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019019226248188242 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.02896015737664471, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017797342286746572 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.056639676651518825, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003532055034648066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.03526865532078303, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020893045270848074 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.6251466968093098, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11921280575698076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d18e0f5b7ed98d600d2074d6dd6427975ea4184 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017130559457731787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.0001583205885417987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 8.10965586465461e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.0003020119153708095, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0001543461059925123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017130559457731787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0001583205885417987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 8.10965586465461e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.0003020119153708095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0001543461059925123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017130559457731787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.0001583205885417987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 8.10965586465461e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.0003020119153708095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0001543461059925123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2a2a268aa21ce1263676115425a8c3bd868d5d3c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.17644165930352093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", 
+ "rouge1_precision_stderr": 0.0035159930377663164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.31669581782847406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004204880715544353 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.21057429412703157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029620022295376264 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.045892492699293866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019747730351747164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08074272013017356, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027066202276012723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05349637631115593, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018866213566015306 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.13966279995339326, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0027977834380213088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2533248436654788, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034708540063103682 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16723801755038487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023715164713580613 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.13757208272049673, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0028514349754435358 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2496171915878273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003711257561558963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16467108129832378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002492627666300806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.0794867877910552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08783103175339281 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd520607fc7194073b4f7011083f2c200a779b75 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.1452348716004801, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018745609056768366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.354637619733734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004310412338891105 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20366433099160464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002506589804140991 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03631887785067335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011535908212142747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09256525737403981, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029692107599791377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.051542487477497304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001622782922318877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11435270633499948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014769836849210006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.281332040329211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035948252196224143 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16066302558679957, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020034483691429915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.1148601995557349, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016090320139797805 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2832784836907842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038965527110954635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.161499696273688, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002187584389664099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.063408955221386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10793829466409724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..704912474186e7e60434a0357e54d0fedafd308a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14803725413512445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018733512973581588 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3610354652240794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004389296174890975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20750338741773555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025080223518937884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03764444720923887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011831632104640343 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09572292100591462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.003053817939922839 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05337831779753894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001659922934200303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.117013811273496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001487754889983635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.28688812754471305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003583873941321553 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16422047885591626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020030333826574695 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11725652432083132, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016010042579447314 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2890122736547096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003981206683676861 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16481921665835544, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021846730064098525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.0662695038715833, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11604502835868223 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6c287452a49d68746732fa30f751b9d5711e3622 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.15005408001279993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022275262688393857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.34931766468701486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004799937617922755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20491006822687677, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002761147760944239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03817756156834947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012790098539207952 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09333832339595143, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0031111903067984276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.052933674983345634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017273248399238827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11653214452758791, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017452636081294359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.27290106515166107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038631652216362072 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15924458772595346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002153213639538366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11841489997398834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018745424755303739 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2773235770549664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00417220414799562 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16175793225879814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023311720661425635 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.187340339866472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09337748219810847 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..791e487129f7a657094ca8204b9772d05d70e9d9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.051357937177083465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0034115765564453455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.08957854109700515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0050933482072906084 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.05673653310470254, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030997047693230627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.011627742228248835, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012945900748551743 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.023933291444758153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002099710700842424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.01418426797251855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012124277033053392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.04126617524992539, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0029705705175859863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.06969693031985601, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003989487538391626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04423726916636083, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002428377343138416 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.04252467493794107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0030391760211394485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.07216556113564128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 
0.004193160744760016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.045736152774752646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025402110740203494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.1685643584135952, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1951566077821766 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bf13340fea7ea802a7b1745d4a27431a91a76dbf --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.002815097659210261, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007550078312773055 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.002058439850730106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005489718025062792 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.002332190655682947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006168325099045509 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.000355815568079719, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00020880203605271082 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0003037449971412236, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00018804668706895537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.000325473526945072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00019692751752173486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0019160135596418518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005319338017808152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0014383859186266749, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004043038910360008 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0016104525385042783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00044591521297227443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.0021304217929180094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rougeLsum_precision_stderr": 0.0005877695890086486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0016128812346314246, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004569974976967699 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.001800332071457902, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004998974285323676 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 9.155484870335624e-42, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.4620320117295877e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd9c43f042165eb8a6338fc60ff9e3845d15905 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.17021358510341336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024373039382226077 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3520563857681371, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0043918973093069365 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21991779087563262, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025921031610263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03871189783801819, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013940343459980615 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0841631095984103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002861696063072104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.050742967235947956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016968813344534614 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.12632426569550884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001895537050144316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.262981285619352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034911247032713426 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1634042310463717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002018854408540066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.13144702322249674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001966787320258499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2761271441560725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0039020054143034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17070284614092165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021883919787213176 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0719375915266327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09381739892136316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9cb00702ae94bae4021984ed75ab3486298e52 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13144344568204896, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018269600623641938 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32351107047900046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042227591238073415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18467573869385082, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024560883653668973 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.027812878457341133, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009822890456735115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07174244190002714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002623646025136939 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03956915695649403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013969113381600835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeL_precision": 0.10177712280555466, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013642070339291124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2530115289147634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003361296390437375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1433445973017209, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018589035220238875 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10392255150260828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014996096643151138 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25806957261490177, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003642123743587609 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1463591812261117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020474760089749054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5179351179741758, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05443901746520919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ce1b0073c59c10320afd6e1532e284314b96d557 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1310122719121047, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001774463078853173 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32197642898756434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004128514222989525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18395936513372504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023841684164883858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.028592074721851848, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010146723977319822 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07420088979991933, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027242327105131063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.040730679478674064, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014445249170945054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10506688444375611, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013615120559406814 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2609222630071341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034251450207401918 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14791071749656537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018605423267814124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1039952495955331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00147131239785375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2579710996210495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036391736637925577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1463682832676705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020126152822738364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5610996318449655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.061361742735683046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7efd5582078c7748b57c94b6d9fa278523c6d7eb --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1284212760059797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002041598803417733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3028097339054088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004615535937102858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.17637720631581139, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge1_fmeasure_stderr": 0.00264797834039018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.026902495307978534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010371639486970465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06717388422150615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027080652362817086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03767895922224648, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001448043764579111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1032652428870357, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001609764469770287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24490235352225267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003766948264946798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14193633867939442, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002090431283179229 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10103180270559059, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016751299955245328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2403006525007405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003943762632099877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.13894288832668733, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002188039489816412 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5762937299614814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07362827411239845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c114cf2f4c0a1f6f725884bccbeda3474c2c84dd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.041468733586705914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.0025971833623889785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.07627594081632273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004409224115860876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.048911634240246346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002765697903406571 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.009048514108124975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012181947613622947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01699435587690579, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001625353914816647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01035393012550112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009785561170831303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03361424158025485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002192257614857122 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06101985571016838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035062484157730514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.03907075670579812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002182547417010533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03401363654888806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0022524026754152373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0612418372651088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035630181436990096 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.039302040446406394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022230245548452298 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.7513821615574038, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07905589981346285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..f46178bd94ec16c07fe21cb67f25ec770af1053c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.003214034101498322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009221332972255172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002755372289086832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007486991780011711 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002904097352119159, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008025657506213006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004553870087049595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002660277152615564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0002936371804296332, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00017278177414040624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003555930988203656, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00020825306662602857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0023980769070289483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006896763285113808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0020343184807463843, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005525851742486296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0021617867610418057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000600765218788033 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0027490625210994384, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008051771911473355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0023634542664537216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006464968490936459 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0024822527344372726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006926475370230472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 5.9664964945316196e-36, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.37663460549604e-30 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f9f44e152a810de91e4038d682a7199d9d89b31 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.15669364098257, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0023593186592923633 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.342374371356989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00437907614669953 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20603426954244244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002503133782526908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.030783905399410026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014282737935382643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06881031698129042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00251481083212002 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.040126835769534804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014339486984846503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1118036127326331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018294125382753313 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.24498174699159114, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033060394110144436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.146668869960305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018236466465731617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.1218098460935552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001924491999404007 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2695259798153811, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037470358825042837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16072295732233483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_fmeasure_stderr": 0.0020479535637844687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.6729531930201382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10373342822948405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f6da5e821b8ff68db2106997994cf5a530ec974c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14788910472235636, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018389112071502275 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3632558549763597, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004249174048494396 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20772684063267086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024574107856270774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03422805026149446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010929716541644215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08684664107270823, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002805600167681186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04851112854401421, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015396516708931478 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10960862089079661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013730570572271972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.27132452858310807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003336994048105982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1542221581796846, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018545206679636554 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11840342830898004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015232398298886394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 
0.2933510485102284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037203141797391708 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.1666793386233674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020689435117509725 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9783686260260467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1607197494307949 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ebdc6ca33309b987044fd359af3f78ab303704b0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14591017846113652, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018829813606970626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.35417878162606353, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004336112520719766 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20419784491514162, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025189481605346693 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.033450171038277396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011255630755179697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08454708459773036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028776060737512313 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04736923476037229, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015840684340098893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.112357445163031, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014098869920409992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.27457572511057443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003414209315676662 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15747382683561614, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019009865513009152 + }, + { 
+ "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11456495811701795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015800062086523089 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.28097827542002, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038590856032283075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16076832155716242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021544746480995827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.802744798920398, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05920522944838898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e9409a6de8fbd7aee616b9a69a58274d2feb0d62 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14153741165892486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0020927973441702535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.33105961459502653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004650854800318841 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19422961589893378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026855658611884946 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.031230937266789643, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011103593069502417 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07676489034975985, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027744764471766274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04353291741965738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015306953449422075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10809159570024703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001596172312046679 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.25411156370365817, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036584523222178214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1484909183587473, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020672313290265535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11098479481711339, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017470432310918238 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.26198935148766644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004040675001072013 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15261940522006723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002267767106091909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.7884006934583645, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07907574142987076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4d77276e5c6e2c91d4136598353ec97d525d5d3f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.04322202252738077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0026080541844490843 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.08145947706027855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004597925269874288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.05196928184468956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028568557569465304 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.009217798283394814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011515928095724962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.01852184368380728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001676813297073238 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.011212198666180598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009995094999208916 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rougeL_precision": 0.034066440428070936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00214328249850724 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.06359700022888996, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036377298500940803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.04056089023030101, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002247569677623111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.03534210706251266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0022064701869351773 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.06678926323593559, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038482831513349976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.04238835402403162, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023646209795898624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.858896407235953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.17813375842071116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..baf54cba062f0746d66dcbad7ca335560f7085d8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.002820827522912925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008241014146566761 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0023517396771233516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006955198818749466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0024619745886219103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007068894891377003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0006013749644563891, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00034827715329002203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0004582288780401988, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge2_recall_stderr": 0.000276736360710651 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0005008107704990395, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00029259545868901905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.00242534330718512, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0007134131592000302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.002077455090168221, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0006420787798082092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0021415097697000925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0006302603088867335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0026888839947429815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0007810704185094188 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0022737730468411124, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006780275889829637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.002363959396267095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006794393822266892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 6.730767350313837e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 5.433189914171102e-32 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..486f6e930a1afca7372a047bcd4b7e0d1d0b791c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.161357867973778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0023312739071830115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3579687540791487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0045453513416916095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.2156313325024613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_fmeasure_stderr": 0.002659844715910497 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.03816216737627533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014424649587105316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08755656958993782, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030000299859141506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.05124089074244038, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017434147699145796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.12090520179487235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018371253252162907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2697345298612283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003627327469111352 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.16164669459673872, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020770948236222796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.1263110602681917, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019426636581592435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2828419243164692, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003990709561595801 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16919936670173066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022655905836503737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.139796123448687, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0864912451995421 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d20e95303948d1295a91fa2d46b8284991d1efd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.13025847232053914, + "dataset_path": "GEM/xsum", 
+ "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018599814789393584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.31675206292581976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004227418702250003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.1823978250185138, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002494889694305653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.027325104152554167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001013865586628979 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0693976720523522, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026208374515650262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.03871722499957788, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014304008993493518 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10265566343076016, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013991604181805165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.251846472183418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033529856967831223 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14404400088909086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018950818451793567 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10154113890914356, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014807942774655064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2494785849680061, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035726690764623942 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1425722908096705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020191945274259963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.4741686816314712, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09394903896491678 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa06df8797c75478bfe02e0eaa23bd92c71d3910 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.12565737450340297, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017756118340194749 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.301251337982613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004081353108976665 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.17515450965695337, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002375995326570658 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.02725985494322299, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010257708768464891 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06822620908929833, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002624357062351836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.038463882894735665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014439165288524466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10562763001104193, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014305250731057942 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2550603689921568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034548246638929424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14750967049873018, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019352926679694342 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.09586724631609989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001432813745557231 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.23246625054729966, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034962734013648925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1340651218539847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019565808957794375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.4891373477876866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 
0.0811523557434027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..14d4ebf02351dd98961a3a3551f7e9d48361b6a6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.12133492152104836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002108052911078574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2771118310610424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004513875830290629 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.1652692842917284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002687544705180198 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.02620916447329947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011062000625487128 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.062338878703100085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002685465546018337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0361588906854937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015014165247081573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10257894420601281, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017009747933540017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.23624832791965056, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038268204334818137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14006875772839858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021980071144352764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.09313729488739347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001717942721208578 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2143290252431487, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038151394078170886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1270213480349254, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022098783786563886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.548542398410883, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07300875765901975 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca99d63c8840e05781dfc5548e3c05ad0a43828 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.03643719370186493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002327364890907121 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.06520608390203138, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003996161845682704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.042930043440632525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002579054260477061 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.007917882560037967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009203895438615683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.01504105351920977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015317801562663841 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.009582916301059853, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009704986418888822 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.030403765142791965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0019637434849462013 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.05414283521586396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033076203364920415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 
0.0355906132515583, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021268636919251588 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.028885184139424556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019316925983647367 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.050662384755815255, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031895292441193332 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.033437432695987035, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020528905348075315 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.6256689661157525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07517644959617945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4c5121317ba6f9319faa8f32934043d146c42482 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.0011435105774728416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006698929504092044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.00020164583843829126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00012654106887528574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0003382885458357156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0002086842561151499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.0011435105774728416, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006698929504092044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.00020164583843829126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00012654106887528574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0003382885458357156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0002086842561151499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0011435105774728416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006698929504092044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.00020164583843829126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00012654106887528574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0003382885458357156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0002086842561151499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 5.331681904215219e-233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7aa4818cd5c220b7963f3aa6b5346b8db8d7648 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 8.221264640592608, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.3149873650760696 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.16666080011920595, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005180719422543184 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.501863559860732, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.009052996765184864 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.21211270156193437, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.005814456021753165 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.12083386201007094, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.004471674153000206 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.3806965470945097, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008597586629652064 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.15599026594193496, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.005128636671935841 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.16072996710527454, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005070936585051591 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.48913954648097346, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.009013481487297286 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.20516494642415065, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0057181465206832 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.16227516864724326, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005116365519276017 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.4895892873231678, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.009023257251500322 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.20678919293239362, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005766515768579625 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..87e0e119cdadd6062b1f1912ac1dee7054054504 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 7.3767510595275665, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.31999310082457355 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.2215000417152911, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006137200329200616 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6794277393295427, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0066774062595464905 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + 
"rouge1_fmeasure": 0.26918713196522137, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.005971404751580413 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.1640201975855851, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005503885160452421 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5160107151183727, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008200440904778427 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.20124233569826713, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.005612862896818122 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.2140530349109963, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006041148360807656 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6614267819116432, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006904677170599779 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2608873390569801, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0059334583892896664 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.21594311252863366, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006068594142965329 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6630530525028233, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006882384898937598 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.26275507037270174, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005950371049088624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8b676ee7050de623c341f6791dcfd673ab5e6527 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 10.989723799423952, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.477322210631878 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.40990335902092456, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge1_precision_stderr": 0.008259416445477983 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6715421937524536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006833449843147891 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.4219856738666251, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007684410627868399 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.31002907544350694, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007809148641476673 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5179977182718715, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008225556639305274 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.3238830207743833, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00755763096612864 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.39649832180640104, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.008182538521597163 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6546910349804671, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007082687223632161 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.41063474482635426, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007718338436949308 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.39933775034518904, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008215822611471774 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6566914458481494, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007055405166544278 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.41268141482914944, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007720124425223064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6d80599f4c4e848cd37e7e2a6ca08de3e4fe7b6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the 
solution", + "bleu": 15.589010417058658, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.8454556706430587 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.5135186861705211, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008431281414525968 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6766130266106622, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006905636373535094 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.5126931501254426, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007956454706339308 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.4022833285618224, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008346085939704194 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5340662726903487, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008140102426002193 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.4057090348109076, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.008083151105761782 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.49968417841017443, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00841093839820918 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6615311276531707, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007119822079266015 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.5011072943138228, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.008026772259796028 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.5022494859348401, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008415389296058761 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6647270305912438, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007078089811566776 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.5031731006894711, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.008007126558781245 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_4.json 
b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..494dd8cd61fcd5e401d7a903ff9dd196cde4568f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 20.004791024670315, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.073121182179684 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.5635809903885909, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008037675550817377 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.675003625813131, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006934430967374778 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.5564854580202747, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007723304287288785 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.44094340540720767, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008336049944543478 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5353191759273204, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00816164151577736 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.44072198270594876, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.008143626907731532 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.5481052255869446, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0080708932692548 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.660534735042919, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007153754571776756 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.5439527567369827, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007832639006320164 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.5513165852733105, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008062156731256905 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6634891599008953, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007102976763556776 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.5461795795622169, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0078049936381923745 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1e6a0a3b673b98d827ed64ded0895e1bc5040f5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 20.541530615378996, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0236650658463775 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.5751633897784844, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00803435176216989 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6898127295322619, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006762762659826006 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.5698885703517087, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007714826040679644 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.4546948153669257, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008366514761157184 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.551114824772799, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00808762271709775 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.4560672630321141, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.008171926975401585 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.5612455638728635, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.008091161331947056 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6761124712337756, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006983397662954877 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.5584117042533167, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007831607552652558 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.563673777021132, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008077158591177138 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6788119556165896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 
0.006943167870722146 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.5603248083479772, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0078052598358510986 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..48a450331426ba6173b2fb06d4d5e3ae27c73f89 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49455930359085964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665133500637059 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49455930359085964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665133500637059 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2da3e0c0a252af04f7c059aac30e063eac45d0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end 
of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae42f410fb231fe390a3775b92814cf7fc77250 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.514689880304679, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011660788281735496 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.514689880304679, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011660788281735496 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dca0618adb6d204775b42cbf3abcac6f5d7ce6b8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5130576713819369, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011661845375886342 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5130576713819369, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011661845375886342 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b5590d9df6773454caaf114369485fe829dea43c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5076169749727966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664470424044972 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 
0.5076169749727966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664470424044972 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7d19566dd094c98a752531fb11d29e18cd136cff --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664055982032838 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664055982032838 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fbd063d55a987470108e9f31006e1cca97a9699b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1653247598984174, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.013821408415080715 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.020998722136358804, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005269403955655725 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.23228388738077216, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0041360813271565395 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.036532603399439284, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008337337093122241 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0030927874561869893, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014944861884636616 + }, + { + "task_name": 
"piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.0386031419214451, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018914429245019343 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005465981531797976, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.000254177372520646 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01917199471590868, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00045663889214525273 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.2161115069458247, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003797851830611 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.033434992892598596, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000727186829587635 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01720510228861565, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00042829503630886036 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1977154227864982, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036870557526130065 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.029988024300834148, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006784161057869979 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f185d94e7c6363bc40ce371ccfd6066232ec2bef --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.17208205991093362, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010301725465366625 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.0197808533467512, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006637399084174438 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.2083765310213374, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004054556695081749 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03341660597066338, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge1_fmeasure_stderr": 0.0008614668722471364 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0030943760026781223, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0001632745006225758 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.036249687836481304, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018967653687895932 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005423476681292554, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00027465476938201355 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018317454340834892, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005782023821136248 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19659411245081346, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003796357362538508 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.0310540742053096, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007532823292737799 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016131297257791454, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005516670663513384 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1776065844884305, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036553543805523184 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.027295011757363707, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007014745436916857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7e1fb3de6c4d90dbc8773e1d27026733b7c7fd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.13535549963348786, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.011883201814221686 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019974450931119343, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0009096575382126053 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.20327780750289512, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004138109405329968 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03167331762534985, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008149162454634007 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002913124216946272, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00025124507871194965 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03375022580735237, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018971946690473117 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004712505847751591, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002441580237771217 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018532639910942308, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0008013412795519539 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19189422414566185, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003826189608547334 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029502879197560596, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007158939543230575 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016551043276073225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0008244428247702725 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17348668108248785, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037039820894878926 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02594274780299628, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006803333166678133 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cff27299318e9c30cbd86b08527b1149a165be2f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.13494230997950132, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.009681091491781743 + }, + { + "task_name": "piqa", + "prompt_name": 
"no prompt needed", + "rouge1_precision": 0.01911132845089878, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006891049637900843 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.1986412875937398, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003917458930279225 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03186472547854922, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008686349358294461 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0028206675747611733, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00019060442122968908 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03252120345407891, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019655949748724573 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.00470586424330017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00027245099685908755 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.017666871030393803, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005944257003565911 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.18735984838330555, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0036829206769154214 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029595321454027084, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007712454590830175 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.0156124167337213, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005611219330915058 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17009443127881965, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035528405308160285 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.0260842453327782, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007064564831714856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3073b44a263a72fae6f64a3401996c63541d260 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.11573444053320703, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.008126260546620632 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018273055931518078, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006213051033422689 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.1929448451365874, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004110007510165885 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.030447754959100907, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007971323310690517 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002439514135649695, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00013971466675209783 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.033060693641110854, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021387293538605323 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004310884060921524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00023951170742409153 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.016854212838326506, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005620151575706898 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.17976699456632925, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038004506713869165 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.02807329381234764, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007021802242750015 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015034023539825695, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005141904845083435 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16691215660017894, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003747360877378185 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02508537471034259, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006359302429150094 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ 
No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1fd77d3a20c5a3c5b4da42e8ad689277c7ce7f3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.12248518122547715, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007751047228685078 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018924493431835766, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006692135261600222 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19719638092839015, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004120526129777433 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03135317378479231, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008162298173479573 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002778538269095852, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00018494204805708875 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.035803100604613836, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021320541894636206 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.0046971364054093695, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00025307514350424073 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01729661551542472, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.000579508348294998 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.18308780779648284, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003805938347899692 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.028760590162420697, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007251430881917033 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015684253519749444, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005565161139387542 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1712948423769965, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003765879887375436 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.0260467389410542, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006803550833664029 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..de72bca6917ce2d1cb9e9582df806c8b1e50d758 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6795aa9899e6810483d45bac93475b142f2a35b9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49782372143634385, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738873 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49782372143634385, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738873 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..27cbdd545f514a68a1eebc1a397894623e0111f8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", 
+ "prompt_name": "pick_correct_choice_index", + "acc": 0.49347116430903154, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664829595210969 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49347116430903154, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664829595210969 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2fef79885e6dcdb02a563c99e88d39d34dec6c57 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4880304678998912, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662480968070049 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4880304678998912, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662480968070049 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d2d80ea6cbf413775497328ea9be3715fd16667e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5195865070729053, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011656869979288456 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5195865070729053, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011656869979288456 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73c0c28e6aff461b1a3f930dd7a50491953fb286 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5076169749727966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664470424044981 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5076169749727966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664470424044981 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5ff3e9a98b310f99e5ac3a0585ab788e9b1a4140 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.559847660500544, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011581954727227395 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5522306855277476, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01160199979686681 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..930daae9857abb52397c901beb97fcc8f0683924 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5418933623503809, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011624803747232126 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5424374319912949, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.011623729421518132 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53a2c302a74e14fbe0e71a0ac4cc4dd5ac9a42de --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5321001088139282, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011641758014820126 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5342763873775843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011638380213532437 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b204517791b8cfb868e2ff9cc80ad7102a00872e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5310119695321001, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011643363511107457 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5348204570184983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011637500993815848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..710eca8025b5052e90d945d397c92a6c868c15fc --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5413492927094669, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01162586411331582 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5424374319912949, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011623729421518134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5a4aeb4b9ae59436cc4dc03a79fc44831848c9ff --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5386289445048966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011630956681145912 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5478781284004353, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011612217507379627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..69bbef93b0cd3463a48cf3674835f578a857f540 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.623, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01533317012577985 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.548, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015746235865880677 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": 
"", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7e307a321482c96d1461af77e7f9fad07bf168 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.698, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014526080235459544 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.686, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01468399195108797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f24e2d305d7980406bb8168859e4618d54553a2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.715, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014282120955200482 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.698, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014526080235459546 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1faf0df0f690944ff3f91022f5cea681c559256a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.71, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014356395999905689 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.709, + 
"dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01437099598237794 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..59f4de05625855d57bff6461d42d2853cd59dbf6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.717, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014251810906481754 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.724, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01414298497574067 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5beea29d235fcc975f1e591cd5c0285afd276c37 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.727, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014095022868717595 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.726, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014111099288259587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..143a7c7aae2ad7401d41c124d53cb6206b578dc2 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.876, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01042749887234397 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.804, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012559527926707352 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..86122f755c6133e78eae95432353ab1d3bb9ee12 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008916866630745906 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.876, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.010427498872343972 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aee582c2787da51fa3fcc1d67e7e201686adbcf8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.914, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008870325962594766 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.893, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009779910359847165 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5dd36608cfc5cf268031da3de2f5ba1f2f6686b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.92, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00858333697775365 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.914, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008870325962594766 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f08f61868024f496558e9a20b4a226628d5a736a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.922, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008484573530118588 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.914, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008870325962594766 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7631294b878563d069b170a7db7a1e947a25d8a3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.924, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008384169266796386 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.919, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008632121032139967 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a21ba2df6b163f068b59cd016f950a24f408e8c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.486, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015813097547730984 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.419, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015610338967577794 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dcc676c19b89c8efe5838f9e5e6cd6a7b87c1aaa --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.517, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01581015372983343 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.477, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015802554246726098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2cb51c3d4eb0e12164f73f4c5205b0e473e80189 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.51, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015816135752773207 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.48, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015806639423035167 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05f403239b303d6133f31ec1798a4a6683677b11 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.529, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0157926694516289 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.486, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015813097547730984 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3b1ac0bc5fb025053a6a439b4e8ad52b6cf8dc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.545, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01575510149834709 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.492, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015817274929209008 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of 
file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..026964e2c4f5cd618ae13b767a4d0befcd644e89 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.547, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015749255189977582 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.506, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015818160898606715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ab7b01652e221aa772e2274b224e07f72d5d6535 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.627, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01530049362292281 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.534, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015782683329937625 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..95a53c20accce368beb4fafc6b84246ac40416aa --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.51, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015816135752773203 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.472, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015794475789511476 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe6f9359500066219eb4a1f8bd12bd1a2ea093c2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.583, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015599819048769618 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.537, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015775927227262423 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..94ca399dfab87b25a1fd9e10b2f507080a721348 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.595, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015531136990453047 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.555, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015723301886760944 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..22c099c695d6eb6f6338cbf04bfe98a077b06531 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + 
{ + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.599, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015506109745498318 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.58, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015615500115072957 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c95f762e1f0b0d14062cd4aeed3cc8990c195943 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.585, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015589035185604623 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.563, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015693223928730377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..96d24cf22ef9243d2255a2b6196de76f047736d5 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.6, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01549968516584259 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.519, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015807874268505853 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f7a0ee99d571b52da9b448923b8e28d12faff4f9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.585, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015589035185604632 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.544, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015757928553979172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e58da4aedd990ebbd99e8c85fadd01071cf55d08 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.608, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015445859463771297 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.59, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015560917136921672 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6dc3a06c80b19f3a15d8c3d7c18e2d17d8e80df3 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.637, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015213890444671276 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.599, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015506109745498318 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0cb0f3b892fae4a173f39784a10a133df7b4319d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.62, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015356947477797575 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.624, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015325105508898134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e4130cf94c321ecfcc66005dcb7d0f1fbd5eff7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.625, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015316971293620996 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.609, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01543882629468179 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a0709d4a90b702cc0de573139aa4ade09109b5d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011551049647290307 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4975948690539818, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562298481438053 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c687a23258c3536fdf1473ac140fea304e947be6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4585783003741315, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011522687288692527 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4767504008551577, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011549925483927456 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e1f9cbfd6de39ac37ccc66ae5a714ea45c8a1582 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011541325320336618 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4740780331373597, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546883081384896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9351eece162ff01c5c06b5bc4c7fe7227fb3dd99 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011527657726586461 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62e324122af955e7c10fcddab52c9ece4d80f386 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4436130411544629, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01148867172507346 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4585783003741315, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011522687288692525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a2a6a2d04da790fccaff79dbc5b4fb7dde08e21f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4462854088722608, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011495517440721683 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given 
options", + "acc_norm": 0.4569748797434527, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011519544865928056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..572032cab02b550b8bae471e5e39f2a7ee7e8049 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4890432923570283, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559655791130727 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5264564404061999, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546234813777409 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b8b1dc8bc8ddd494302daa79c3e00702eb991f59 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011541325320336616 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4965259219668626, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156215314916829 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.json 
b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa5ba6a5930d802a42f75120394e679bd9ab6d4e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011535764881641411 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4820951362907536, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011555016408505476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..02427d2a21a9273b4d944a410f7ea5ee61dbe244 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47247461250668094, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01154489847386459 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4836985569214324, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011556285484521572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d28bb0f57433cf2e62b0665ccdfe971933604b6c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153229486915312 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4756814537680385, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 
0.011548748301487319 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2742e3da04ee3ace9ca031da89c01ac576712cc1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4708711918760021, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011542794417345717 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546234813777393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e825c36994f24244fd57c38839e10e3b8c46bb34 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9728fcdd9201d0c2fa83165c5387a5db1cba8c5d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b502d9896fbf2aae91dc0008133ca5c2e20d61cb --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e550c77183296480920f1def6c539d64536406d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..36e4d7b3dc6a8b89559b32eedd4ec6ad0aa6e6a2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..11a0331dd96dbc5e666c66818ac15ce7ee23aa23 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e40e0a8735f1ded89c22ddd66eb818da40be7e2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4751469802244789, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01154813982307477 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.521111704970604, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011552120807053812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e3a6403dfdfb98f22564382ef48c53bafc4cb7e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.47995724211651525, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011553138977961007 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4938535542490647, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156155858904076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8e0f0373d04ec1efc315632d336e23c24a04f552 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697235 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.48583645109567075, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557792331301667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..afbc825f79d68208777902e202e42f2a1747eaed --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4655264564404062, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011534917341355132 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.47995724211651525, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011553138977961008 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..354112655f3978d1362c406e728c63cfcb566fd2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011527657726586461 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46766435061464456, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011538227692217273 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..da67dcf644ea4e527b5c0096a57c4c992a19bd5f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4462854088722608, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011495517440721682 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011528611805439893 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64b7196390a1c9408d567924be4309d91a24fecb --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5114911811865313, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559378273599123 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5259219668626403, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546883081384903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..ed4a63a90d83ed0d70e923354a21be48d963e3e0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.48957776590058794, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559920087347776 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5136290753607696, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558135970599896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c446be0c145b8c15836ace8c2bf55d8bdc638dab --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4949225013361839, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561836054238777 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5114911811865313, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559378273599126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..411f66c109969bd313a2ae52bb2ee708f5eb2e70 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4938535542490647, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561558589040757 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4997327632282202, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + 
"acc_norm_stderr": 0.011562430600098489 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d81d4fce17de94fb166d2be2b93d9e77d8d77dda --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5077498663816141, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156104327886355 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5179048637092464, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011555016408505476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ac6869e177395b9cee07196dffad28e972cb7dbb --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4938535542490647, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156155858904076 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5152324959914484, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557065368348293 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_0.json 
b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f4ad69c146d2b0abfa672a8a41fd3f4ecd407462 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..76f072ed1a3598732a62eb32ef823ce538555cd0 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4d95e7d92bebd3825fdb9a421dca488c3e315529 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38c40ccf7bce5da99eabaff4d15fdc8b2bc34557 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366422 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3688c9393ac7720d53851d389fab6e297ccff7d7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029907396333795997 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5776173285198556, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029731622646495887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5e8e844789bb9157cad024074364a10747f55be6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", 
+ "acc": 0.5667870036101083, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029826764082138267 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5595667870036101, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029882123363118726 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..583ac6ac1422841b790ffedce1511a87f66141e9 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300727231673172 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..793e21b5b09cd6b674429dccad183e59dd262aed --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } 
+} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b21aa4a94ea1b0f26adca908f6b957277053716b --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..00b2cbdf364eca015fc8bfd02ff8aefe1bee6c4f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..23a660fc0de7c6a0a90d9013a6eeb27e111ef1aa --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 
0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..42a1b4ba93a4252b2cad517e3894de55a879cedd --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ae925157f88bd7fc3e102062d4139b5ad46b9811 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300727231673172 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7f78eb0f498441151dced0095b00be0e0dcd59 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..837a289c92cc20f89cf5876848406b60d016188f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8d21a2db61faf73ff1701ce616233ec8f4788131 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317184 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ad8071347e927e390073a485c362a0700c298e52 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..085cbd1d4deebe6b4563fe587cb1bb01fcf661da --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03000984891252912 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02997363649541526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ea5696fb3017b7266a0695e49c0201087d6c5132 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 
@@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..503f09da8d0486d75ca5a8876aa607b8ae5ec095 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fae0cce7381d5152ae2b015214d6bc22b9de5f2d --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..62327b3769d4f53c28bab142f61d033ea4c5418a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b80a3343ebb0b8076daa9592206192021d46ddb1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029907396333795997 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2d797be05346c88a926a3181c46fe0625703f5bc --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.555956678700361, + "dataset_path": 
"super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029907396333795997 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5b79a9bdae578974a92e82951cf53b7df6883e62 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0ec272b6c8987d761bfcdcf52c52235375821f2 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e0ae21a23a95e1bbd7193b272554c7e69355b88 
--- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3054218dfed75cb8018da3d8dca6e5adef24ec21 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ce3ce3755c7b153f8bc36301950a22606a3c8eea --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
+ "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..338e7b82d61d96acbafba73fa13d5ee73f7e47a7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec94ace18520f8fccb242af68de49a7c8798804a --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..270a94a7a777030e512c5c98c1d221399da182e7 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228584 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + 
"dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ceee10521f835c78f7dcc5c1f35f4930c5a86e9c --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497704 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404512613097859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4f39148e5bfdf3d8f451016c93484b917f5404 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014039239216484627 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.526440410418311, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014032823874407229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8debafe30db9c23670701b5d14e934cf25fb1319 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_4.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5224940805051302, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014038257824059892 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5201262825572218, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014041096664344329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..24f512ddfb33a64b6dfed69393d919a7cc885db6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404827882040562 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014042813708888378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b879196c85231a2522b4e66eb61c0e241b5bf0e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1c25b0582dd5f511f20184679f87bc4d350ea7e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048804199859325 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ee50593bfe5816d3e845b4ffa4ec3cd7c5c2e1e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e3701a44adc4fe6a7cabe0175fd940f2459b544 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049512 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + 
"subset": null, + "acc_norm_stderr": 0.014050905521228571 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e0ac0c3c46104c912e1df9a79edec98db21d566 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616445 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..99a41efd0eb6b3eee2c3f79bef9d3ed75d899778 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915852 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4a912034219fe0250dfca85edc015a46994505cc --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049512 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824192 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b06a9a5be0c236e8f231ed49d993f06dc645e77f --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076906 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48145224940805054, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014042813708888378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b91a2a944fdbc778f5aa639591b3392cc2a6ccdf --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529007 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330346 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a66a8a3a813b66e023e47227378ea57127cdeda6 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045826789783666 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57b3badcf888caa89f07e71752811f4ab47bc634 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367592 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..91721980198b53d40bb0bb85f26bcdfb8eb2ad05 --- /dev/null +++ 
b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228577 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bc3e4865932669b1d046b59052c3376cb6425809 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140492945362904 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a621f2447ebdd0cc674486e5284daf5527684d46 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915848 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ff2a080affddee882d16e9698ce42751ab51e9a1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9259243c92ba1bd9f152be7f016cff9159143475 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915848 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..94decb7949da8268116c267512cb317f79485e32 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824192 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + 
"dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405248130604952 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..239bceeeea7de7071b6f26aaf23023ba5dc3f417 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.48382004735595896, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978601 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_0.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d67db99484020276bbf0cec656a52afb69057ca1 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440419 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_1.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..18b550cd502cfecb99672dd698dc19769001b8a8 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.48303078137332284, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014044390401612976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_2.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9f9f8bd1a0c3b69bb0cf423340576a40c1d2abaf --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824189 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_3.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..00309bdfb338e872f03e56b27ffbbc8ef5be9b2e --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790513 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915852 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_4.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ece711aa10036172fa2e52a75f56a05b35715289 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076892 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_5.json b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3a5b9dcd638c156a453fe9f06cd1b0a6db5c5049 --- /dev/null +++ b/4b284b17bc4/eval/slim.4b284b17bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367585 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bc08c8ba16033b9c20f9c4b8c194a0952a94edf --- /dev/null +++ 
b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:985d759e19d861b3ec496049e94a0be1c8fcfc3e64fd34c39b92199f106fb96f +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1786fc092c8b9e835daae6680736226a4fd28bd --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5e323edc4c8a0f31e41c4bb73bbd1546a320f4a8037177998a79d153c1f3417 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1944109830a46b8bf12c6d8656160e98e970c5a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef70caee3ae22e1215be31f6e598c6b30795c414675633c8623a9c3bfb0712dc +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2e3dd1c1d108cccf593959e1b6179880e0207d9 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77e2eba12854a360cd2a02a65801d6e2277db44d0a11470f68d6a6529d5b3fa +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b79966cfdc3a63456398e727733dbb1de0da7d0a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2157fa9becae27b649434b8f8e9c5ce823d58b1a16f3cacdcf18559aad927a53 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7734dc61398b6919adf98fd6c6b64c9d8eec9eb6 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abb45cbf18837bfc5fd773dd5223a351384ced1876923b62bcf4322b5a24f0c9 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a9036abcfafb8906897b6131629d9271f87a17b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c5e21757c5b245111cef45253e93928ae3f8d83577fdaeb1d76765fdab0eb1 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 
b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..740ea00b4e80dc728c03ad9ad7530c6e05fb4f2b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff745a804874849e82887ce21418f170bc16a4eecb3d04f503384e9d9c96ef1 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ff0c0f9e78bc2ce8c7f9020a049c5c8c45a7ddf --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbd973b70184bc3514195ed90fb851d3747695231617c7f0f06d3c4791a25fc9 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f68f1e824e04555eedf3a3e04a5a03c9afa4684c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eacd15163385b8615ce07d6d2ba40baae1feed81f53780e13349654a5e05919 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c48529336a9397383e36258d6883390e5f445103 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:074a128a3482b2c7f2e6eb602078b39a7ff73984a4ec6c326ac53fadf1a47795 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a61641b96156d074c9aa4a3dc3b514001730d23 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:565df3ae68996fac4b2a4c7d773ca5a8ab0dbc4fe95638f4ca224d9f99d8582a +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63ce05e250317f66fde30e62258e708b3ca1ee1e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e202b80432f2c7895206add28f1ad8f4f20fe3858ae50f9a3778c690c8cabaa8 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e449c2b56243530d804564a1b5614e7a5538005 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5fce3912e13db92272005db173a6d6878b1f718ccf09c8dc25d25bfcc8e7e735 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f22da484ee615f33a3a79e50665793f0b78f67b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b18e866d6f865f5c50f3112eec680ebf4e13475868ebafd6fa4e8d4a4b74ae81 +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98e07c2f4da0d687a0b88f2e129b16c2eb83be36 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c1da93d43db7de0be4760cde15a6b06b659def2dbe2319a3006326d1573077b +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34c14d6e7fe6fd081243daace13a6978cea406a4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2508dd36ce979c8d4d1ad51fd3b478a74ac670168a5959e66883e91f4627326b +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6458ff5072e7ff9a6c46622d03e43a6e4e50170 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba1454bea3bda4a8bf1b51ddef174630af4f3da6175c436048dd6f1a43f3c06 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cfc2b55f7046051b179efbf420c477c417e7d8d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abda2c3cd91ed38544a9cf9c811a64b13f396f5e25b9c0fd7ef2d5318a09296e +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1063d0075fe59066295e06fe090113ca46dc0f50 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b050d8fb04ed1e713d703e3c3317bd4b3cc074f697aa8de7df673420d43e7bf7 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..32f329a6e4957b1dee09917f6861f1e1bf398f61 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c404e5e4014383570f85d1172f69d77ac7fcddd7e985a0b58b7cb627d084ba54 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..389cb9d5f3ea58b9dceb3c638ce631961ad7d7a8 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a717e79d2b55445d0c9f0e10c0e98646a6001a90f7df4b79a89073a0e49c725b +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33e01f17bd1eb2a58f79d512c3b119cdbdcd1a30 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc96c74450d498382d8c8421ac745cf5016cdbac35ec66dd031f0698d447ef1 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..115b8a45d19519b2826416f542842e98e853e7f0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc3cb516e84e54950af1d2cc0f8cf833af7f3148fe50801a5c8f95ea4820b52b +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..784136cb80767646d2f80f9836f6d9cefb97735a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04e06e5db229437ce5f67632c0c0bc312ce80cee7b355a97d7b8400a451e7f99 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e7df0ca2d2cf321a2f92f351476bbc159df9b00 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e771c5d0f61129fb56a76a7d58985744383186cad426164f95d5a7c2b1e0e534 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0014769b0b2f2934bedafd7cf6e9b6126c5f2ae --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1058940e3c553da2f8e5cae6da17c17f763e430ec06cab6d2180de4188bcc25b +size 199058797 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a7b3fae24d38fef989cae73a79a20faca30f3a9 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c85d832e8325a1c6b60fd6d717e6ffde36d82ccf1245e6b9b93daf05e9df0e3 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98d64ffec82c1c7b1684be45e1a69d4fc8fe3627 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:598781d5e10d4b53ed1aad38f571bcaabe425473aa6b142c4148c9555e317f3f +size 199058605 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fbf1292ffb4cf560ceeeedc9330b3b2031d1edd --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab8124d6f6e9b575ffb9c72ea953270bcb69879bceb2b94bc9cae563daa77ca4 +size 199058605 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..862f72429a4fece1c9a9d5e4f1187f35d74e0a50 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149c916420f6f0ec9d2238f0c8fe0aaa9a853916ccca39555d787ec0af7b212f +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a5383122dd86ead907ec2bcc99cc0b8e4ca059d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7764d59300e0608e6329442fe8b7265b83e6be4d401d077b4700f18cb63987d0 +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..badf27818c2aa9659be3a0baa2ab2c2e9db582ef --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b392a609ada72f8d4ee0334dda92d5a09e562cfcb70a272f133754122cc8a402 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1290344b77e5312ccb531726955fe917aee23bde --- /dev/null +++ 
b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cf01008fba7d1c6254a997056e2ca2b93d3bda0e3f07ba9cc1f769b16348c79 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf4ffb7c649c711e7dcf8944acb5cc48d88f03c2 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a1b046e6e5d4ef0229da19258744f00f893f77e84b3c95ccd66a542ca74ebd +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5ca351d098e0a71d27f4a1f76d089fd96dcfee1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f37f25f28261e624d88610a371722205908a6dbe69c8704d71d5a7a13da8b4ce +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c12e94676b431564c9f2dc94c5c0d0e167723a09 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:383163a730d7b4d7fcd3661baee5aed8dc6babe7a1cceb3f2bde267cd20c0422 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94af1491493ced1a8ae68114c5d6e8877003ceee --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b6c236bf78c7e20bdf138f791d3c2ae3c9357de89dd1d88061cf5046874eaa9 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0297ddbf8244027c4056e701eb39baf33c11c1f4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:143c079922c00b4389dedc978827bee00aa5d506d6007e420514c5cf9928dc16 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31592d02aa3fd098d2ed2310da8ab0528032279a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27ad346f4914f855adc499e20af2453575e91ed3795b49e37b2b932d674dbd3 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 
b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0796843f3128d4d4dc9f93d7fd82f0d097052655 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4732c2b40d40d61f9ab99c133c593e5aa262d039a8b061653534a499130c2bbc +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b6952edbaeaa15c9adea483803b0235fc4d02bb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24db4f32d2749a9e2c724d54741a4e3a7adbb022fb01e87315440aae53573622 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea57ae0bf344706c0c93201d66719b1dbba5b7f6 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eae16c8cd8b6b7a6e420abebbfa1caa74aedc5e699a29785aec8991cbd5e25f3 +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ab69b42456743dcc541e8c1e582db029928bbe1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de635c5bdae2da30ad009a4fc35484f5d0b5e35feda182f5cde704bb4426f724 +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b83d5fe2b05917fe9d53c47850f9c944ba7c67ef --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a76b10dc4eb7657cff39e3ca0125a559685a55aad39c18790981920d1361e9a +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23be396d76fa99728a09722adf13f26fc61e48f7 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6762f0803903b1cc71be6ee63a62a4903251f681cf10d1ed295aeec0b483c42a +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27dc63d0bbc5690f96d886d5029802812b2cc381 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d8e74eefb5605d8a13669b396f6c50ad5b1818ce73d98a1bee387741f071185d +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ac85bfff77ece0ca7d9f87e4ddac4719d6dc472 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5798101907519eeab8e442df54795f8977591a3dff03e1eb1924f3876989f1a4 +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87e0d15805e2c5e1b97a0b1735511b659b420c07 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1ffea2252150f4c9bae9c7d95e8467ad4d1d373738752e386dad01155e00f80 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e838bab2ea3907bf8ed547405487cfe91e7962d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389c002dd7c1ede637fdec54bc8cc95cdbd23281c0e9c06a53c858690f706fd6 +size 199058797 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c8898ca5f11eca60b2cc4eae2d97df4750b6f76 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:795c2086c503361e3a6e7c67f1a1cb434d74e14eb11a4376555690d2170171ee +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..571b3d80b9e3607724493ef5d339132e782e21a2 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30ed6551b3669a64bbcb4f8b44cfb534dc3dde09824c03d594003aeb0c23ac65 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b904a1fa9ac4366ac994d625742883b39b4d57e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbe7cf48f01ac73224055fddc9b74db98434fb5c12306d90a27f92f8454bc59e +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..76add61d8ab9cfa4e7881df616f2fe5c966285c8 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297714bfb5ae5f69f4ab3946bee47071cd525dec1cb93ed448f94d400d955d55 +size 199058733 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d95392dcb8b40dc09251b8aab578b9d6a880614 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e21255ffcd7400a25a1b5dae0d4a1c9522a35ae4b6f4358cfd023731944f67a +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10ab4eb3b493e3d01856a86a263ce01351aa9e8c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff46e0d4f2688f421ef3c59de028146f104e155f994cc98a27c4ec62595ec12 +size 199058669 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a21592c5ddb71f37c982d36e783acfa34278a0a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17c49444df660845d8b61b561757c0cba785082fb9a5fdedc03dd4a70726d73e +size 199058925 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4377289d618096177da897774c76a26d0cecd6fb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76d0e97c1ed0dd8f71626505735ff6b6853e8422e619b327c057b516a3a6cf6 +size 199058925 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b80f85f01c3d422237176f5a12981f4fe327c7d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9c6fa072378eaeb3c2a17f54d616df1325d3f79b043b4b7efa22e149820852 +size 199058605 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37bfbc3c0cc39ccb22573b7f340a65ec8eb9a26f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a884f56ba8a9f7eb0859e097891b70da14701eec4023772a7b28f88cd32a2e3f +size 199058605 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..344a6e5764e2df483c1551782a647e3f3b1a0c43 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:963674a0aaad7bc72792380f04db7bff9151210ef70231d28181ba43bf327ddc +size 199058605 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..593eb664f9bfeeb0aa335d80cc04380a00885aab --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:844280e425e47674c9142c8495af15d91ff1654f4c8d5258ffc10ec15b69df01 +size 199058605 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e72b3160747a171e75a5e1d4452592ca2ad138f1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4711f2c88a6c6f93c37ff6db16f8576f764201f4d2a35f52a1effa4827d1e360 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2685588597c4e660cb2c8ac21fae6899eaccb050 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34facac28f337c299280f0b766cd3d725de20199276676960dc62b31929171a1 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2527ab9568ba31c8c7556fcb86a5426b372fa115 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bcf2d9a8db82cafd624516a8caecd465921bd59c90e6715fb8e7e754eaf29b0 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc1689de1a71b68c191157d1f4112f3c246c1824 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6bd07c1a1cf71bfb65ab38caa022a63d4fdc7c88b20f2b2378c8645a1df8aff +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..873b3859554cdc33af80880ccaaebf5f67e8a1d4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4cc20ef3b8e6a1d8b5a77e11ae33c79ea38521920f356997e554323e572a7bf +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..098d03bec8c3280d3cf3ca424ffe5b3ef2e14027 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce0bde9e092b74798423c38a7c92957ffa78baffe240049d156907c7529404c5 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..898cbe0a4c3cda05eff301c6ec5a87fd16a7513d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684f1823875eba070f86f23876d8c448f22d2472f6d8bb86970160447cd708db +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..691f9d08aabc28a6dbc591f8536ee140988f3fac --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c2c6ba071560401315ec38ca0cc6f9624568204ef7adb80e952d54ee3606976 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2213641db0548d8286c85ef10b285264bf5ed7ee --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39627a7173e4cb31bd63cd96a1117c4ac5ea347e77956fc9fabe30861ae38f9a +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffe278c17b1a8384c1fc25bdb67c0be5fb7ec21c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0363175abda4c75f98aebf94d13477c2e4ba5074cc38ff7e1fe668d24747fbf6 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbeabad89b1d84982b9726e8f4cf553e916105e0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f34cb197d34b8751a4ab1fdd1e984787b710cb3da20e027d7119cd550d9427af +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..6924c90c9f0835978b3cbab730901c458f0673e0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0313afafb9b76392482c728138994dad07480ff3142acbc74f0a4ca7ba2412d5 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5940d01748a57218e5ba49d9e8cfa1dd43dbe452 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc30983a34abec888f85357fe78f43c4f1df10620314dddb80761647e5d0114e +size 199058978 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37b3cd1004fbd76596b875e1426e1e7db8e182de --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01dfbad1831c5a0ef5b087abe9690d7e4c5dabd9a6017a2282be28023fd55e78 +size 199058978 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21f96cca935c3c92ad1fe6fa37c464464a572211 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6395998c1dec75fdb8544a99e87561afdb1b294bef016f6ab986889860af75c +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46e6f6eca559e064a8d796d3fc2c5824856b8cf3 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c450cbfb54c7c38e9bfd55a949a5ce9b74d5ef3ce985d7a33df4d9f42036872d +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45be582fe4b86294ffa55ae2ab409ef4db83295f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:569b0b5de7a0aa392d738f0ab745093f069717c1c3b2e688e4e5ada495aaf561 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98b4d9967ce9b0b256d082ac28a26f5e91246bfd --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38daa83c5e23b7354246a7ebb5358cb06a73d4c64f1ccc60badfa37bff94a2f3 +size 199058647 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c29e02c801800e01ea5a454d80a9cdda6892aac7 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b79730be50ae507bfc72e324c785583bd10a90f00f3abcb32d31c7a8c42ee5fe +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24fe45c7c99e266384bdb9448d7a39fb2f009920 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:768fbcafcf106d9282fe0f369088d3dd2ab667b44e2ff2d45013c6858506a05f +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d34a4cfd455e09b412c87ac7a39c68645fd5d36 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d3d4702c39879105edb2ea2d3aacfaaf23f7393ab796b15036a1796aa6e3de +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..210353d2b68f057c0776ec3568d631eef00d6e01 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b05cdfda015a8601e6ff389c9d255fe43776d958f9c4f2faf5b555c7a4a9d78 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31b8df0f004c75a492eaa6945e1bc28743a12da3 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47354dd2d8d43e606f60fff4884e3065f2a415a128341fcae4262a743e031de2 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f653269099a5165b854e45165ced48bc2ab94de8 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99d36c9bed2d766af397acc84ba1da05a9ebe53c1eaa6f08e51fcdabc334c922 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3001eeac1a9fc2dee3c7e1c12b905125d4931d0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:78beea594421f8f2189f1bf8eff4de1303dec51a7de7e63b1a2a76a88dc155d3 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2644e27b9637b5d987928b73684681ff3d47c66f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf954fab8b408ffa6c6a23632830619c009d62e03e57e46f345abf1b2d7a332 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..856a20bd93c578b4b8b93bfa9075d71cab1d9a4e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b46f5d1d842fc13516931d5eba621105dc20dba471296304f8bf1f4924804e58 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7dd7b3db9f538d417ea7db5edc45fc26d443b5a3 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b5e693194db7bbc363eb75c867d8e1dc79a042d944dd17620fb69972b2f91f +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07856645540071b612e73ac9da6db0752cd82ca1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aff3ae7e424370448e3d887c1e142e351b12691701d2e3b031ecbd18fc493902 +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5902b1a0c091caf361790c2d1fade9285bf7e3df --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1629a01be9cc9458d80fa388d78292084c3a85a928bc10c35e7cef3dbf91497 +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1ba37415499a6b6f58c4f5981f4ff907846a799 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4260aa939d962cde05a9f432d30b3f355ad200950cbcdd04590c432ee25fcbd5 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..af2696c8f2619a159d75436753cec226a3bb0705 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f7fccc413f43aeccf70b0a5331ea5b400c8bde2d3cf6362325bad2e9aeeee4 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac18cbd8128855d4861420f140830e40c1b203a0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7dd1a8e56e56202da82c9b6c297d3803eb5baca2dde032bbda4d6a4e6734c79 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44a91a1a8e124af8f3701797ef0598d8860e5d95 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06abbf4d98be5f7908d17dfaef5039207ae5f090df1c112b141fac5c76758d7d +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b1dca9c0880c3b9d505d781259963af1c8b1613 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91947c937e87d24abc0eb71305a0184c766cc27029a5ce236e9ba5d49500b2a1 +size 199058594 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bf2e62fd28a4f0a30df8fb8e097ecfd8a002352 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc86927678735e978a6c72c819cfe5fe1290d5613264724279035188c7b866f +size 199058594 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d672a5a4ffacde3f24fb19bbfe6175cf9295b6d5 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aef93ab9146062b6474dd7ada938d963ebbde14ba19de16cac041a6214cc4fa +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..068ed36cc8612d818bbafca4899b19e6a7599809 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8362069183966626e61c0277b051b2e925c39eba61996f1bd5c9f561d0efa8 +size 199058786 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f51915e7858705e79519ec0f6f8df117c6a7a2c6 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9de4bea1e9d4db5eea9e0bb681c3ade3e8c4cb5dda2a543997352fcef73047c +size 199058711 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd9e33cffb8ad65b4d9dd1e4963137183c7caec0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd1d9457bda70a9bbe7f2492b1f94981816f4cc9ecef9f51b617c11ed4c02709 +size 199058711 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c60e321d35d46e04e9552a8e2fdb4b5d8ab3155 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5814d78585baee5796508b6b9f06cbcc582a4cd56e780db1821cc203c5f3e69b +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73493fcc6048a1fb0a66dd81e73243fae490dd06 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c63437070f5af19832f99dd12c20b50c8de67733eb36760b9f314a894c2b159a +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f36dba34ddec8d4a55527224e452fde5d1afa1bb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8bb9f8d7a594a9fc76563bf8c1e493cab6b4f41bc9904ba1498dbd15d802b9 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ab1148dd4968381cfc3c59675f284c5237efb1b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09320179709b9a6b1b95fc6c14d61117166d8690f6b184aee130d4ae4142ccc +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5773fe10d889a69668c3479a056f9201da2764d0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:07d53607f90986cf513aed55c822b1be00e33bca9b8f174833a91fc96d21afb6 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d92e777da20a63d74b4a3d717277b3599b6f6de --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce386a040a923a687d53315ce6a61870a034d2e3281163537df7674b5868d911 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68481c4f70eb4b517b629146825ef4017833e42a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e1d98ac957944b7e5415e8cb2d59377d21cf076eb9a7732242266d2bbd0e5e1 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42ed523121a5578594c22c51122b86b9956e7936 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9845ac7b0ce475068dd60c3383de037c9e114836d4136a228a12b7c215a59e +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10f5f2c54aad24431ca3219320a78b45e743ef8a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a91356b950fd9a36cf126a93f1ac7f27d82bb46aa06e7372e854646ca12170 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e5c113e5747a9fd353df18601704f6ba91e06a8 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bcf7588fb2a8bb3f9b7a02ca7c949840f17510a3ac54c2cdc0e4338a6b6d03 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8675547c364adde9b66064b76168e5b1bfa541c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9402f8193a52142cb0e0907123b74f02769c86b0bee57648ffc020d91ef0d86c +size 199058594 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..f16509bf69e388b1ef0091304f618c9b8106c94b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c6b86cb63cbbae4b92b66284caabf2ce6cc167547c4ebc0a491e5f3069a1db +size 199058594 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5912c8bb0f3133a28081193469540d815b402aae --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70fd9ae06065a5e5070314750fb9836dc76ea243701bf100521c99288d82d6c +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4266052e696dbb605f38e68e1f39036f2eb2de0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e48d54232e52c92ca66924f6640cf3531daa0be22de8ff1ac295f285dadee4 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bdd59536c2f1eb7475ca181c96bf644da28d182 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:960d8fa225cdd2c36fd8c7f42e99e49b6039ba7eb1999e16609ce11c92e6c31f +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d0a380340a5fbbe0bd4c9a3fff76696b4eaaa77 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc71efbd69c6ba960bc1021ae62751dac3dec070fd3206f419a6af0064c583a +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14cf26f705e17324d0f43e9f99a7bd74d5c839fe --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c965c4c17985d533f1ccebce71b911cb0f10eab26fa14c2ed030e0223f2602 +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdea6ed200ccb1f93c49554472c5614cd11bf584 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b614ad03b6b26702f1807b468eb412d072d819ae5560b0694c09b925e06878 +size 199058850 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a73358939c4a7dbbafab4860e8d127edc2e8cf0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b87639992f90385477f9ff3ad3eda380476cf7c07df74a38316a27cdebc9e73a +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..988d3f933ebc39d66462cdee0085beb524ed0ba0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc3c321722b4a513b55334eece79ac8ab52bd77fc5c0ad7fc6ece67094b236c7 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9edd27eaf8ab7e9d644e691264dfce3714c8166d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7bc85a762530869b120ae85301250fe0d2ee0b8dc04b28691718aea1989da4 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..746af046545ed3ecfc03e9d19a827d0458a8b6e7 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a293d21bf9c6173130e517a3edd6837f5ed434d81a480804fd13df3df9a46ab0 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2eb09acd62170276919823aa17e0c17e6c09d0ae --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1592900fd98b0d014b09cd86d8e088aad141cd984bbbb9ee861070efd3460783 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..993d40ca851cd6af339b3be9dddb27b94e7b5964 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e199550ea47e284457a259863bd49c8c8fdf63f3aa5b75c324fe42012a44da15 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce3bfcfe7068116f15d706b0a75a173e3c78e922 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:08bdba15d728257a686a38855e3efda81c1982e531065ad81898c9531249108d +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dff2023df4db394935fde5f862797bfe2e0b66cd --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20263013400ef9f560eefbb6c3678bbccace7719b4e30a0d045bbe26ff40aa80 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06307d66b0a60dd72f964b0b94e87b6a5275c72f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce30da03bf2af95de0798f2cc45bc857de099c48e7b5618dbdab3307377f079a +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f09970b650632ec196d103916129788ac439e717 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ca7836eb61fee646a3b79d51d7bd83f03e0fa7fe3463c4def26fe0b7e8a3cd +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d4b57f5e8a65229a58588d138cc406841ecc66 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a75fa5b0f55839fc1d9255ce0e14376367109535bc52848d9f73eb8e1cecac +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3910d2f8dcd87c295f2724e51a9191a0857a8a87 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b066218bebd4109cdc8c1b5a31c44590ad417a37fdee8a304f7166a8fd6f2504 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4deb7b925c6c8a70ba779d874fdacd4d47d2de3 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c1314f5f6672b158f5d3bede75d55646e452ed4e2c7e0f345209c6b4f02e5ca +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..dab0a6ae0bea2cb3cae2e75a228a29fd441a5ae1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49ac3c7d69f0d14c940fa1902117d1e7825fc23a211de56cafed9b8798da8a5f +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b70e633ed8fae248c077fb59bd5a2a25f85dcb3 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5ec4046c1091a14dd51517721cb092d4fd368d139d486c2e81121508c43a14d +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e54de8ad371ab787f8524b5d8e03106d37a16819 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d3ca4384a94f73d5d7ccba461f8b78e286580d60ce6c953bbc2b1ecf8c72ca0 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84ebe0a026afe09723a8bccbdb8fde2c2265cd8b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44782ad9e5ccd20dfa3ba3dbc67819f0427bedd39897079e0c0319be7b421214 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84bbc1e5fed8c6337c25fb6fd9f1fa50005d9ddf --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb45c94ea99f1e1cb1ff8ce089e883b2fe43abb06661159cf053b5d9bb7f0cd +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00c2ac1b7608b2860bd81120dca1089018e3b6fb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31a7fa09ba025d120386920887f6b093327d40eb932cd5d9f80f9e6784879f2 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b68b04a42bbe286c2c90b43296a77d3e0ca5d7d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7fa07e67a23e607d5178a2e7ad6fbe0b11538b9e787f7c5053861181e343b05 +size 199058722 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fde1f10c1f44bf9dd146676ce4953b57b486325 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be943111a7d3fefcf6f4bde891f6a5f80f19b2d837308be3d2fe0e11e4c0ddb +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e22f7990504a8b5426627991d3df4265cd7ab450 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acc6646392ce3c4209ac9a11aad077e711614bcfa5aa512163eeb9e8d859580 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..344211c4274cb5995ea8960f2a5ea198b4d3fcf1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65cf26af69a44eea6c26b089759d25f23aa8f1c864e0cbea7111239d0a848043 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa2b88936153bccc9b79bf7989c591372ed77760 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28642eca082cb4a86571cab4735de42f10d13e1f760480de42c6c5b1868c8281 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8763bae6a6af0ff41f74c9841f2b792928149c15 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e4a1cd93581e3f730195271a571f157aec90b717071290c4f80caf00c4e9bc +size 199058775 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d21b6b5b4d4303665bb62040b40d66653d2b06ed --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2811b4a1093f916d18ca5cbb549d37c7ee5e003ac7d424acc20bceeca127e221 +size 199058775 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7075b4c91c7b5c0f268bedf76e756680072432c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:2a0fb48f1da14deea9dd3d509326d05d22e5f6739c5ff5d6c4595771d7a1dd31 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bce21f5c581af9d2d594f224f2d86cebd67963bd --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e191ae16acf847fd55f995e87acb979f6ddd6ec1a3301d10e209b6f6193eee5 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6710e0402c791a11c93446998ac78f6d8d7ad863 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b65cb1217997009cb18b1acd72bce74b426eb51d322fe1c9057994f3ed27ea0 +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26fef7adcda1ccbea3a89ebf7c02ec934888e146 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670dd5e652281c1716f3ceedfebb4e4a3890de213b50226659fb86706990733e +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00b0c1cca218565d699ec995de4eb642bd764548 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b8cb2519e47b38fa08922da2f6b740276d51cf6e88570a08b5593ea46d00aa +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd07287f1a303ceda69c0ec3475971b571fe5653 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5782644d2c3168e1fd654621a46e3ac1e30870a343feaca17cbb346787f6232 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d6f486d815752320ae8c71eb19c222e5546e89e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418cd8bfc9016a3125f7a693510791188724d65d93f6f6e62c1297d560583f56 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..28f1f1e82331bff75df6542a7f651fb33de5dfe8 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c8fd1949e0aa39f6d30b00c3b05a6c44881ad93e12c205748571be2d9210e1 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8832917ac5bf56426ae7fd8afea5dc06a406738c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46cde52f889d193afb3d2b9e0d345fe2f9457eb9e525083ffc4d73d4600034a +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b972cdea5ec185fda59c7ee149ab85040d1db09d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dd45d5bf566c3cd40114f28fea5bc44f0380ad23bfd0186a36b777c4775cf90 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5c86d42c53882e1134b15cd3957cf2af18f6f01 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff87081d43384a3bea2bcfd898b7690d196038b872ad9d53fdc0913ef695933 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ea05cf43e0ee901d50b2f50d6cc0a46f3be5a6a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8fabbc6036ea96d48ebdc26d4b7c9ba26020af566f1287cfe1c497cd860643 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9860c13e4a26c539f187326b0174f66d9802fbfa --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa35adb504cf11fa3a7eab5864dfdccf2070b8b03061d9a50c6ab3e191090fe2 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d9ecbeff5c19c7a0414023b4dc82997dd81cea2 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc9e3d43a06f667eb8afd7be16d27b51e51012a818ffd64505f5a6401fcedced +size 199058722 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45622d4bf11f1b0876173f90b33b3fe86bc43982 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22409df037fcd83df346aa53fc11ab810ee8e2f8f31d00642d14e20841414f57 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03ca84f6aac91d660a4d0ba5b8e0ab1fba06ea43 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5091a0120c51ab9ca44a941cb977b47d10227145909fc6c66c6fd8fdbcc6ccf2 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15e28f358716d96e0f34087956a7e2b2115593aa --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:946f962a5dd5c8d16a6a85975528b3696c8b17f12dcec67d1e2e8ac4d83bb50d +size 199058914 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..538e0d0c6b27f8681b4ea29640f5f4facfaabfa4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:447d63f3611a8ad16e92f854c0e42849c9eb53728a22d8883be7f54ef6d27e63 +size 199058914 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9229bf5fc1670e297f7f8f9467c3aa6cf412a2 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2956d588af03a13855b985639b6b30be2d35270bc141ee64d8ffd37585f39af2 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..300bf4c1556a9100cea89bac80b12002cddfcd93 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728bd755f0bae390b109366899c1169cb1bc17ce6d3c9a80766716a16a43b5a7 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4270b76ba966ef23dd4a7c16c830edf84495ea6d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:cca4d9a1396e970ea08af254dc3ef11916853c3ef430548a81124651defbec85 +size 199058711 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c31eeb66b47fecec1db4c2031bc22b82bff9287e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1529c9c21e40d929f09e34c919f7f15e0857bb17709cada77b9268d5dc48905b +size 199058711 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6265d5c3a22dff54e261241c7efcfaaa61e86e9e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa53febb60be603a70461d3f1bdfe5451ed4fef424ca4efcd00710444956e7d +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dba28fb51baa97e33523e1c610335f1ab20220f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98aee525033f0ea8ed63cbda142a90204bc3ef089163e5bd623a9ed9639d1ec4 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84b1732f745ac4c710115be5a723b6c6594f5960 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101e10f92391141a8517f787bfc81f31e6e7aecae278263b134e3091da8cc7da +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c934ae124992ccb26d5dcc9fff52d2179e02f4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee9c42fc7bd68145fb26dc31248a10aeaf9624397c676392287f6578893cadd +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae599f6fb6c874d806759a1ca8ef504ed4d0ab79 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ae355707afb1ca05c452d3f66c04b4392e900ffd5daddb6c36a939dc742bfe1 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0466d8a5b2c7407e30c302f11795ef50022d4aeb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7026658e0d1c0a59cdd19647c587f451f8adb1582decce5d31809381e137fa1d +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..627cb74cfc2acb4a9ab466d30ccbb698f1818542 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8f05b6e33e7a3d113fe95b275071f60b7477b377a8ad9438fc11cb0e0578ae +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9f127d47e98d27b77e6ea4958ab047a1af765c4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7e66488a351e4d51da45c45acb89a572996e2fce50d906904f5d8d90601e69 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f430261bbd096598cfbb44c01f6c6339953de948 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba641a781b9d2785eb8ea12c4c3094c0beacf3759bdaa88f57b28086b3aecf4e +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53e639fa4a033c8ae66c7b1cc2df44be97de4fc5 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daac7a0433f2d1ce9f871888abb75741671c794d637055fc3bc67c031f2d051d +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..541950e58a481049184513f51c4799a477bcbdca --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa18739e0d9c42e8bfcf5c7a1c7d2aa0f775bc30b3d7eb7544f1d06f179df43 +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96ca7eb28b53e94ec0acabc4987fe66bc1126754 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bcb74c09d936c64b4bad50a095dff7bf3eb6d46a2151e97bf3e0b891633342d +size 199058850 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e781df892718a47dd2986fa013c4d2a6d7c48520 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809636985551dd2af67a46940f80f6b94022a96e68a5a9d2438ee088719b9f75 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a35948537eeffe832cb08bb79c5c1e4531eb1da2 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6c0052c714d1ecf18f2220d1ad345a91a4db56ccfd6e3b766b28ebaed992d8 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4a2c44ba29b0f6379d8e1fdf316be4ba86d3460 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47a25c15d3ceb474fc1ec23cb59e35fd2d5e362b77cbdea321acbe43dd15768 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b2ca06dc9ff396ae887225325ab9bcfb5eefcec --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73442b1749d4dc93bbd7a9466fc0256112aa2c7dd3a09064e0bd5fca4295fdc5 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9382af22ec1d71ea3543afb83758c0b3b2abab0d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db18f510ad9fde45c87cee9fe02b811cc7c7a71c0b1899f6c8cb0fdf2f58194d +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5382eb2bc27be1a9d6d0fbd1da51a9905314c247 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167bcf5368e9ffebb8ce9a43081b7740941076fd71bf2280452c56d3c3ee95c7 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ec64b7a7b6d084d3a7279013c6234a86047eaad --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:366b2b63d9b0d1f98fb3c1bb7b7d2a065bb8b65fa685f9d1587ff28db07822e0 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..832a93db65744f6f77ea339e18f52bed65f98d35 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe7e9624f58d6b2ac2f58d03a83b8ac382c497f06637966261357dc0e51abe7 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f08e8333a9f54887659203f36b79cf62647051d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad18d4adec15d4940a09e8f05e7634700524cfa629ed95cea59157e4aa22f685 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef48fe217073c74cdddbd2488abb33402c8e89b7 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aee2fcb82b6a5c3db6fc0b930d2d8603a3820b7d416532fe4f11198fcf3a9531 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ba06921e09f18f8786029e93b0b6578ca3ae379 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1084013ffad661e4d226607ccc7d59fc618026c171f462876278df8badb3df1 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4198590182d65bb1423c7dfecdc75524af8083ea --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59634891d2c058d080b45506a0ad8bd5bd96d3e7db86722ca2547f8b0d2b8f0c +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9edbb6d34c4ceccd3e2c9c259e25277de2de7e72 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:172895b2a20fb86917ef4e5a701e59dd38aab6406158fe3d9c935ebfdb6b02b7 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..24699a169b57b6fa2a1ac20b447e8a36cd2a4ccb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1099900721c518d2330c5f933eac42a8ca6a3dc39397a75cd9f68f07aa939112 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..647de3299ef7695a62e06de41654cc81194e2dd9 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00e635a5d01e89148000d6be52cd074b0ee46228de9284f3731e20201c014dcd +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f5066d3b8a101295a95735e6728b1b486d702e1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0826c9ff71a1ece4b69727f607194841e2ac9e2d2e120a340ea0503141f15a04 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a9586506a1d638960c041b68848d64c59c22504 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32c1f572905f7936c13b031ca27a33f978423ca6b523f69fc289f44d11286cfb +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4049215eb5f6cbf2b2a25b1b2ebe3bd152bb6298 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc852e24fa927cce7d10ca4d30e3240c44c9c5f9f4d5d4b58149c256a1220de +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..158af8ba73fc1761e4c7b352b90e62124fce3b24 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bbfed454f1b259c1d17894a74612a1fdadc57057a2e3db0dedf82592b277852 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3de904369c96259beed63681d5f517a2146734d0 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fd75ef15175bee5a1a94c19b5cac5e09a4f557796085ed86e305d1e3304ef86 +size 199058786 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cb8ac8c3d9b85f7068adc56d741e1fd8c42cc12 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ffdbc2d1bcd20239f9427183d201c4d11f35ab5cbff75150692d312aee1a99b +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0f88543db199789495f728f90023be3b3a1df7c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef9affe60926bdd59ef251ac68aab429f4aa0871b263aaf3222fcc18f3db740 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fc50eaabc0a6100c023c6ab615c9980654d69ba --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12c1056960943ac5f6b69215982225f5da19fd974262af9580ea6eb2966d2f7 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e89c23e3ec68568c674a1a3c87caca01e20c820c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bccae9d6530f4f6e98898039601412d67f611b72401de0e784cff26e21c3a64 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e63934cd734935fd67f07c46ab495b1f411f2cfc --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78f5172ed489d05db99df408ca693015ebc54dc2b4757f4e10a16365d492b1ed +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec8d07df5b28d569d079fb35bb509ca50740675d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a51d5d6f24815d7f25903790d9f20da2df1344bf8e9d8b0d27a5a93a3947cf +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85d93102578338fb222f51e64fbfb6362f3593cc --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:0b89c443166148bbbebd7f62fdac20b65c314b0a53cbc4ce822a092d5eeaa964 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4b55352427c0f03f5c509179e3ab6f2aeb79f9d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5456c9b95fdca56bada6fd4b513f34a773401e45fe1b443774006192eb9ccdbb +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de77a6b4e28a901062fc1af3b6a9d3fd450bcf3b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17d9e7d4dcdbf4a130e702ea6b3c3ee918c57a9169442bcb11dc9e12fb87b38 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c8da58e1b9e32ea48709f0d7873482ac907f5e1 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0faf23e6fe2800de1163a84b64fb02c845d897fd997d2ae4176d596f9c64524d +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c01216fa00e01777cc49d2d8a9b0c1a15d06e2af --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f945f2bf8f15d47830ef6e4994c781f75f40c113787e0271de5583382fdfbd6 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2ea8ff6d1b397a02b4e8d55eec3d3893c361590 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8b22931976828b930ae7b48eec57da83f85cfb5a2640363652865e675fa3f6 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41519fce9e5b1ae3864e9acb014e4c788a7e9ff5 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:125379e6c8b313de8d7682d0e82a61e54b64305d7cbd19e80cff79dbe369a3f6 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..385b7672cd011d3231ae5526ff93e356a571f65a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8d98fcd25c87411f086b69f6d5b6e87c35f0f61449942d8a5ae05722426bc9 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..481437b15c1fa9a3eff7b522421a4bb0cc5de5b5 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa8a0bb453178b6de432affc3d5ed3aed47a6f42ae736aa82e1cfcd0d9b96da4 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..538939c90580ebe8873ab47ca3d31b24cb875877 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b61db006ab73626dce0bf0c7229f2e07a370bbcaf53e334e4c1999d10657010a +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..765d97a263d7dd5a5245465d5b0a22fdb33ad86a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c8a868655230256dd4036ad2ce918c61fbc8b0b9cecbcf54d389638b7b1c2e9 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24a223c9c84afae37841c9bda57c245ae15edaa6 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f94c5ba1b96756209ce1c64013fe5a9cb9c4ae1ffdec0c190b51ceec2686dbb +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fde6438d07d646fd3862e1432ca33c6d48fd780 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b60c3b8a741e798f8762649f5e8ac7230ed28d263d4459b1a89e8bad29b078 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eee42ea60dc987354c87fb564dbc4c2bd3e8cbb9 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b28a6ca8a4dec0772b3855d1ac6b1c21d109646dd3c2303cfe16a8db3b1451 +size 199058786 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bf9634de2074491992fd973c7585ed664f1511a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edb2fd6be6881ba371211f473ca1f17010d795e1511167058c128a1d5142b284 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ec7bf767dcc57a62da6363b8f8b5c0da61d28a4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589f81c80dfeaf7cdfda1d449ba2653dede9ef0adf38367651dd0cc5ac64b625 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f7f1e22ea2a271aaa8a39b48d8cc3e3c9688e66 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5545516e1e23f27704318e4b27a1439f2639a42776e3bed1582f9249ee5f5171 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e105463284da4933217df33f8a5dc5fb8d98291c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0304c4fd8c7f6c81f8b973c561692f6110c15b66ec9ccd6daa77b6f47ccf3aeb +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..447671083cfd2d7016a27095bb6cf6c20b53845f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a072122799af6509c9c9db164ad7aab3bbf1c8cbd45b84f45312df1b25963d8a +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcb2acc535ba6786f9e2d20006aaac57c0e14566 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df30db329b0dfd42cbef640fe22559dbb22349d6438f2a197f6473d674d5890 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ac2824b4d02a48cc426dd6dad605008e9edae9f --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:15b289e448d393ae577f735ad2258407128c3b54a5b31a39f741ca657e5596b5 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11626cc678d506717c30737c8081196e5435ab35 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62dc492e0db1d23ccc2eb57b0565efa8acc75104ea2e81c165793a6f4e8ea03 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..153aa01be54a68ba89cd13ed006cda5af8311165 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b88ec6ae4c6cd6b717fd9d8eec7fd022c8f3d6c98267443a21076fee403ce41 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b165824e8971b0d0cc2796651999533036c25f83 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:255da0abd85d0768a38141e310b86023e1b25f4f685086ca0bcdded61856cc08 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a926b35584eb8507e9b58be4c3c880dfc5c44d4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20ae905a8af1de9af209247d77c595aacb31488c0795320cb078534c72c33892 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..183cc3772fbef03f06b2a80aa673148c42a44686 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a98986f2fd505e96a3e57c629fcc8701d77045f2804fffbf2942a66842f411 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3415186c9dfab5d55a1fd812d9e8fdafc8dba08a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740c8577d764c8d66e9d8be4b0f123a7d01584775913006b27c9bcf3d45a3154 +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2cafd7cff7c9ec67618d0296f9cbca7fbb1bd498 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fffea369d8ce0a1ff8a6a5ff6d7b613fbda49690254d9bb98d87a69de4533f4d +size 199058647 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..667f10e252055dc6c636b79551357fa040f6c078 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eccabb622cda50a7112145e88f81035cb6e11e416d04c8f7bf579bec4656fd71 +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71ad4c887a37fbfbbb5014a18ad45623fde73cb4 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a8ebddc1792c57f4f2c98cc5acbcf914192e630b7487ae1969f8134262804f +size 199058850 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80d2e6b0eb0e6df624cae793a76e4260a1e5ba87 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e521a62d8ac46816d2dd06c60280ff55bca6a227ff50f0bf5d9aa1dc14ab596 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6835c4eeae218633869c1f4efc09040ddcdfee79 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c55772eb2733d78173b7dfbbc328a087281ea1b1828be858e5e5a18ee47938 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12b7c44b4bd56ab83366972bbb887509af08eeca --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b794169a75e63221b7e693dc3b0f06414117df697965696596444750db8ee06d +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bace3f539df4fa96368cb0e6068a918d6b2cd646 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c15c6e179da12a2547baa085619fc1e3a473d72b0ed6687d1ea0316997703de +size 199058658 diff --git 
a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f988d6d5a4fe1161615fd78a6d9828dac446ccb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1eb8b5ad95db205e50265c50cd9c92ea8d27305fdf53f9a0fc376cadb32796 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f617cd77849c9b1b9325879a4868d2fd9c312e8b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056fc60fa55b208eb4a657cf1588dd69bafa715607193497e329d7046763ff7b +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f85ce2dd15ab3933c3293eb779c931869e48232a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f698e677c51b1a74ae5fcf1a8ca9d8f4ce65e32b2030a88a3fb6c30e3e0ce6d9 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..827a09c8487ba7ebdb746973da061d45a5ef691b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9da94b3af49818dc44d8f475637548d5eda339757d752716791e89d9029aa03 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..320a8ff060bd1485331e0a55784d645a0d677f2e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfe988f5a3afffba3d613581460bee6fbd596b19bb5ef9141cc042e467386945 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ae661bd144581cda53ebfc27122df4b848c1c6e --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf244801238159aa3ace8e20e4d47058faa6cc3fa47267ee6839b1e5574d3fd +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ce8e9b908bcd026d354f7eb34f25c581ac661fa --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:3776fc7634631138a7ecd008adf8203b3856fed8af6acde8fa4af3bac6b8ba06 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c9fefc7bb3991438c72b00effa2478a67948a6a --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6852dcfe782e2c3c919eb8f78c63b0be1e29e437d856d711cc27d6417689094 +size 199058722 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c34d2a9cfb251f64be139a9ac4e7cffae93a1136 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16cbd569b35ab782a7ad46534f36f706726cd8a4c2ed6ae6187ad31b71a9bedc +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78cdc2caf515a98a1ce4390cc2573d52ce7d237b --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:022b9943ddd9cf524777d1c67f2e7c3e55b28484002724e85f58aab62102fad5 +size 199058786 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e395ca1a1b859b97e28e24478a604f7ebd0da29d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d64b1139ccee81c4fb6337ce6c87eca77558711d00b114d04910383b6510671 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc32a00c2b2e75bfec83432d284badaef814d92d --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b3515dadb8ce37311eb9c0d7f264fe4121466548d1dc9303256c0753e95f74 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa9eb1ea24d29034eba17bbc591331f9bd875629 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed8f5849ff57dff4f4eb662c8cc509d2d10dd3caeae569ce805565765322ff0d +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b072c9f023455365dd595bab47c82a277a9147cb --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:887b904b67cdb28b824ae68183cb61fcc6ff62a10039e8ac433d3d5ff9991285 +size 199058658 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c46c0d3078eb1e4c232a6ecf58280d9af2892277 --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b5975918d02fe7a69eee83faec9dacf0806843fd7150ff01ad10038508752a +size 199058839 diff --git a/4b284b17bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..210ea5d1d56133092880aedd88fc66f198c9270c --- /dev/null +++ b/4b284b17bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7127a5bb58b8ea5248416a9d18eb32c142dc820d5c393fea4bac237aa44b9c10 +size 199058839 diff --git a/4b284b17bc4/global_step80108/layer_01-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a050988e4f8f68eb7bea2b33b6bc48a8413a011 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba357cc7c0e44c1d40f0dd7dba9cdfb54d23ac777739c65b5010e0bc69657ee +size 167511299 diff --git a/4b284b17bc4/global_step80108/layer_01-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..448366fd04f081b6a34fbfbd0ca339838f2f146b --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e8f56a6d16134386d248b37a36dacce608e598c4ef55aee5d64af04be304d0 +size 167511299 diff --git a/4b284b17bc4/global_step80108/layer_03-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a02f488cdc45c8428da532f694c7773e30a070c5 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dff400a0d79ff73d6bd1ed6e367c424b8886f4df581ee2631823885eab69677 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_03-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b926e62b0a9f6f28f58750128f323713379cc56 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2faf1d6b62153a5fbf3c0dbdf0afd5396ed00d4c8ca2785f719a646b3085cd30 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_04-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..916b7afa1cc9f2edba0123a9237ec2df450d9599 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e62cae27547d4b5c9e3464d7271d7415fab076f8a41f0df53e6ef4216fb82c3 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_04-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59ccd360ed1c678c1f0a73f41bf9fb1117ea1912 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2736453f42f6a5df3cca9b6282eb3344c684739cf38b0b384e26ace1f49b39b5 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_05-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..245e6ef73da88783db5d04030952abe0c59c43a7 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd353088b0dae460f7579201d9c20278d4e205ed1856c43cbba2b6effe73236 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_05-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5b2ec586f563a513bf97c670ff60323d89edad5 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f69952968c6ff17081224e12fc53d9eeea58863fec162d8063fc7719b1291e +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_06-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5fefab07d5a74920d9babd852f770857b64f07b --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fdf831137bf88b4c32bc1207894a56b139b02c98e91806df32857ef962ac502 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_06-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00aaf5fa989c516fdcb82fccb30d467dc7df02e2 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c0878db1e1703e5cde6d6760df4894cb29132ce4128409713c75d86e256bdc8 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_07-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..031d8b1942510b1bb68d6ee5573f92f165ac6c36 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b5b626cddc16a7c97c8d261823d7ba1544d27586837a6518154732b4c08375 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_07-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40ed05124e855fb678f2508eff95ba2c9ece5d5d --- /dev/null +++ 
b/4b284b17bc4/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8711225126dc2de4eb299ea36e99e3be7cb812f3cb79f1717113cfadf9961af1 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_08-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7a6aa2fe85208c4a3b2acf8920828eda885eb84 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92d5e9481a8880fafa0af8e16c34ff635ec75eb58f2c0ac78cdc88864477e6b +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_08-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62b19cbbf6ad504d718841b510bef26412dd9b7a --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8496a2da9f23c852c44036cfc6602fbe3a56317bab5f8faeb1e5d59c89db928 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_09-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e34ba82e1c2079d13e935fa39a5b4b1a10df7cdf --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8736ac3d6042f491ec94df05bc05efacc9dae5c219f38cfcc016ebeb3e6899f3 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_09-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..baed17072757735e71e83975f32876868e2efb5f --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:926017e1c37f9a81633527cf8dceabf21afa4ff59c15c54bc22509b40a5ab21f +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_10-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac22370f134f92bd2c50fb24fd188740b241e302 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287d801c8d4625a38e604266c98359df0eedefecfcedfe2f0d4f47dcd3350cca +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_10-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e1c86b354745d3430ef4d2094f6cce4cc649982 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d8cb34d2661d4fd323d3f5410025ec4089eabf9d0a54e6e9630c0bdfe549e31 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_11-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a126afe9503c7ea4a5dc8ec41197d58a6ccb7361 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:166087516178610ca0280a70d1f17b3cece741c6889db6a2f03726e60bd07976 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_11-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e0b21e758b184fe3a0c17cac5d879d4c239cfb4 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bde227f3288454594f4ba31b5963f851e84b335b6e24f03c841da27e1530acc +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_12-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21da0a08d34f104fc08d4cd209d2e8e781cddc13 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9bbf5a10039e725468ae16b28cea72527a1f613154adb7eafe064fe83033389 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_12-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f948d37b83a08cff345cf4d5f56e7952a03cae17 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3060b99fa6cdd83042db5360270567be834c1bd8f7c57db8c5b07fd63534abc5 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_13-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3b86c93c96a9acbbe6e1213f5e86eafb001f91f --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dff94271380c4e9d17ba020880bac1b5c044962da61c25742a510a981877c6e4 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_13-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d164e12a1fb2f73a789f809b3767cd6f6faf81cb --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfe45ed50c55ce577fe795391a94e1916337acc69911cb173ed9dc5f507c2ba9 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_14-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..640b1526969a80a8b69dbe9ca134b05eac15bec1 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535037c11b1157d5e48e4d67702716d2ef002f3a2969fda9d8f945710cfb2d31 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_14-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9bb11057d06899a1d63d3a3e8c96d2e2f175b20 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1182fee33f2ea4ad5137adc74c24f0f5cd10902095780f556dc805f56518590 +size 113308931 diff --git 
a/4b284b17bc4/global_step80108/layer_15-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60a0f8061cd39dcb51715bfdf3a2e04556f0b997 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0296ecb40ccab96a01ffd318e9bfe3eb22ae0cde2f43751ce43c725cfec7de23 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_15-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be7efd94b52b6ac2ac878890c11fed960f044175 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c1cdb7372c93f8b5e2badface1b2ed5f75998317275f71e6471aecc9c3d1c86 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_16-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b60816d6a0213a5587283031708d8532a044d722 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d6a0e34c70d3503715ff48967e0dfe8c5b6076afc796a60d4876238bedaf11d +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_16-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..547cd7739ceeaea4017359e6057e22297257d879 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2554d1be35b2645501f8e85fa7a3a26d2639c33e9af5ad14039caa36ed3d28a6 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_17-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2fcf09384edfdcee9f5199ad8e738fbe5be3790 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7428a81a60de79ca99b622d06836000e9b5b2a1d25b98a769a081597a0aed664 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_17-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d166d0f2047383c213b3da79b45fa1e8fc83c486 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ec772dd5b4e0172cde079f35c9a95c287ca6bc126f4b30eb3f9fb48b3ecb4a +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_18-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b04f3041694efb28b2762ba7e33c7010fc563cc2 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d54878cbcd4a75252abc76c7cd13f37302a8401d8efbc6d4fcf5689e4eab4d3e +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_18-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_18-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..30684b614c3a8b778677972f066df93b29260a69 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:504821befe48ed0de9a976b7f6c4a904aa9c607e089c817ca83d19c98e8726d0 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_19-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f0923c92b7f1d536b4f79955ad9a2eb596ac37b --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff27c2c6108ae8c0f8a21b700472ab4fd6fa9280f2f5251767ac71fe157ec24 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_19-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09a6e87c4d57a5bb8fa2ff09cdc968a3251ce40e --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd6fe87ff414f42dedfb268634ffffa2da5e4a02116e5aeb7e71211484588af9 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_20-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f22349f41f40c91f5e3b9101364171ce5548efd2 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba7e9c50983bafbb4e24d364810c21af56a3c339570e6ab3e3b4b74b015a2ee5 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_20-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9170cd20ed56bd9b81968c496041e76b941b1115 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b229ca021f2e684e3df2a65e9001486eb86c149a2c19fa6c6a8e587eedcf4da2 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_21-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30f91e18689fc471da0107647f4a2ea650d6d689 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8b1f5f68feacef2207d6593a6ef362274cf8d28ca36068ba4d9e791b51fc872 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_21-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..123b7b3094e8694846b6537c762b9d4c9ce76507 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2564fc89103b0193e526bc687c7bdeb347e0b894a2511a9c900367bf0e86c5 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_22-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25796c070ffa50d7cfcb16c333bcff268a980868 --- /dev/null +++ 
b/4b284b17bc4/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae7086bd2dea959dcd58c771c9b39c882425fa726876ae5d7378e45c819b88a +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_22-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3034fb43c79e5c3cade794c251f6a245dd06f709 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97c4597b949063bbf55725b3aaa50a7ec55562e279183b9f101117aba869e0c +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_23-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21878f179fd7fb2656e64f744cf1bf86dbe31bb6 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ada822d26cff3a9d9b14ff9f359f04460c4e19b63dea6b95ec5b87e144dd744 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_23-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb4cc61966c69b1904b961147c3feaf7e9afbc91 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7079f35555960cca3d8fb5c359849e5962389b6df8ae4954978ae6e6a5c2fc1d +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_24-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edde295e0e90a763d1206d882d777b869ac2a574 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02cc98cf619c615b15699ec6d27e1e63cd82d8cbd1dea965b0794cb77d8e0ac1 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_24-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd1b1efff5fb26f88c49b57db5b62346c6e56b5d --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca0df9166039e6e1d728dcbb4458fc0d39e7a3ea5c6449ac345a32fc8427e92d +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_25-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5131a6e20ef5f7f5c64b207bac73d13a55ed3ff0 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58d218aceca560532a145d012d345ba88a32e6176eb007f7d5d0a06a0ee80c85 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_25-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ed4d9f6ae1516518117602f0b39c8bf0cdf9278 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4ab1dba1ab2759dc7bffe1271b22b0aeedfdd80046449ba1bffadfdbbead1406 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_26-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f49644bdb94196d0df6e63e0a3a58b1847b8cb3d --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de07434e69a97f4287266c71603ffa77d816eea8593f8784a395070c9badd350 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_26-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f03b63208012b0f361175e4e204fc20cbc9c8b2 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9db66dce98fbee565688716596ad0a359ce4289a7a613a5ad4a8ba312c8868 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_27-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edce9fca65a20880a2545fba2f9871b53a7ae782 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:515cfdbf3b6740e0f4fa724cbb8fb62a2ea8609eb4fdf8d263f07fcef583ea2c +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_27-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a1f617ab6cd9135df6a4de324efccda4a486136 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b106b2015bf7cd5d91e873393294086d69c40cd7ba92310c2da86461d50f36e +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_28-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7efeddf99e70f89164a4903684d3c452ae5914c9 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748ca973561495cd465ad09beb926501a02423ca7c0c25338fe26641832977aa +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_28-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1bb9bf38d98f2924936f73ee93d71163a437eb7 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ef8d5759eca0d88cd2a525327f4642a06ec54848b0de2dbca53298bf81ebbe +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_29-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b58c7c456a506aca9601a360cbd18e98e01efca0 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:905ce5c2040cb5d6602993191983cecd6a9f6c673d4694e078fdc9e90e802ea4 +size 113308931 diff --git 
a/4b284b17bc4/global_step80108/layer_29-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19977ca59e53e83dd8b09c1246de996edbab20e2 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:270783c100fd68236404338e87c03edd0cbf21b7d2f6a74e02cbaca1fc4ee79b +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_30-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33fcfe46a263b31f4844935644c3ffa970e4c715 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a179cd57560c42d7d902324e175c6f1734c68f70c112fb19be28bc7906409a26 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_30-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2d91b6756cd75e455d15dd0835b8679b17e4244 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af115564913cf9bbfd154236cd08bf150dcd1160e1e6c2ced383275d4e6a2706 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_31-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33518268af9b1afa129f69f7352788ebea5082c4 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:601b5a619a845db7a99a1544148fda4e1bba8c6863c74f3b134820f67fd17be8 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_31-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d73da69c06fd20bc562eb04d99d75cf322c50139 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ea40a4de337df294e81cd4b84fe64f8b6c67218ded03ef6c9d0dec2a708951 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_32-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15cfdb66ae48e21be6c01ab7407960d28891abf4 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b51b7abf7ba0c9ffc288bd941b1360c034324bbdea0e1ebf531e13453640122 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_32-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..240e0eb44e57540bde564f39376616621a76facd --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcb789a3474bcbd4fca5dcf0b6673e3cb4183f6f11b4d77a33bb2db7fa7c657 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_33-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_33-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..8770955585db3a714f581fd726047418a5f63686 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b31d83d7879e687cc39a4d16b2db26cea07931acf42a6a9fed7912fe9bd6b5 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_33-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ace71658451811f259fb19960a9b739201f85c7 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e276464941fe2f6e7a82db56b4d8539c596af62a381119edada051750e57fa +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_34-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0af7007921be9698150507f0048a94323c732f81 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c9e5bec970d4873927da4b2a546ea530226bb2b162dcc9d9e2c68c36b08115 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_34-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ebc65c4508912729d5d9352bdd7452e4ebaebd1 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:180dddb47c61a7764d6ba846957b349354f6eaacd409be2fa49f56c8700a3fa8 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_35-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a26fec34c1d22a4588fd5e0b5a6e1b99499914e --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6233435090496dd8bdf4b697c4efefe2d514a04a3e469e527abbaed75271a0a9 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_35-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62e5dde3b9d937669e77914189dc37c3fe29ad56 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13d2171e208c0b21680ea4c803e9968bf67b5c0990d4422735ac35a220bb8b90 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_36-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41602fc654dfad45babbdd3dc3ff1743876ced8f --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71ee71befa57367028575e9f82b909c8ac4d6694731108b4d8a89584aedbf5a2 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_36-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28d1fa7d1a670d3e7a3321d3d001acfffdf3c066 --- /dev/null +++ 
b/4b284b17bc4/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08cc79ada613ef53b9f308aa6197150b51800d841f657e52ce0d09f10fb417c1 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_37-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60de54f3a28cb27672b5e4e51e1e56a80e95367e --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a5b5548d51292eb98f9519e985a9e33399eafa3c8b1a67dd5bfaeb8fa0ac18 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_37-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..063278428729706e19a96c9265c10e210edc917a --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7531de79eabce9b1b2cc3e2792b20a7979f6ca45edd79a34323fb55c358008f2 +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_38-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef9f0bf03ff2dc3ac582eefd4cc2e156c8e1acee --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f1e5d0ca7f0638082a57540617cacf92750eb045596ef36e341de7c341d6bc +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_38-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62e7121219681b7921f58d3c555e53c83c4b3560 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9005f6d53b32c2445df828a68eff2b2fcb4d867eaecbe4dd83aa3426cc2fae0f +size 113308931 diff --git a/4b284b17bc4/global_step80108/layer_40-model_00-model_states.pt b/4b284b17bc4/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..498d0b935432829b82abb02a3cabbf2b10451fc6 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d81f9221d2908be2e460b9e56ff0180de9e24a9262125b5194996470ffa6513 +size 13507 diff --git a/4b284b17bc4/global_step80108/layer_40-model_01-model_states.pt b/4b284b17bc4/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b641f2cd0d9da0b61d40511bc2cbcca82ec9cb2 --- /dev/null +++ b/4b284b17bc4/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7fd1a85da3b3ffac458e3f118014994b6fd56cf1ac9e75c5876b6fcadde072 +size 13507 diff --git a/4b284b17bc4/global_step80108/mp_rank_00_model_states.pt b/4b284b17bc4/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..148dce6ee5f1fb4bcba36a13dba74aff842f27ce --- /dev/null +++ b/4b284b17bc4/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0acc8f63b9bb7cb22975df51a18c58a5f96d5d9e5615274b7b8393db72d5ccd4 +size 51443 diff --git a/4b284b17bc4/global_step80108/mp_rank_01_model_states.pt b/4b284b17bc4/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51484cd4078e383f7928d659f040ff25212951a6 --- /dev/null +++ b/4b284b17bc4/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:596384f07b4a8b8fa747df525a52b6c75349d35f4ba7d3a63781696be0417b39 +size 51443 diff --git a/4b284b17bc4/transformers/config.json b/4b284b17bc4/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b17bc4/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b17bc4/transformers/pytorch_model.bin b/4b284b17bc4/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..04bc6ec637b49d5b2e863e9018d64b2b10bac82c --- /dev/null +++ b/4b284b17bc4/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abb17c67079fb3f6f44a2528db547de0056388d4a7f255db099778dc00dfc15d +size 8781203669 diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..8a2a051c689a6e25bfceef5d7e443248db847fdf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3423886062648571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03277534172219839}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07116912691303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015925050224480028}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.2998523601292701, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004615423133559915}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1078155721409226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020097112571708245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03290155614985229, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009635800950074162}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1430151870039024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003133828558045969}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04998894903569846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001254978454654862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06817638471397719, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001440134234671522}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2915042735714293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004516752469805975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10384529013890176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018689379304153127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06758374630355668, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014755310584898722}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2855613282697172, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004322222242728911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10254842199703165, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018748895711891628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fff896e3995b2ca489342086d2b542bdc1957e9a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.43549317288896894, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029438186163177924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07179524471867899, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013752072779383184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.35855038856048876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005049635846993475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11204264924342298, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018816879797699437}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.032801419147362856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008197872630256377}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1723852958712864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0035877516735487143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.051345397484036256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011584347174914676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06711558592890904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00122163800184802}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33387823961526397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004537882130758048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10481665765175784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016825296642133658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06830552553162243, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001292240550330291}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.33957294457499176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004661961230072316}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10653343348961458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017665097203702323}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3abbf3b2dea647bfe44ada1e884501fec25bba24 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4731158648079456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.019398006063203924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07373712610103833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013232763617533117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3838400605745808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004971353366559517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11565039220118124, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017909510986326618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03418002818557043, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007999127016124942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18966804432678097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037070132910163134}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.053828506115298144, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011299593484305154}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0688866789263329, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001197716696827759}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.35416301817745277, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004408869173242153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10799852330681028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016342464145243179}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07036220911566307, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012542262936579098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3642675981143951, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004634298128673839}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11029915615802689, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017015090669507955}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6417033f7039703db1edae7745191f4e7b5fd786 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.511033492138013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.016836817368392938}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0711394556954671, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012159991412829965}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3852018338546001, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005067790324136812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11234455309812195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001658068318944836}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03260081169248259, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007319564987104789}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18947442868920766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037625536404537635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.051724489676439236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010463045011873814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06636573472304302, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011254814997386617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3527863813991901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004403124502538447}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10462160106353212, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015374663592284314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0678291021786637, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011705423632374366}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.36350690371493805, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004671323997080324}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10692341362108594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001592257988435529}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fc9b2937bbfe6255904d50670da9f129a0b67245 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5255384435057461, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03418580070894041}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07289393265058683, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001221203882596679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.39210555607501246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004907654599821545}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11520145885524227, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016544612830049232}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03333523314795858, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007290358506575526}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19457648128168623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003616303249775819}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.052942763106877684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010297898091120672}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06752250125401707, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011086146136256522}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.35896707400416245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004279012704408649}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10655328646588215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014999277772535491}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06949836967046885, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011669918564828742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3713609435685992, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004540169352829801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10971668045319283, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001577124609883823}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..af188c5caf6f762e737f0710aac5abc875d2be9f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5823615010118224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037398869921054644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07346269739111895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012187681049390564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4042470714837263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005065518163952181}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11639260450950206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016495247557604747}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.033940599163428696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007391477550345831}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20304234761076909, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003764961452207978}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.054089458597439195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010444166935940533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06792933252591714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011100797550413972}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3687266029366595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004388331338140494}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10741878654019481, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015017472778460627}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06991029736225668, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001173008019162227}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.38168151833425595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00467124723267976}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11057542829413115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015808583588389114}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..db4643465ef0ca28b1fcfc6efb076d2caff00f5f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.1593816005881288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028463012802740157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.15106039306191837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018614327334657758}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.1293668266283262, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020451981284945325}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.010415064972944536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005832498460949334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.008381836594921248, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005769442404081656}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.008447632785522565, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005005390644606267}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.12882295031952942, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022244289545785443}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.12997760982624038, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015539465574021857}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.1059430520968093, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015859113615098958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.1354477122353216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002537160964118554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.11805775221027685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015629478566191073}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.10719224487960032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017665884689823603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.012484298141528558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0007013360733034469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3cc1cc52644762d99738c108dfc14b32778c5cbe --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.20808691686715283, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004313398059878784}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4306521086221088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005516793126900347}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.2328558522551123, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003900111659742201}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.08181585393792677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027980013134964224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.1900831484868744, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004381497415255648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.09334851918279623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025591605276398923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.1667225810957368, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003461815683539065}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.3693635934874014, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004943896249868083}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.18783561618919367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0029446786453837417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.1831100821976502, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003960631509702234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3774482577828289, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0050625094090084085}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.20343772508503727, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035163687455997497}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 2.2982278621261973, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1086928153245287}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..559db4c3f597df72eb5fb4dde337fe59e81cbed8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.44907951348269975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005884356170552851}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5373597445020493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004966707028771244}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.42431308600342277, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0041620247128580454}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.25059145216834844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004580817767902642}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.30420539737108304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004424545920380444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.23334399501205214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003511605630791461}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.3650320462167926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0051681070707463765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4461965468151835, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004628706403782243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3447121578849147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036720178726937637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.39789891202282635, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0053996273811364355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.48008802419414803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00474893114047731}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.37564451182458136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038397764850582553}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.486000451731127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1760272135645943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..71485653037a43aec07aabf4f95523ef7e84870f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5092720807541621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005761992782925212}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5373968350772912, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004897334207979844}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.46569868214556953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0039167763038128744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.2902018344321373, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004714799020672444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3073000070344161, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004371879453082279}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2613220581162864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00360096423581038}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.4098376883485247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005103095930707823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.43837950032145484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004496609058976664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3742438056269717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035350046513184163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4492052702368639, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005309452409480573}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.47692163068169213, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004649900767270472}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4099104459367334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036009459176647974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 8.592270404743596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3478687773942717}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1a861d25f5799257fa3b81fa614cb12484cefbae --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5265934478163272, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005660483553931701}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5179234781275224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004868884431211911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.4679567762670693, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003781568636086802}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3011205615910547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004723582948062983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.29876898581754224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004360553531900465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2637473868679139, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0035396559441471915}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.42896446744335276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005075014781599364}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.42738457538098473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045810446815337195}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3813739343787495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003540381010340878}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4663362213990527, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005289062452425001}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4618244358692011, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004696080984340952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4136944072158051, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035754128454684865}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 9.748227001648862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.30465771546881343}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d01c0b24135e2c0ce38912e3504ea9a6e6b437b3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5462360493800595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005571448210438329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5208556355840253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048319172639873075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.48448665725831064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003823220765053316}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3165259707878258, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004816977889561416}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3048098435714976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004476193519328458}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2771416755952798, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0037021003433152576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.44640610204674275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005009545091352138}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4305349369311683, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004579985246877059}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3962223744993141, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035914467869508364}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4831117938397976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005150478425110863}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.46459910587278797, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004669405569917108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4284939067785979, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035897376609210704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 11.081728167045133, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.32013487215713093}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4e4c7d1f05d896fa47f894f09597b3e8d1a96341 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.08806482552055464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.013753939759917743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.02425622875877401, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005394400535560749}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.2187977886143418, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002563987829673595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.04217421071874153, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008234598917093276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0027418010825137774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00027769817357513153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.020309096397945828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012796421259529498}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.004552592244101363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004257311158814129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.02376285292107096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004993494816843593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.21647307133945715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002496433700435587}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.041384238656935184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007632069889173777}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.016764423629076094, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00046755444259852875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.15365295702315013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020731095703446366}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.029066972734033053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007033196865904391}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8c382bf99a8071ed104b5dcf6781be9c91b25392 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.3977041826217982, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06476957892774847}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.08085175511474767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015420226394619978}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5398549553723303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004633252724794029}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.13297509465908738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021274594259975206}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.03501440648141779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008855447546426636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.25426029625002416, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004268932324515644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.057734516186082864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012735245538216635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.06952000055759049, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011181437346579404}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.49228292845501337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004312668887865199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.11558954246536195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001566596618343675}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.07046525801920696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001409866048243462}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.476402513292327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004517149863331296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.11581462372515029, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019451873798571877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..18db0a69314b0b2d6fc08db8f909a0e5b203ce41 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.8053876932339799, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0692633939605246}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.0953355786305315, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018482318125930552}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.587435498341808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044160353671122265}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.15238609977285375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021050373299750613}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.04635353043989582, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011008091437590098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.31351855748984986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0042413072919440875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.07451843059126674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013538507376262556}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.08027003022288653, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014089308706206984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.5250978321957749, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044081811988050555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.12964573051803532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015563770094126131}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.08462110334448762, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017266673212118049}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5256991157959195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004362166200252256}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.13493557268343806, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019537513800864143}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0439cd917b37fcb6157f0040e3577070e26dac04 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.9133849288269293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04864213509199311}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.10564975683529444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002213084642274899}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.590100549099999, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00427885948704963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.16402787501046193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023163623544869127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.052782129610134225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013492091587997011}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.3231207925990761, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004256840681994388}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.08217622838318879, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014861030323633745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.08753763146321639, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001717519449731679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.520111059713261, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004351158002319349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.13744565617599114, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017570257873037582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.09389990009122845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019967608416885307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5308296127486624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00420828920591214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.1459665149392452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021389387676408454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cbbb5d222ced194eab16fb87ad1fab2185526255 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.869050618620975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05010088619712935}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.11579207158486948, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027602287304337434}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5769641854763163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004366449526449767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.17069557641057814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023989321755758107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.058968068407052966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017140614292096468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.316819162619288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004169288630207608}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.08649984259029919, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015781356762232315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.09510940810876867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002143422239902668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.5023698294671003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004376544901142601}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.14186920549480303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018314233516765332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.103240891734108, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024728914583356716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5194520315283727, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004312781932079048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.15217682563526053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021887647918235626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5ccfcd661e43390c4bfc1059048559b3d48f1bb9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.8835135529548273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06110654983373539}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.11973455465296313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002935618771876205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5670989994254594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044381978706761}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.17411810979908632, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025066282022309615}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.06166786996995915, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018761458565112188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.31541818233678315, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0042975296956600266}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.08905956074973984, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016585801215981953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.09825615481667112, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023871232471909232}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.48839020581913895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004392795682637931}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.14380871166547562, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019567050216977417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.10703554238169732, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002636419782384992}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5126878599494855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004346737958136646}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.15569229150500444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00226672543918796}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56abc3b29bb08438c54bb1e2ec75bcb6683e646a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.027292817639688032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011146302412515304}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.18687932979464777, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004479317229289314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.04522889442637627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015737069173370269}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.007585884325607279, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006241186679155009}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.04765534435088357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002842447724510974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.012155444595988949, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008835894771083748}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.025041692580921926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008766166267586572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.17988054876134196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004148939076846254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.04203637506264007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012876809656108756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.02362548960154832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00092997017312708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.16710354998948618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038167321468611055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.03939048904516523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013252431846109201}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.3002948882840799, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.014715720186430872}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d883c7c61065ceacc502812f747db9a833ecbb66 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.18576332777291357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029496821484972765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6724955092023494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004391893376238201}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.26592346891155666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029378841494143377}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08325319598690542, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017911862931407451}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3291202433540325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004356879936719757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.12050338190039989, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019700080370958755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1397921610751568, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021793915810409932}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5409023321464792, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00450531183061488}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.2022817877685023, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002109071664874411}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.1579221423717992, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026163266317572958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5798479579428988, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004227813926608466}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.22586865290383526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025693268375682216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.3174067250834143, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1095420616112965}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..198abdb728f7028250d0aee4959dbd01c2a275dd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.2102247428366051, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003936648833938165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6673737234050499, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004469275663366487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.27845169302997186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031924386042951483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.10221556389262312, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002526456468616791}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3481713656770689, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004502228925703636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.13465046885050352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022013696056236006}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.16004846098807926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003086567397610655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5368811685423543, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00452435180901271}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.21334136742049684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024025933828259196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.18189899956371253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034717018098753866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5853292242838097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004328558737604698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.24099516029436527, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028365419353728666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.8194042853676433, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08001070612632513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a19f6c410c6091f993f2ca4e413108da6e909010 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.2214105510553713, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00397852038777946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6470607779127537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004406369519857878}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2895578361728388, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003271301388482767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.10995468701919953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025990883422240055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.34262834959953414, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004384132009223851}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.14344396360996736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023093129067051307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.16816306750707308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031335861563326884}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5184873981262235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004441211075979107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.22148965895132458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025138893644262995}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.19252227804988106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003532058484150838}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5689593244437532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004202078292368127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.2517396049055751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002894872783585922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 3.0104483725602478, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08167667230226254}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2fdc4cfa0edffecc2cbe68544b28d8d7a346715d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.23364828958458597, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004386673691400771}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6182500596257372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044847981647693925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2938502007499035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003471270572664401}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.11633321819129998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029245832979215612}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.32548790345098827, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004370629409800285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.14528102357273256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002520152138311905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.18086003912409268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003662637084531143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.49642367395924064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004453695802686763}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.22758047906242576, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028386957066170496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.20359868681678098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0039290266599936495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5457174954752654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004285590260813723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.25592564085521324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003081730914660469}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 3.015666054890952, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06037288426011814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c1542c8bdbfe890a79e8c9465c151565bb2734ab --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.24397794795287123, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004784117876438954}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6048647467371622, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004654471513157018}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2956946057982067, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036965817626566813}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.12634347401889862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00332524580607351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.32128383108894604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004439902579230902}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.14969245864050304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0027172096623066288}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.19006924520843738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0040021127107316675}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.48595046650876467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044648101430210824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.22972535325752128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003016005286453144}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.21334505184004257, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0043020289632197005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5333276897372095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004389519168542818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.2582096378416781, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033136409122061722}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 3.0967052765116168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11202147855167385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..89390dfe020ee75c60391bf1014f89104b3d0f8a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.018405096713391255, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00028575863010679416}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.13437768144660597, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012216701727659973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.0314250857721497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00044204337073046926}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 8.223475482087916e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 4.15512150984225e-06}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 8.637745180723215e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 5.0776197761980826e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 1.477135016534563e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 7.496818160211679e-06}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.018401510662055136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00028569714825004946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.13436090726457042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012213468546436975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.031419100488651396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004419332549890634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.01264230643724104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00017903922143252607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.09751365313301781, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008310455100700755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.021716982601686145, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00027816849062495334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.0019246988010293315, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0004673515089684523}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..70dea07ae673dee057022dc9add1bd726617de25 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.12318551179378552, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00265250718788714}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6558902866499821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004947345321902676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.18551745196543523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002702377330670695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.05344001076519088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001521538259378062}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.31589673451602196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004512575556417795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.08099215111185744, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016729988751392074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.09390021055693351, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019816232535704525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5356505485725407, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0047206486739923195}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.14267940599206763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019480493372883946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.10862326603227693, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022972669057671545}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5918746015837786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004841757394783012}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.16432822281253456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002361497951165923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.6637023692480732, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07113198418739454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0885ad35bd9448373d6716ca02e2669a7a05b39f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.1666446893829218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032707753020478673}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6729567321222975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004305012822265882}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.2377267859934131, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028511858947845655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.08075845494399067, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020915699463510836}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.35411093297637475, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004506419345004118}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.11449533899405696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019357009062000538}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.12643098669316738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025705410191854284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5356644614517825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0043565730389747955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.1809798890103334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002186958670887679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.1479261397803003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028341939795893155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6104555998853953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004304349996355649}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.21219482060830694, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002515047834126115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.647805827243435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07629701303117106}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bbbda805545eba04251e18368474522a9c390ae8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.16348222985039676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003146872550209196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6750999942571687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004231543295192062}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.23731957047929436, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002799505256926228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.08053104845339831, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002115424621313934}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.361316044646828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004498247573306673}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.11630716134029079, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019838199857175193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.12237541399150458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024629041187135487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.533219718242574, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004361069891506727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.1784341269505221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021166909618823932}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.14659271750171685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002803961686626933}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6147977963499615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004227824501478167}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.2132548329175755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024974068122956754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.7650889809369867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08861961620174373}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1adf20bb845d03780a3627c0a18c5f4176916076 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.1597629414302839, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033033901216669354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6823730805286056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004113105685624019}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.23104727016762644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028026542146094534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.07889136187736123, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021839099115212703}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.36533837647073053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004382971120376491}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.11317180592447942, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019371364795214888}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.11952172602503129, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025993702279697504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5378383490680047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0042111474429222555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.17336408160930467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020829759705918555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.14312638913068235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029110532892333936}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6230515627846621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004122693884939311}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.2079495191882183, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024838961950283595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.618154777450835, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07956567939253502}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..667e05bc78d2e2461a149866fe98cbe283a660d2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.14955217982650695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027686157067858337}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6935554112517629, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003988293582524454}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.2261504763429586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025955875570617386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.07346044534208657, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018168270456380426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3742165028406123, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004409012138004924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.11088772044085522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001779337380104819}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.11122194025511048, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021369513975720436}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5461500634585164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004179927239919983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.16925176967777375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001942005609299536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.134551132386729, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002499557879854736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6321720185352039, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003993926482242108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.2039003781849349, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002364317599492143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.615370951594709, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07817523000072259}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe06c8a97102068b95a7d8556cf1d7fd5aa0c90 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1646736204497174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001944375965208406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.28908804331097093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027751587359995016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.19551467871244005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019193758332128445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.033596976210297794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007783779535836992}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06189607439452786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015755697883312382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0401546744710191, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008933850730994426}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1159536094809458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012430606328199091}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21237606965032682, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021178449235415907}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.13948475018165474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001256562390447179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.15307248616009633, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001797771989893273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2697257608869243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002597333354461373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.18192377197057139, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001775025387207881}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.7901688600266696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09473494560163237}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e415d8ce9bf7a719c9cf9e8c7cc1673b962d641a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.18024946393550403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023365148116592273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2858663200405377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028577355675838095}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.19965649147362877, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001954107478734798}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.04273282245742925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012020669128774719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06727475322952328, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015736861230501358}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04582510801169016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0009888177785209124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1309377911100276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017619039502037427}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2128779538670664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022235556075845303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14528886846186012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013520944350136044}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16786109069022864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002201832250644331}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2665923708356373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026751422180246536}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.18583509127014775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018160956867022732}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.1642687288297267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03609118184667803}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, 
"batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bdd55dbff6eea6527adf95e9f0919a105b717138 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.202956767711254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002676187150983512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.29159713975018653, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028298980612215777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.210239996789526, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019255181063177478}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.052380042676589786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015473198207191262}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07214674692201985, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001657658589837987}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05097602932656367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010520475342058347}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.14900247000453432, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", 
"prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002165214590894441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21545359034531394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00221033373558327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.15265446172906474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001367842815236307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.1905565529645814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025620703427353705}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.27385801709096713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026819224687665406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19702119815111355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018063906023379486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.5286650412737517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08014901866692561}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.json new 
file mode 100644 index 0000000000000000000000000000000000000000..3a94b230cb0302b4c1cec556e12b09d2f4743a52 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17231067692889554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027956027935857304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.243172171476554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032858246005163156}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1757806794088883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002256613151873409}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.04314429570273796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013561190191307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.05984664442402973, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015879641807220839}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.042273822347749235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010073987728216936}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1269442197755899, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021738281830229454}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.18134356725730427, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025688638483573395}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.12839540685323708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00161856743478718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16152443369273212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026463298498459026}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.22829827365100974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031167189145814398}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16451248533027701, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002109355534586791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6775327228175545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09702171839636106}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ace0c0eeb7f830cee7b9c034e7c121e27fa7b3ff --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", 
"rouge1_precision": 0.05610293921952886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021934282933142807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.08033002095461989, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002892648072265376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.05586170258027261, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019271475347510442}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.014518361578080535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009832377227663412}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.021124036864236993, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011841595966348834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.013928620381908459, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000708425052711742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.042762546108040304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017529381402243264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.06135293147334968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.0022662727518870454}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.04182384962792516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014295239104789405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.05281014844060752, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020838090836962377}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.07549636242613741, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002735416186516671}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05238777661289263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018093330643826854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.4585803694189491, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.042449805324812594}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5af41aba61a61bc8d173c5647e3424143171eb26 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.01015187837282015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012118368481013803}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.012597080447224141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012614668952345483}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.008568589881060913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008425364496636647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.002070478186206016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003593417665592502}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0031561784431225486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00042801548350165436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0020632447863588753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002724530157499788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.008016061817972518, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010514974373518058}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.009704734876679203, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009871790542858633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.006424819886854842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006247554238591577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.009477961645303942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001148548084981138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.011862110948524358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00119353729888407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.007978769124555215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007798817433535373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.929822783994029e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.454754027890242e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d251558ad1f86a322986b6fafaca217a90aab0d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.07643821536141272, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00140218679541492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.12653305762501957, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021916544609098315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.0881869594314063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014841987809875977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.00991125872816128, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004535614337776502}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.018159423646898012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009381758127412718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.011775575519769068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005303096103283162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06754639878793735, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011484776442586243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.11490796429365001, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00197654999272724}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07869794174000905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012488046435799164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.0710963976259558, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012842232965217812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.11843767033976603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002056362191261535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.08220909707902971, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00136877406788079}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.5845357736513314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03072266241415454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..12038868cad54746155a9d7668c860c50449be0c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.13347948856316436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00190203694307572}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.21739906685237975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002808744726060509}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15296390666938464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019322995775618457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.02419402358312838, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007370518433472602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.042662063621771204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001398060091307213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.028480082558715494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000844119944335723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.10050917437006718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013279033129049013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.16941427074493215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002206464398514822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.11635297169383609, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013711527236179993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.12419910398685438, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001754709084087944}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.20345233530607656, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026368733023479575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14255020342356567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017856582246053023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.6714920428051223, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07493178085444238}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..05000384930d2225beeed3d69b59c537328fed95 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.1554086212544319, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002067211936039912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.2544322573521774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002810922596892263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.17865057487457384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019832213339117484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.037714132074904975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009374766307825127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.06361946862872042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001591682210752721}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.04342414284752008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001003062599440997}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.12334722471039417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015067280447067734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.2091785533285848, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002327818378218966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1435215312656172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014687451140009584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.14357511331593745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001917549831974918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.23666987395605485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00267335114823891}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.16537912003845628, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018481029569662097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.6275735827595286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07295788379483721}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70fdfe685dbebcec13079d809e42866fa737765a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.13637703415923158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023528280402658097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.2101854175375818, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003134663411489244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1487528972942182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021573858174289036}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03442882399419072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001085658422803244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05416049539107172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015569213296257193}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03703687600072442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009586366255869859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.10957513679331089, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018197153562858968}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.17454640678104377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002643517915284805}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.12065017013720424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016584003927085527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.12614818245454176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022002630882711474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.1951306252568764, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029575901472317177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.13759161249237567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020057670100919424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.696717061739458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10004414748713579}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c961d0abe347f3043dc52ce3b6f8e3ac48cc2664 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.043639000260860615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017644092166845472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.06690857706863987, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002513457134572579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.0460595175107433, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001696044544780373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.010977304219473067, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007111284641033093}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.017770704280928078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010731197359821348}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.011604393750423206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006238601062608368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.03598281883079817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014505603927416834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.05685382605643658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002163845582643641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.038179249417304884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013818818185200492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.04044545116309038, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016519202271058225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.062070973864341086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023531363135469723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04260911072473102, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001575090983128174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.45244912498308465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04522890822214683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..51a93d405df490de2ccf21e10e4e4ea66df953bc --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.0070189779398907685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008113662184147494}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.009756903947301174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010491271958210223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.0070051523902687545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000741818765951434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0018834961678454644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003137323371258015}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0025948535559862265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004145432652310791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0018773141620031116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002924665783736417}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.005576197279787843, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006409300377598502}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.007827196064222617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008343635995047826}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.0055660827324147755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005823170803562475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.006493130647527894, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007525468046300129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.009066194317283121, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009684333729492796}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.0064846745016825785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006821624022857435}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 5.251000373867168e-08, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.1017248657586459e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0c1d2796e30f07f39cc74bf50527a5998b913569 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.11663618257355224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020010876313614093}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.17750836507313886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002420755888491993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.12655297110315525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017223356296797345}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.01645523569334117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007281743616814746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.025103051974347854, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010138593670258253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.017208857113077958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006040791246522063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.09528496641012617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015116946024390716}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.14985724685972096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019870170906742244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1047005055608947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013001810997754814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.1070586643291757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018157291974967367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.164451689142304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002238195137110574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.11665980577339671, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001570131237321446}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.6707040144799241, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03623222494925353}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..72a925e64309bfcfca8d9dff70f7dc122d214e4f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1538305631637577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002118137735001832}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2534164331356793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027384226381832084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.175746351793284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019277453775472107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.030776430413685576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009835988394066894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.05014575490604494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014072381348858034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03420510386300404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008827127864692935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.11549602941828332, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015598015652852524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.19577461014108763, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002133713428500096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.13274190810557143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013509301692574175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.14403643967460825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019822677694410597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.23804944003176418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025835044144481086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.16461109581502553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017958278888830555}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.7421869697677301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09072404254874503}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2303f489d30e2997920ec41c49e65f48a0eaf971 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.18530745879024665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026984526710165596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.27372904579227686, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002825062477476198}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1952096816483372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001995746852085018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0457473960215223, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014209292446018405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0647678078130054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001616150713511116}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04558123756721243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010186767133771723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.14324298206218397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020861451808508974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.2174336811042295, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002306599328778544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1516146243997318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014416996193126431}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.17310048096253244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002540880696334963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.2564277940851817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002676735414961789}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.18225288248589572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018597154608296434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.552370721219516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.056456617033753535}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3cd68ff48bfeabe8ca9011c38671b5ca73429365 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.16655273761856884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030693164785274346}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.22239594480968503, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032442335330023385}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.16311913030851707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002334208239517657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04426024560468626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015913171237693896}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.05557686275427169, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001569025220107771}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04051754398323109, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010700958454151245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.1312478956516053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025056286449919598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.17774125142496916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026625038105766233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1277212531982069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017662200246993489}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.15605200382139137, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002922128458943474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.20818681432495342, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003071577904437599}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1523177208194968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002187292192524935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.8903876098590975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08033021470087338}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5f18805ba0f9febb7bf28a206ab590621a5dbc0a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.05445104964004579, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002443079649660942}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.06600911297746138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025520971880013655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.04834308907312041, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018503707685198511}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.015529728785536703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011838329122922067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.016747064756245943, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010305687534927282}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.01222026826951555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000704325225575219}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.04394332257272588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00201438218193949}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.05360636578416732, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002104351294157986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.03857646230433444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014595106562382601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.05112939118161437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023090030630797185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.06163586533175658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002391851276162087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04516151790202869, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017340898610069717}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.3208877330233361, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03494147074048822}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..10304e48c2d5a722e330b0327a4374f25e56761a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.007010026455738011, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009017435483523394}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.00878867615147148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010157525862754392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006459190800150568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007207860950963499}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0017497370302994892, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00034838975656659773}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0017941002253548174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00029001602890075666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0014447433493777688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002371563408671573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.005424480425576281, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007093535726255014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.006810607613407806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007726864270681841}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005000926315966357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005492449452936106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.0065291661542634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008431825007789647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.008127880829555213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009349057417684032}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.005981178381897325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006652187976853396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.642808156949046e-10, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0958818603819076e-09}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..abd05be0b7a605d1d8277166c107e1715299a3db --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1463073350919793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001849976731473478}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.24687692205659445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026245416604448834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17104966216653122, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018388410541572498}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02868070817665277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007455100631059865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.050970339370236116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013773850589296726}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03393757157227001, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008395819605445009}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1134975332744349, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013046086784248739}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19882847469371265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002160779528818444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13431002294362815, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001329252784566218}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1341572569615249, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001680913399139742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.22735010800139077, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002429214690741554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1570085078363825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016723834304551517}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4552750330972206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05646997136025405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa4d2b945c8c93babf7db143c2c0a3787de87dbe --- /dev/null +++ 
b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19104818364291273, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022212767005973058}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3165545579856358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002818835296854103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.21883314563862602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001968035833203396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04737338234449901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010449765003741809}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.081573363995006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017297723073854117}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05423182118294372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010465141353409224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1352656471757331, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015360042939327185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.23168675674466935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022572191515858007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 
0.15600647601501733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013358035921943341}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.17973117727215357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020830282164021902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.29889898054403574, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026791790500151685}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20602296990458846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018407304080033926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.807870663958701, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08388649431195123}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9c891b27fde893d6560127b7a748d0e743868f25 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.22091940935835946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003036544780507811}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28592540020450335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028268458929152375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.213319730404769, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001947942927707738}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0625668775391695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001863481800778566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07636442527094099, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016705370533154184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05619324678157442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011378174233996204}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.16524622277086715, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002504977446892061}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21312172953880804, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002230572391304009}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15707830961714367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001424704542577375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2088533869488438, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 
0.0029209986594073956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.27031759901723496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00269091513844755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20133540483239756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018432855451969799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0758182963274967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07269865886048295}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..def777a9c5fc25ceec71d5bd8e598d1515c5a990 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.21388175415191102, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037807201763198858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22109179833036308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00317048229219639}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17690311487463797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023173541475324253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06310561248802647, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021350241076741575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05983363177094545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015915281660158853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04775424867054453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011657814934947282}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.16656647324261953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032154715784240336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.16788106886771248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024768275501905303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13377606619329538, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017521905460361976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2032660883469589, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003645991164048245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.20988404520976878, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003033569663633178}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16765421945999662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002202375233222233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.2676357062273516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12290136548273946}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dbd1635ff59236b356fde4cef10bc814c2776d1d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07217489085431607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003030440715398035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.06719640775054124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025953918642397556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05381265634572307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019615926895963105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.021665967083708774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015098116987345535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.01860172806438717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011134284906677621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 
0.014607128479951145, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007893617726734669}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.057955761496930415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002549676328226009}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05288705994275681, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020802147270799964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.041897511040161206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015198623386063986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.06845672227370601, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028945029688266562}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06371644081837961, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024692144864177654}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05091825334557307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001857333758746474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.3207302415446247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03951812995159553}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fc5239f62746d661e27b5cfbfd582c564909b764 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.012746310259293975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001498370110228753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.01036719903184163, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011041663951146658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.008600982975105755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000871586481245996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0046837043167768724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000851048694914921}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0032041325852383415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00048168285623642387}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0027143726441978717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004021894165602589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.010658331542732118, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013171585867806077}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.008285275201527513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008892336234217646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.006953217638177468, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007188116843350712}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.01225616498642844, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014511703007000516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.009942438567258063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010638347073576458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008236010332581721, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008379564793264335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.2490869778774144e-09, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.579875964652809e-08}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93b2d4d88b7049bfdd56432f4252212f041e1fa3 --- /dev/null +++ 
b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.10754772441287547, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016958455385689898}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17816562206177886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023545814766439887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1242136099389791, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016788768510989007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.015527269147006644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005591631732568687}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.027484473420738165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010336047843406612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.018163595558952392, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006156401137280107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09256520381779812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001314822114051182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.15709670414112983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019776606072259206}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10783351661135016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013120711524955912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09949748623454367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015502911503945975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.16623114509571557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002196915069674266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11523321457897195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015382866944637125}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.9470864610845786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03249798513588096}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3eec52fc8ed934080992152c6ffc205c4616f7b0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.10903330571413737, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018189243088536072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.16131809640403347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002537055300872018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11703392994984627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017519304821591252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.013663796459003277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006556047677219756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.02186335600606571, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010597085086764856}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.015006978500477466, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006493711106973352}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08286088730918939, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013035354194490178}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.12527349544952351, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001902974449846937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0890622266980778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011999850822209978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.10274455693959166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017065171168041343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.15212866545305415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023755694730805688}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11027318467498079, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016371611741646582}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0004698111272452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.061814579441424315}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6fa8be57a5d9b6947bf0ddeb8b7e0792c2a781 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.09690581008488926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024248475400055754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11326850686300136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002336894022718304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.08849504604578734, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017680512462351341}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.018300502482703724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001212414444454405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.01947166619266159, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009643040730582587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.015011655554296155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007047911945416481}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08087122864775938, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002027221626489267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.09562923022814045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00189683385928486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.07359670365804201, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013609375455583069}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09100233183422309, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023243523379837595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1060183257047995, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002210133291756926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0827072308194733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016679545909741406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2275559960359324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.055602030188377644}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2fbabee803137c170410532f9fb9b6fef01b85 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.08727403229824231, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002754747362815702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.10038330296542393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025674911073591644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.07581549153450809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018567329894756907}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.022062717078419312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016202827842642569}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.020684116383698558, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010865171767872877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.015436277594316166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00075744970662451}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.07371123650658079, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023770899696980465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.08513597314680194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00212812432731865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0633376000334696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014673941274429858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.08218273785211457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026325580514724206}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.09419184346305906, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024294175505879165}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.07094827851387892, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017436711590059298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.4434859465762546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07239736975664855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..261b88394ba7aeaf10ab4ae5d3fe5d16a2cb6d22 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.02122229087394324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014003737821723116}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.02652989886804251, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001520146131752233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.019677720071019873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011247541319417723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.004726681009885704, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006457139194226688}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.005360196615771528, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005369786057079111}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.003915565527437388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003687016348744532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.017594475833257826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011648478992631079}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.022309854870142254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012646138680947798}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.016278889399882698, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008990662478189324}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.019818945547900205, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013230955041361396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.024765412779247827, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014222742525507647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.018303637479182004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010484815874735873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.050241534910141376, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00739529390509812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..357d7f1b38e0954a2f50540d982e00dc4f683e15 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.0016940318129783684, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00043807301732911447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.0021116213008436154, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0004157235401925989}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0015455675621237376, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00032572665041987446}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0003818107289535861, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00019345670461531905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.00026797247013965285, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00010331614803977232}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.00024341445452042402, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 9.580449462072127e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0013961176573495614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00034646394432675024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0017757153532734638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00034186622977328717}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0012780986139275944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0002518502465672378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.001588174661722081, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004292160855573092}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.0019250318967419596, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00039002719041384033}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0014218801930012729, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00031119324514029866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.862729574232818e-18, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.585352301614098e-16}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7e402046ce0fea6a2319fb033730d4e818f0b9ca --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203928}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8a9b7c8b5ee5269d5c5e0f63e626a86a66abf2bb --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411244}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0f0d7b9dc5e3ff85dd7d0bf2f64923a019f225b4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..673a0b6f8b746b64cd21e2ec79aff5bb77b3933b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.347, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01506047203170662}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eed6ccb0c258ad513536c4af12953e17a4955c9a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928373}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732958}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9ff3a0dba775bd155608254debf9da29507889a1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620335}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01483050720454103}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..69ad501069d28d879b54a9b754fa2a2fe4e53e3e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.34, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..63df63a198369a3386a14b23011834355796e230 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac4e736d743219a4e2eb41671f7248f69e597db --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.358, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015167928865407557}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.351, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316407}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..99c89f1017857a3b9e0f62dc891a34552adc25c5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.358, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015167928865407557}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.358, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e7b95a96af4c62347a71ba6731d720e346a57fa6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.354, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015129868238451775}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.343, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d89591ce8a1567f386d288ea9e7d62c87d4dc13a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.345, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055235}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811478}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7415c11cd9ff00a0f312da7075e0efa6fc9d20ca --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.358, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015167928865407559}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..16812b28e12e182ac5bde8af94a25595b4110439 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dcb35a58c9fefdf425f6ec7ed6f2cde7bd985605 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.361, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015195720118175115}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01506047203170662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fcaea08ed1af37bb91c5cbd310974cb793024291 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.35, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015090650341444233}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..666bad2b6230f31e50604a69118dd3419bf4e094 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.344, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015029633724408947}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4b7cf7b6b44b5c3e2e6557488423eb0ed450aaf6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928362}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a581cf812c5a9a009d4644c328cde9921c7cb9aa --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.327, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411247}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.341, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149981313484027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..18e528cbb922341ac2e191dbb8b99515588b10f4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811482}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0bbf97200d7a8712a382f091681d5d3951bf4e07 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928366}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928366}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..db9829e3b7039b8af861419c12adcd3875ee6c06 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270334}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d9d3535ccc33496e458d2cfc5849b13a41dc2c98 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270334}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..678c8bc89f0624d985970ac2cc84d74376c4be88 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01488827258820394}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2689b54def09c9491eb5a9a8a54efb12976a19ca --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.356, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306625}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4cb1117d05da6bab83cc972ab2cdfcdf98397d53 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8087cdebc0acdf46708c24d67070c8b79dacd926 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.358, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015167928865407555}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224473}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5f18048dba28d0bd1e81be8e2390ac53a7518ed9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.355, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015139491543780529}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402709}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3fe1b3151b708e85ba59d91c6be58aea748bcb2a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795027}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01483050720454103}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..716ed2c23d2b2263543d0da9504f4ea4a146ac1a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653605}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01486539538592837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f808b86c05ea5d273d9d4d11b4c0ea1a7482f327 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.357, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015158521721486767}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5eba85bc53853eed7a1d4adda85554466be88ff5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.313, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01467127282297788}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.322, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01478291360099668}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8254f6f829427e69d76e87325d336e9e69047ffd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229873}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928369}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5616144c72d0073c9f0f6fd1ecbd4e016805528a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.337, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01495508791865359}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1dd447f20c1baa35869408e89d05d424846b8b12 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.307, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01459328489285263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ec178b50a19a3ef865108599eecb2614884aa57b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121731}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.312, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93b81ca4f8bb58e9dca6034adcb16ed3e9b6ea33 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5110c55956bb1022380b2c2a7801fd8f4e17181 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0795d7c240fa60df795aec749517882f94f1a81c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.329, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928374}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.323, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..389918118fb22c86f26e1d814dde09923c9bc420 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.317, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880215}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e3522db8f6f259a68a44531a894c387db19a6a08 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.314, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087964}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.304, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014553205687950436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f304c382003af99bb9ccd4f5c2d0dbd6270106c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.304, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01455320568795043}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.324, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9d24a364f6095ab47ee21f1ae0f0ea37ae2de80b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.35, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015090650341444233}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc1bc8c9867dd70ea849aca86450a21d4a72c9a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58b484da505c1e3c0583deafc27846df27212051 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01479492784334863}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996692}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fb8d823d7c7c471dce890fd192b72f606b890391 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224479}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.313, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977881}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d689b2ded6cce7f2dc57e1e2e4f69d6499efc164 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732956}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01475865230357489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..64fe83126f675c90798ce6d1b1f0f47c0715aede --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738863}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01486539538592837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3d7c9eebe9dd2814df5c6aad5aeac2f5ff3a1840 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.34, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363935}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e01d77486b025812d6eda2406d9aa59c584c0eac --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.314, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087973}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de74fbff5239f9e98e2a505e35cfa2df717b1a58 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014632638658632895}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.319, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67a5a4542670ac81a13ca683ae4e55d9c1e370bc --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574885}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0c921c956b067940f0e61be952b49ebe296f034 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541038}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.321, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934645}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..76f0e0e8c32fc39a9b3a505d9a05ff33a0833d6f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811485}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..146aea526935c299be6c4b02796837a884460e5c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620342}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7897c373448f038d8b36e93fb18bad6ebadcb9c8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5903cbe9a3749ce6f9d301393e5f4f8196197381 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348632}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574893}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5be8b3f1c99f4a54bb800d60c01ed432c812e532 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928364}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880213}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..41b03e4e5f67bdcce454c5e0c2b97c00d845cf44 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928367}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057137}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3390af7cb18fa92faeaa55ac187c3b8daa646e3d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880219}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c11fe67fe289cb3e7f35460ed12cf8f0be1449d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821476}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.35583333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013826518748493324}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..955eecc3cf69bfb331e5da5c408802ac230a5629 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01363087184382147}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013728421539454872}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f212add9be8945774a993b8ef499e98b1c2ebe04 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01357953127780092}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33916666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013672343491681822}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9063493fe03ba11513cb05268e01fb3e6b1f94f1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33166666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013596836729485164}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3433333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01371263383046586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e31903228d57e2a163137e043f936fe12002c031 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3383333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013664144006618271}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..47221095b9e452892ddf4c08becf9b9d3a583d9e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.31916666666666665, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01346230971200513}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3275, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013553211167251939}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46e4e1cc366e6982fdbba3c850afb2f252ae5496 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932877}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013508372867300219}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8d0bf3504ede3fdae5245ed99593a3449036cfdd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..815c4fb6caab1e981433747c8411a3a9019ed568 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300219}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.30916666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013346684134591948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d694e9e5a00fdc52bd57185239eece69e4aeacb2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013452948996996296}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2ca7774ca19f33cab9f637fa7e1d65918b657a4e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31583333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01342456883035645}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.30333333333333334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013275870057740436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cbc679233abc8f3439a48933797dcd4b58fbd6d8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01335659633120026}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31333333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013395739415639082}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f88a516ee8e366536a25950ff19e28387c997a1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225603}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ee4f0f8a09488b7059bbee08fce510ebb3db5faf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7dabae26480431aefcf7943a37d514aac610bf86 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013526454480351021}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01357953127780092}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5fea0e589c4fa38942cb1f4898b0310794088a3c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710526}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3258333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01353542204341746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0c3c5532e3ab9042c056f0ce506b8e1bffd79a7b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31583333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013424568830356453}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013508372867300222}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..90748794844048a899f874d787391111b099bf8d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013376268790982096}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851555}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..72ae6ac6618e2c4a95d8e622f898ce1a89938db2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32083333333333336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013480882752851555}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3a94de6e082136516854f983f3ee389eccfa1d7d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932887}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01364760294240639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..48dad0f9807549367c4e66082ea7ccca2179148a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013471620929769144}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c91d8d04227b41c7f10cc3b1e59bed814d7b5268 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070709007}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013579531277800923}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9180c7837d2563f36820a43fc5419d84dad30758 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406394}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3416666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013696658778002515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..430f08f3748adf996ae1f7bff867f1c2e758b1d5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618263}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.34, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013680495725767792}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..214a4a517e151ee560ce8eb52e1dd844b23a24d6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013696658778002524}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..721953fa481c36d00e99204bbb9a11ba41eaa20a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01364760294240639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..639447b3e7175c372642e62f825836f9778ce3d2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013526454480351021}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..666ec62bffe7cbfecd5b578d01394efa790d10ef --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3408333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013688600793296936}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..faf81c9cc24c40cc0754a0966b0742d96e248782 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3175, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013443538681348054}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01355321116725195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9cc19d3f8b42b3149ddbd0bb808e5bffc650cdb7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.30833333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013336721143136467}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..675e0c326044c136f030582918d2cc23a3c28e5f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301829}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301829}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..845974ff27adb959ebca16e6570850730c7cfecf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01230492841874761}, 
{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01230492841874761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..180072891e0e6d405d6390a67e96360b70a4e2f4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01266819862131543}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01266819862131543}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3ae11edb1d21ca898f880a3ea80575839ef834e4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2551194539249147, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012739038695202107}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2551194539249147, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012739038695202107}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..009724618136c866a1459f1621faa19a8cab5d09 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012653835621466646}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_5.json 
b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..53162ac99ce7ba3aa1a43c6e5c9c673de57281e7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587092}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012536554144587092}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d7e98959c28e62ea0395da2687921790db4bce6b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2551194539249147, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012739038695202105}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3046075085324232, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013449522109932494}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cf454f77799f529232e93cb45a03d2ed9fae8298 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2593856655290102, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012808273573927095}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7775d78df75eb4e7a68475646087734ecaa239 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012668198621315435}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2841296928327645, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b4634c836101dc62c967f5988b042d2033a9e832 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042967}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2738907849829352, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013032004972989501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d226fc7efc5965e6f80672a9094e943fc8bff6bb --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", 
"dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2773037542662116, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013082095839059374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4ef3bb84c57c4ac8da527b8a158fd8e0c5fa54 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012696728980207706}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2773037542662116, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013082095839059374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at 
end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e8e7d21b80d14d7ff10b429e1e2f04dae91840a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012368225378507135}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012915774781523224}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..efbce1c600017452fc5a3efc525b088e223eaf9a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301825}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which 
one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012928933196496345}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad16c231032d0e30e70811ad7844453a16853d1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587089}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012794553754288684}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f849295996353f2a20539f58667e483163268f50 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you 
tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012445770028026206}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012794553754288679}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eb4c2a579a023daa4cd3493be825390b9fc55e34 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012506564839739432}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.25341296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012710896778378607}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..4f9160c7e31e1db6332de3c9cb54dddd27c2d8e1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24658703071672355, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01259572626879013}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2593856655290102, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012808273573927102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..61dffbc292765a23556baf6c818b4772c2bea408 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301829}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301829}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1b89ec6538bf8da356f11d234974aa371b0ea43c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012288926760890797}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012288926760890797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4777a021ed32ea73de95c9ebab98c4e8cda0d15d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012624912868089769}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 
0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012624912868089769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..990fc592a97a18615efadad593e6f52d9d504cde --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042961}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012682496334042961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..91e0239d96690d0f039359105498517cf398a295 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24061433447098976, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01249146853239057}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01249146853239057}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..179c7c6fbea2a44d0194c8bd30ddf4b760b06d62 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012581033453730113}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012581033453730113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at 
end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c288607e0e0864635840b6baaa49c42a2b0502d7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012821930225112563}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2935153583617747, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01330725044494113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3faeb9cf988a4153252124665f3beca40ad5c668 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012551447627856262}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2883959044368601, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013238394422428162}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c7b140a253cfe3d1a39abc6d757f61af63ff4b8c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01255144762785626}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013094469919538798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef95a2640c779e2a47e58c3e15f9abf51b1beeff --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01255144762785626}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2858361774744027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013203196088537364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8749465620900f9e2cf3ba525511b845456fbc13 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012521593295800118}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01299380772754578}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..27a1cf3914dc033472b85d15e7d94c879446f96d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01268249633404297}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012862523175351331}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..426bbf21f16eea1b0a4ffa5ab89ac3830a4ed4ed --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008885233166386385}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885233166386385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c71f33f163878ec91557091cef9e5bd8a7905e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00880917174472056}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": 
"", "acc_norm_stderr": 0.00880917174472056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..26f8f61a04b4f49bb65e2f46bb0b0c80b0b2c00c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24116161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008778027378258021}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24116161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008778027378258021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e53d7ec4b2e07366ab26e9da8fc35d00a742c0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24915824915824916, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008875238553583164}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24915824915824916, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008875238553583164}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..93d73ebed277e73e091c53c33868d36938178d6b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008798836444222042}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008798836444222042}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f46bfa68e4638683f3207e56f240e4e0fa8cab21 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24326599326599327, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: 
{{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008804009846865536}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24326599326599327, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008804009846865536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81f8e48497767635ba414488a3306eb02e99a130 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.35395622895622897, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009812370644174421}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3253367003367003, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009613427708996196}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..4e9298f76eb9fcdf133ac5c79d57f0bcd2881c02 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3468013468013468, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009766326091716005}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.32112794612794615, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009580787536986797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de3b6f0e39baea024eb59365d09018c52cd7a83c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3383838383838384, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009709034670525097}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3223905723905724, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009590672908157438}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..891b06207af8da9be1ac421c3b24f1778f321d27 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.34385521885521886, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00974666058485245}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.32196969696969696, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00958738669630038}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7e1bb2cfa274a997263e5438ed25aa8714a6864d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3480639730639731, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009774627600259014}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.31986531986531985, "fixed_answer_choice_list": null, "dataset_path": 
"ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009570821820573587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..721f85df7db9dbde9a444e5dba5cacbac62b2acc --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.33880471380471383, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009711980224301643}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3253367003367003, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009613427708996189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0fa870f6e41cf0164c4e41e94861707219528da6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2937710437710438, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": 
"8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009346423298166725}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27735690235690236, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009186490105111899}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b55f3fe4a867c0d98a32fd61a1498c9073ebd4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3253367003367003, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009613427708996185}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.335016835016835, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009685160765932363}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87b9f6bf40275c5efdeebd6fe2ee02591b8bc5b0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.351010101010101, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009793703885101047}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.35563973063973064, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009822854395535489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5195e1b804e30bf2084b41bcd7a46043288ab4d9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3446969696969697, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009752321586569784}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.36363636363636365, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is 
correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009870849346011769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c2429eec1e8031a23b49f2f7b3c3c4a4dc2f692 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.33754208754208753, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009703117820790301}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.34385521885521886, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009746660584852448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..48121ca0d36d00e914522cc4fab8f27438081d38 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.33796296296296297, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is 
correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00970608053863286}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3480639730639731, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009774627600259012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f47996e039a7e30b4fc6a3b7c9a3442a1f083dbe --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008850055161459239}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008850055161459239}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a9dd8df6980286793cb892c8de3202c6bea4db80 --- 
/dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23905723905723905, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008751754723580432}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23905723905723905, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008751754723580432}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..665587ab644b2acad66913f93b43b73f4393f010 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24116161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008778027378258018}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24116161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008778027378258018}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..27f82964149de33bd8db983d2a2998e15a9450f5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24621212121212122, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008839902656771865}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24621212121212122, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008839902656771865}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d82f558061b575984d9bde0c65ba8daad640ce74 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008798836444222037}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24284511784511784, "fixed_answer_choice_list": 
null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008798836444222037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..47d4d1bcb5133fcbbb35df04df77727e33b80f81 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008885233166386385}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885233166386385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..99133cbb27f89cec3fbd689537014e98ed47b33e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3501683501683502, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- 
{{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009788295410093153}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3207070707070707, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00957747457110883}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..63d4b8caf9657363e9766bffbad01dda5d897f7b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3425925925925926, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009738105469984187}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.31523569023569026, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009533589368505848}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3e159d8eda1889792ebe44c9c09af61728c80035 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.335016835016835, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- 
{{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009685160765932357}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3202861952861953, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009574152668739424}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7c7e85d84af0db3115135be49fb7e290ebaf7628 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3400673400673401, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009720765494805276}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.31902356902356904, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009564133249441085}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e348582e5c0f34f0a134795b71313c5a3fab11cd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3367003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- 
{{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009697166595752472}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3164983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009543851857323888}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..caedbc808baa4dc16c39d1218fecb52687538c39 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3345959595959596, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009682137724327905}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.31607744107744107, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009540440071928285}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4bc3773a6beb08b854a112526bd56872128c799c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5143333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ 
question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009126478842204577}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6296666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008817866528166162}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..310f46a05f168635c531533b5c39ba61248900e2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.493, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009129336317272389}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5726666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009033293159951217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..00567f4d80a5beafd53b139c9a64f4e43ab1ce0f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5063333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% 
if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009129498646958133}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5886666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008985524690229492}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..60b39503ffde467d4617f3b0ff81850c2173d01a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.528, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009115903679831517}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5966666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008957972256087354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..01cac313d52bcbc60bed4cb07296957db1ccd4e3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.531, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ 
answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009112665923139411}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6066666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008920048383377182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..91d9cbd04087a0e6e41c90d2375b886c6282bec9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5486666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00908687931270849}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6083333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008913348354532974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bf88521bc214606246624b56467d9721e6235647 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6233333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? 
True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884811049411477}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5203333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009122678313140908}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d54603901fe9a92a9fd004a4a3624e0b29196e6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.546, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009091509877386513}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5413333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009098980657278165}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4f185091374483d7087bc3a8475721b94be7443c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5836666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00900149831714761}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5663333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009049526374650793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e3bfcd2a80af89cab3e2f2e2b2fc799a78276aa2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6116666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008899620943397692}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5953333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008962735560535848}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..169625dfe90439b7222e285e91eb4692abb8d907 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6136666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008891174310695492}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6006666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008943269429955153}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6630cdc6da7a8ddaaf7be41632db31ddbc6bacae --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6126666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008895417372116209}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.603, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00893440584870012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c0243c98045c5a1072a219521766da4d9f289956 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6216666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855801251873015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b820143b45017d9903baa228499fe1e3cdba42b6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6096666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008907909838637953}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5866666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008992028793524417}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cad189b1afc6b47d3eb4ee9a2fbc6868d1cd826b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6033333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008933122315228996}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5933333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008969751860881005}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b5e9072b0695912d17feb23e1d3ddcb8dabefefd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6083333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008913348354532979}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.601, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008942016171856509}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d7d3be9db684ebfaeac2995600bc68b9433553d8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6133333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008892593055774285}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.607, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00891871708850756}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..924ccc8fc368bf8e3904f0984f8d30d835f6b437 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6183333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008870849530787627}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.606, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00892269792043816}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7e1bc1c810e9acefd212502947d714f554b42e19 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5753333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009026006087500425}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.411, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008984425782182318}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e967f81d799285027cdf690eb435e826da505097 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5676666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009046234144187917}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5483333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009087472531749428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..56a75191656c636e3330713dbc2764f5908435dc --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.593, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008970906255948529}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.57, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009040312075041279}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..682fbbf1b87602f3bcc66117d4ad7f0ccf6546c8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6066666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008920048383377177}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.592, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008974343780026192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b690153e76f6a80d39f030f97543e3fcc1da7ced --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.614, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008889751171543848}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6006666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008943269429955157}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d311c9201b77483b06ce04e009d12c0a6a408161 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6123333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008896822947561613}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.601, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008942016171856509}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17738f48a9501df34503133bd56bcd94eaf4abf1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5276666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009116243039079383}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d37b20d6550b8461fd630c65de025fd7c2dd41bb --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba434560eb355b970cc3e169fc2cdc5efff0f77f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5303333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009113413981658816}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5826666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009004578551254038}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0ed009bcd85d7682abf005c72aa35328f5d0b24c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5283333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009115560243539187}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5786666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009016519157880409}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8f92b0a514f534e5aaa24d42729beb64972984 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5186666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009123866148533357}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5753333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009026006087500427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e232a8e08a00d5bfebd040fc9ee356908150727 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5196666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00912316564893404}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.582, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009006610887558775}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a4765a2da642b86b440bbd2cb79c8d9246cb8ad --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.22456964006259783, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..37be188ac02d047c480dd2d3ebd6eb41d86ad61f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.27045454545454545, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c4480c61360fed6dd360d7eee70e886da41e6ad6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2613756613756614, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12b0e342d9ea6d60e1f0a7bd47c47b0b8cd9e631 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2489177489177489, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8c1683c1b701819f52dac295ae6d154081d7a40b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.32142857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06297362289056341}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.23462970093697855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1622cc8b233eae264fedb2002d977dd7f6862a80 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731725}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.25051020408163266, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1772d72f4faa3dc1a2cde8edd856440df1900739 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..339ace715ba9f2be38a9c2fd011af2e44491178a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..21c38405dd45204a31b4006f72c9aa93818c2251 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.31979092421002614, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3270c777785b7278429533ec99cee9daf4d4a8a1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2887426900584795, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5680363ef4452242f7f9ab557594d3328d11af5b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.301994301994302, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b2d0b07e64d588f2e8a51c2b9fb1a65422cbf6c5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359542}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.28651292802236195, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1c3b74570f0f5748b9c8f3cc5fcbe8e6daba7f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.4156746031746032, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..acd0f99955211740659e0071c8bb832b28ad1754 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a7bbf5f6ad1c1c4017c985446aaa0e47a1f9f02 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2794380587484036, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a4ea1d567dbcf77341367908f1ab6d5f80e95f5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06585388898066351}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2772108843537415, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cfacd1d5e7e47606946bbd9955e13464de36f298 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.31333333333333335, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e8f1a424793857083fa19dff3d553dc78eaed49 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3113354970549345, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9b074d402f20038cf71d979f5f31f26b0a3bbc8f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.10714285714285714, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0417053005800816}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.10352728047740835, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20d2faba060360ec07c54e8f0059ba1f41b883c7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.27858293075684376, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..db0a82179c8f8744fed48bfe52d44afdc5a62030 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.25, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.058387420812114225}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.21626712849026222, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6b0ec313e8cc6ae5c21d79b4168fd9bdd93dc0d3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.14285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0471841613625583}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.13505848989719957, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..56306cf3188b23216fcfee52701a47091aa1a335 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.10714285714285714, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0417053005800816}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.09963985594237694, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e821fd076608e9d1ef5ffeaa058acc6f34585618 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.14285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04718416136255829}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.14384662956091526, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..48fb2fc0ba16810292c1040deee00d7069cf2722 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644648}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.4046085858585858, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..647daa00460fb86262312ba4316b2ffa587816a2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..177875ebddf29cd9367ce9904b5e34a49ba7e942 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2794380587484036, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..973dd6290fc4e1d952874bbeabce9e8e4a598fb4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2596413657577991, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c823378e23f937ebc4d3cbf8c526ba12d83641fe --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3173681664247702, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..89feaeef38f1056faa984315561bf6cd58c9adda --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3143399810066477, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f0370eb32c153c8b53043064579d68122a59aac9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f25ca3de6b789809f47d9eaa296982ae383b3449 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", 
"dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1a31de58a3dd3dfd132ac693c6e422e4b7793fdf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.63, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04852365870939099}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04960449637488584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..01a3bb7d6d0c2542e337c37c09d2a33128c16ce0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% 
endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.62, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048783173121456316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6a61f984cf07e1e708f9a609bf5a9dc2aae654 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.62, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04878317312145632}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fa24aa26e57deb1ef616e598f8dcc42bc04ecbf4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.58, "fixed_answer_choice_list": 
null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ad2361ad58ba536a30de025cbab56fd37bf74dfb --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..9f41ea7115906f3e7d8943ba4ef23fe570f3361c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dfb7c1264827bc5796e2405fac5b77b51b6fbb5f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5e4a6821a32fd2d5abe3372e5701325fcb78b05c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eabc31fd185a58e34f0e6f0a3ccd9378208bbf50 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..792c19e843b9f1bb6f1a643c251b8394a32e32e2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..27b38078d365b63c03a5c0819821b089d2e649fa --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71668913430fd9d3907b4bc9733ab5c85e594eff --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f51398573560c854eb7d3440d4f7e00e243a5096 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..df1aa6f4aa2379f86c193e3cd7f6c2586333d46a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cd31231c5cd787a0b8455fd3fba979b843eddaf1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c8661c693d7d80ca385314a5e7526a5abc4dca5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c52e4beda203fc30ce384c70cc9c2ad7c90a4297 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6cbf4b1388acdec1aa0fcf36902c8a72b3104d14 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..29683439a52c20a22d44b2643f0e416385d3f4b6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8be1f5484c3109c6847398954ef2e51bd558b77a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5b2ca4d496fe7a6064cb972326dffa0cf6b1d10f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e2969a2cb95204982f158b918b5ae904eca730e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b77024de24c58b971d4d8fb19ce7674bb8e818e9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c3dfd10aa90556c5265b1e1fa2c136024a8936bd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b1033c0a1fb75236a7639c95c91f7e3684a4416b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ecb47dd8e29c9dbd752e2dc2d7b795a7fb21bf04 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e002c29a798b8f455670413cd69eba9328581711 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..898689daa539dd639ff97b3da206812b75bb7798 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ba5d9628fb74078dc3e1f28046fbc1c999f0f4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 2.7427887623503127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05495550525224015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.18437430718819275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016081919163123087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.3251983558888638, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020138546152268445}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.22723646490231306, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015666486705066473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.056050363270124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008590677097817941}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.10050481850962295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014187911826345318}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.06946399430025461, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009869119968554082}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.1456094405733479, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012316335261671133}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.26113451142460736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017752376482140586}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.18067137808834505, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012648308698162281}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.15934149267059367, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014098113104567104}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.2813363867963323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018136711023303616}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.19654380094534718, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013999134084878718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..631500ecbd6216b83a80bcddeb8c7e1bb5088cbb --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 5.501660579699312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.059490555517424666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3112584786759358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001907448658666166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5491290672753819, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002745962690555782}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.3871051829172928, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001953296778601808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.12796428764311324, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012052147710974312}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2325200457741587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002130744747415604}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1605728054943973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001412706883124633}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.21648135721974734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012587276826098216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.38988924141191994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002309000377521498}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.27100478266727807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013681493938654708}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.25574830221838724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017155621473719167}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.45162452980561607, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002604535523838573}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.31806770177523574, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018173275489533744}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c12f1e0af7c9223373c5e726adaeffa8eea9bf2b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.324394303872813, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10867005579792703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.33094397240746043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018564220392525137}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5631788704398346, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002636056309828146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4051242312515321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018087304910981952}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14418060422905016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012689215486412471}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25265308621087024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021790130347415517}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.177921888015793, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014368818296271887}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2297030175226026, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001353128997875971}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.39650377304869283, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023226005472347976}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2822165267320512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013886080174190089}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.278170389520455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017013879950176962}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.47403743423884753, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025306580301756973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.34056029442652097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017186859749549793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb5ee453ce889988e7918ff0f3559c289cc87566 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.023795509429225, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1045133219407453}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3281077630214524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018892313549911306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5565110906097839, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002632763132900043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.40121542494211165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001826481389337569}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14595032198106483, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001332168833443709}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25433155482778347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022419507977446606}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1797184980766115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014978605293431552}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.23006830678751594, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001412084408622655}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.394784182200868, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information 
given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023087386618001874}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28212439353399954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014319999333914978}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2784043771811358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017584082669023667}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4725249414865908, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002566203128742103}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3404036668863611, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017688250393426252}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8dfde362ef767005f1d7ff2a2d6f51c3d33f42fe --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.241056271417146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06584261856760189}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.32573683579768875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001877843583143733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.548887629470057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026299920384990267}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.3975971671274264, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001836450632172448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14525291203480517, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013572915734951672}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2508152581679459, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002279211546178668}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.17842808573274627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015395035598836696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22958828009634885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014143451756674962}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.391229276944502, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002334722588500979}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2810484863937827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014637888600224375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2784229970473634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017664712981204373}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4693077141628463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002584144223809568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3398221063833657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017989152716216044}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..382a1ac95d3f5a02dfc886f63bf80f3a4ebe29ed --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.2465196884878145, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07468916464119668}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3255815725397693, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019032883471518422}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.546540731713107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025733832831453347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.3966890040503588, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001815803597852792}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14519398169659237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013353231399095156}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.24987277938742591, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022140552438889268}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.17801029501851723, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014940190331615374}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2301383140529255, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001418389584367908}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.39137629197532353, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023042337566443194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28146458367847593, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014472139059049296}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2791614306561805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017928244822471586}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4685579869769028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002529144473225649}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.34001582427002575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017807422041094688}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..fa8cc3ff1667795450d03132581e74d6665582a2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 0.8753334130290518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0348620294223961}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.12180299903557583, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011144789606423104}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.2607151270005577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001833065327541976}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.16280348551957885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001309447781869726}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.01683964413978754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005356789210692077}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.0356320865210295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011364294225061037}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.022387020564367744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006945772086659809}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.1047466751047272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008761170348141068}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.226582446043322, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015936684448292061}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.140452983092435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010520758669989665}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.10382518824250553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009416399281524375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.2242636517948043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017031410291911053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.13916001785403956, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011384861820446248}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, 
"batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1d33367eba8b33a783a9e5e4e14f8583a8eda51c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 5.721366546161991, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.059879890672636354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.29934120585636687, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00167831396994261}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5469683642207921, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025082833435656686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.37872230003166957, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017386604468687269}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.12496074032642647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00112594043655694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23529069358454732, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021079015900675175}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.15944824307809596, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013532088062737225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2036609095228398, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011853406607828087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.37837767528899835, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022449611252738045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2590184966950011, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013377464931078383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.24947318226454393, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015365533884932646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4560568485237131, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024137023011478282}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.31560678490975924, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016501630567473267}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0fedc08391ef75b2a4fd3ed632110642084e9bc9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.535547229648224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07998247245957349}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3043393031178695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016987372631724537}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.564230482567467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024769176130864756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.38702008716988656, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017370872647159401}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1329353387981719, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011973152327962053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2547108199869813, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002222655965723188}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17045094780052067, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014026421153863727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21068603957785997, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001216562219549965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3978790249309968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002297088221037071}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2694273591107714, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013460707667909964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2565183996017009, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015725570964798953}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.47610471055377285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024377798420915993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3262488238812611, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016716617044544048}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e62a16d3f25bd67bf8f4c43e9c9f381acd22edb6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.816658602745218, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04583016968843249}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30377319756142107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016392467233435767}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5686273285167529, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002510804564764257}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3878861536787146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017017932936630189}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13391657548983904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011760375900396684}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.25896717520218415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022974650442523993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1725725568885113, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014322789151160747}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2108908717196471, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011962828612559094}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4006206317458079, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002304730683692073}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2705133936819741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001354231331623015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25737446644019807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015454274177506696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4820507894942288, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002515972922796953}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3286981074887459, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016837415400218188}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b38806ddc7e5a3fc9d753e604f54b25ad1a78cce --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.979957732229111, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0659505224006796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30391913123253583, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016517877163181051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5698197681808488, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024983427107223084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3884017275288488, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017060422782565476}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1344501382831743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011696066678081112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.25994005857695457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002276014994019727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1732811817482548, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014176809818272833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21017112712931915, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011880019144919782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4004756665207533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023172780379295812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2699817385866637, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013509422774341565}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2577902147906641, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015458181802897012}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.48387236758259095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025093856044876017}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3295357685903064, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016750203066771487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d136651c11cd47ff42230b9c286055a482b6f4f2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.9467832258458815, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0713556533267485}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3040190125185005, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016439994643967335}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5691357371244624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002460959843459368}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.38816794389969306, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001684513494480907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1346910777243502, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011732332291231678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2605212984297539, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002269174033571433}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17347865411768018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014147739784681232}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21001184801420653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011889650722311973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.39991518737149667, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002307549465530525}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26961120657660587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00134699971864286}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2587096095734276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": 
null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015374119276061612}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4845073574529188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002456173501378085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.33036410242618386, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016552917619089278}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d9a7e4b6cf7276274c74afb4571a642791a91bb0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} 
{{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020684356751050448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.0008334849399141362, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0001358503424917359}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.0015622118728288864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00025395262482497686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020684356751050448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.0008334849399141362, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0001358503424917359}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.0015622118728288864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00025395262482497686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020684356751050448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.0008334849399141362, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0001358503424917359}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.0015622118728288864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00025395262482497686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1600f173e7bd48aefd3cf5454122fd4eb7f32c7e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.5565042233022994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06597968521354808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.08593898665872673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% 
set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003701585604317789}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.0887838872066671, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003666707231082007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.07140698262139507, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028371334110872675}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.024892824112996058, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011425344587644847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0371764329986735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017241156866971928}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.028616212681722628, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012853116962859368}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.06606011638251028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032423797227769642}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.06093221366523217, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002567923224143668}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.048987083153452016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019444092164217727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.07579156901424648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0034585652796348012}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.07390893983809439, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003072194012262202}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.059745065640798484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023782780189300344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0890da62dc02a0fcaa8f80eced9f1d320743f48f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 4.952955446585565, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14703261112875554}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.1775561426910298, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", 
"prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004354374307061146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.2112636167055307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004916752832733977}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.16985011321499935, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003824921158615819}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.061741721519856914, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016305676991503655}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.09131710409720983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00243568525413484}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.07112804192230661, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018368523862338794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.13080469299969233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003698218373012542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.14612619148208336, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003515094714410421}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.11698574747654279, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002667973913528021}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.15254844160782838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% 
set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0040016749319373825}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.175513921363672, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041508619445660075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.14129352369422737, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032206138029846256}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5bc5e64d13e852d5e3ca0cd679cfec73b9c7571 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 8.035933183438738, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16111218297605656}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rouge1_precision": 0.23042023299400893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004205272460982702}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.2950691553359754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005135544471290643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.23676635522395745, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003969984523240725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.08901699667028115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017692731972765087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.13239068456120945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if 
value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002720773239274303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.10309288089148716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020209463742805306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.16676542398506278, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0035028714409443473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.20592751042876095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037381203986915166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.16447844130188313, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028207459523148914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.19511953619785463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": 
"Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003830263451296712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.24484238681845968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004384904615208511}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.19647057857599984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00337544239864232}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55924d0fca71c0db49372f256f84f44ff536af37 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.838539717642748, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- 
endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1542199121945329}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.27122631402577446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004019032209864867}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3543225457053986, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004967042595033049}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.2846908650582164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003829987590059196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.10895746152226353, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017929065525348804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.16028390936179449, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct 
text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027130809496109776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.1257055751954671, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002028879799449096}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.19623445722856525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033607871400964796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2494589917112239, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036616398264334195}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.19965383022630442, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027610441762478733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 
0.22931849058794615, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003672760310574185}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2951186979389869, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004285864671929924}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.23705688169423808, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032905873586870126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf2ea28eabb67fec0e9ecfb71730d28b1339d8a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.887554411273673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} 
{% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07716295936766063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.29889933654446116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037338421535844956}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3963217358852519, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004651910319820262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.3191894028386597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003572075569447729}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.12313981096867667, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001764254452955877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1798873724588336, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002654894317462101}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.14176701175164574, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019854849759513705}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.21599407087504968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031436009861130674}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.28082966722369046, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00350910000314051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.2252484886653337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002627200495231524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.2524224681188342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034299667817147893}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.33082396406528924, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004058856021277282}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.26634567471821236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031018728642803}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fe77862a59ba440593bcc4e4e5d47826f2c6aff7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 3.3868925634521596, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07415023515490869}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.12781588961197554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016392738386146906}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.26686211195039367, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030010672113457175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.16968795558812888, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020520636735140885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.04754240795799523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001014281868832704}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.09732440743600192, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002035877095135508}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.06289750165250287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013193385142581298}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.11843660338175163, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014270824435811313}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.24837983245494402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026307193382184542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.15745602394794062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017868944105710683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.11138415788379845, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014602010171592388}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.23362654109870873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027185132812130603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.14802696104042146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018361037263776232}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1783a24f65c9c7bfded0743bd54e1a039bba106 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.656637998489558, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06138640019754124}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.22448431958439818, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015774519438982097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.475937695786901, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002823854240730373}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.29857293628895076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017883517178428313}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.09278798103068853, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010182785337770847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2039364568886483, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021426616098330804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.12449829406834531, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012625501065363337}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.18571244490415678, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011675141052374309}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3980265502028155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023515010622215645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2478615356783164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013407338809586194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.18244634491896064, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001437571751990845}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3876575760439037, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002699802672015727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.24278851384210426, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016843478489220692}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ee41d03979e8dfe1a4848b45f8b5dc57407959d9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.856901680068561, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08124041054587171}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2401800477102809, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014473576335462571}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.5163816757468545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027490552702758896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32161895837988735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00168146427395452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10780300493629737, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010069472486850257}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2413921319510364, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002257493837016372}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.14591205568832014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001294685527449689}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20140298531801745, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011006514010170898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.43694912141494435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002368493512084033}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2704562548681343, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013038206676934103}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.19780238487026972, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013643352859564666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.42666941865224894, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027363306527661384}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2651360409168281, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00165326573625161}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fc63a5b197973f4912770b9c2194094983c83809 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.067980420028392, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07674666618798165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.24119330473410452, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001436265091489619}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.520447176584629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027841731171524635}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3234918087831591, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016866852215600267}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1098856604345649, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010230282133462645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24765707981063853, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002344687079258296}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.14908085018598377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001325863965517803}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2028877822846739, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00110597560698619}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.44168375010036015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024210245845319865}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2728226991547497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001319038570084664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.19995751457276323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013597786817193271}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4331317668864426, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002771288232621466}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2684669151387224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016553181450139245}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cda522f4d97bb4d887da8814ba85f28fcfd5f6c9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.301331414189049, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07282835065460666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.24454894011699824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014458985189821688}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.5282893998840518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027239476703194553}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3281997540286874, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016842914784987548}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11241096101927037, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010296837842975505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.25292679532334217, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023135968641909677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.15250785714191883, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013289295083122636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20493346554385022, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011217276272122712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.44669665680269593, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023872195322647088}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.27576809325399554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001332292317195587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.20253099841039804, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013836573195200987}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4389119231936933, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027715255759945875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2720277049182356, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016797663432208906}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a895b179c752ae79f920039b58f67c0785f891f5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.391640493190723, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07696132388022625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.248561550528162, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014842009296995168}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.5361907676530253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026923692664488426}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.33316361794611193, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016721318654132375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11498051543992278, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010486619107346665}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2578870983510302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022815014259324113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.15567663749325128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001316099001704392}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2074081838133947, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011575590389617165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.45169489347455444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023905395426844995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.27881193769275936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013297414200118412}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.20657408552804282, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014132706167399262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4467577031168348, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027053770792765386}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.27704641367351746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001654188879883823}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a706e7205c181589fe7cad95df45c63404d15f6d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 2.0677053326576096, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06271134039957105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.1095707231662833, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021591140734212957}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.20802103992679305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003979629734649372}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.1395687520228349, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026646833399520944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.043273009183308724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009648056549439121}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.084551914403829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019148621747591432}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.05547805696954945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001212976335572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.08666935739550846, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016576469738929047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.16514211361615944, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031559470820256225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.11040718684668492, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002066401071200146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.09469111770984258, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018586644683897386}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.17996098817894363, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003488975907287554}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.12059646907618253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023117453399075457}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61fdf0b5a34fe2616d0e7233099bb245c599b811 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.776339014367035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08648488550482673}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3689526387645257, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021333449754014004}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5958464926870726, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002476964048840286}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4455306771983514, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00201463745384298}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16443602410248034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001373572653978035}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2723125733003977, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002120174112140588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19999728868621525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015176461844321897}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25666663991049066, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00146996338508828}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.423240490787575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023419654403491713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.31215943868340223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015312730015625377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3062178505891934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001925082430561453}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4957583957629278, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024755678177244595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.37009049888168155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019204542565398291}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7cbb56d158690985d934de9029d2b481a2482136 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.956869478198272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08706326939132922}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.35860576976732106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021204385851447834}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5877143657823972, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025365074511194768}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4357140603298011, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020471026571017327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.1610875918591629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013996599831001433}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2714598427748779, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022353847705593125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19727207437417654, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015716203234345902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25147400899840344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014714723008184948}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.42019871172862305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023315626390538035}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.30752457963236757, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015398007381628844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.30198591705675015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001941444722568633}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.49589524688238196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002550990119421592}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3671539275155354, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019677103053781135}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3915942719dd3478816052582544f3c3afb86c9c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 7.009191295391601, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07880823868019797}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3545298481996187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002080164442210043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5832652468397247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025434678918276176}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.43148059444634873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020169750963498836}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16017594753225423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014169660528143824}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2708031087813346, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": 
null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022903011042423914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.1964238350803286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015986177875609523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.24852387761819955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001485017547336716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4157032927811326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002345334628732105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.30412703782905015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015615998587158336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.299053998385982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019322475227694653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4926202062191711, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002581909625016066}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3640982827070694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019717220049021546}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62fbef64bd131342180c9b6dd3ce17078bfeac8d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 7.000198019935731, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06913407290704363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.35024781130506905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020285351396598593}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5776691917985523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025294416589362444}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.42692314967926737, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001988659000618427}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15717110741288884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013996429298285594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.266320618349341, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002283184726774742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19301601907405783, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015901538810711403}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.24489748566928496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014705299982066997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.41079255911678236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023718685348987417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3001609211346324, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015693837295482464}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.29636957517330675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019035502093200786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.48931428544569894, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025795333772663286}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3613177972378212, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019560581480454974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..08dc05807ba980e954eaab12ebae2476e2bf67f4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.9767120337505135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07800248993311952}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3507462370081385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002007947032661163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5785590837053719, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024745176796048766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4274281275013867, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019410392555796734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15691758500190972, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014061231147467989}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2657298946460687, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002263553464788525}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.1925812091012645, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015855872884769345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.24541799797892744, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014553330784770901}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.41122561852623524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002317891502947824}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3006040774381334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001530792855406818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2960131787945338, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00188423815157528}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4883301232467587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002513006325165267}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3606676599309041, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019100284921251059}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7883f67f8b9436f41a6d3376bc7dc30f1a9e02a8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.09769813350875045, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015529769050519056}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.24854008316454318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003822284525903826}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.13852188755221997, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021293763458264075}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01493502693762796, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000687168087320552}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03970637215388994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018488604775761157}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.021404604329843858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000978061112594587}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07997070021861699, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001165505436023522}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.204221486314787, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.0029072542956042127}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11349412170506364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015962350151948002}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.07895320097795921, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012925709790883738}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.20223663136840953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032525187208582097}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11211976668519616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017813295955859596}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7256315560298398, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.054268166719320964}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ff7f4da3765148a6ee4328cd155c8f20cb6747ef --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11318588173143941, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016141770639632976}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.279979949784874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037134168551931756}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15918378891591406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021744946842033627}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.013696212725928195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006678764989725612}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03434190326540072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016508020659010145}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.019363039161219217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009344774229308126}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07914360184018943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00103668752345318}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.19748984092630087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002485099284856001}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11153061433705051, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": 
"{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001401528481365617}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09056691007854044, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012802454821530644}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22609222107906188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030831336083997113}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1276700935379806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001741533787976053}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7100208650541717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06307666643323927}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..953ba1a9ecc1e64d62ec43358afa521f52cde9c0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.1160701827729658, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017439664665625302}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2860944708906913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004085639279103821}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1629955337480239, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023615726845261148}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.017447949224103376, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008032811020562298}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.04485569053060954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002114380505243146}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.024834458548132245, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011433201739787905}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08456270084600391, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011815898217163434}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.21064825146724378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002957758437750362}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11904854566150291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016169606046750717}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09152186527385546, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013729069191967156}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22753083497124593, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033757963655885394}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12880725110187866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018796612939592603}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.9602615670933534, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06975113829339996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..738e728ab9e19315d20298673ff7ed646b3f6838 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.1198315859407343, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021313635031272324}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2845973554839675, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004807019416971591}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 
0.16536255544133652, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002782591039843851}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.021995282393962437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009659870502554929}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.05477147588190337, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024254506123566537}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0308541164205269, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013433559082160224}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0891431800173679, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001518317618261569}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.2133363419019677, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035492022408266455}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12329381135423298, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002007770555031566}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09477716470429574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017017716066415637}, {"task_name": "gem_xsum", 
"prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22706163311880304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003999321665699719}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.13115287292645436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022631518633725732}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.213942331424008, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06983125587361878}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b96116819f621274e759992208bb96f68aaeac3e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.03617972980376136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022835108300555376}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.06939836439110722, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004267803204475849}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.044055315863565106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 
0.0026368956007959852}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.006768492713667434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006816043308723117}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.015027042518545813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014854463791600376}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.008953373288301222, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008618162172086633}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.027808549592201456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017616362314983645}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.05275279200412055, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032412133339056147}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.033494270940581335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019921856030734055}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.029343763312784644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018717600410582465}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.05584466177175486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034859449912314064}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.035448110651920155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021385233474543013}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.6027229140523703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11533577622824943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..768db3512e854a0ea3cfd9a1eab087ba12551145 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.004288164665523156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019144304746682786}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.0001731252011799053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 7.864942490494626e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.00033237715251661107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00015078706434605712}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis 
boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.004288164665523156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019144304746682786}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0001731252011799053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 7.864942490494626e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.00033237715251661107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00015078706434605712}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.004288164665523156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019144304746682786}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.0001731252011799053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 7.864942490494626e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.00033237715251661107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": 
"{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00015078706434605712}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c0631879b49bb4b971f03f039a9dfdd4b5505a7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.13056123327549546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00211653085075359}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.30744582623088057, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004345708260459012}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.17862356070756422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025190444667790237}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.031215859261324586, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011455957695466357}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0753592824533711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025928926122515596}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04267694660641669, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014272559106211188}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.10607783393120825, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016604953804639634}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2510352242981599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034186066493379894}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.14532363973135456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001945356869811387}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.10275001625696571, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017792008566591565}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.243889829015575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038153653094762778}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.14077407394066704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002139215859454691}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.5074370696814168, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06306583053985478}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_1.json 
b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d36e9a2bf164ec5ae07633872f3371bd7f4feced --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.137488328967968, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001854761861679906}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3409552422180664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004347354842228533}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19355866105950706, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024904888130049163}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.032790830434572774, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00109127458636922}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0841464589274841, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028228926118902585}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.046583919666909064, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015343154616235192}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.10887517550153661, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014322908116765737}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.27169151726551716, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003535082608448959}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15350551415370856, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019452968584178893}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.10904170474313231, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015480655523710234}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.27225025414903886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003786898530655058}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15376034054280813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021049152368113177}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.9294507192370047, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06927174390558496}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fa0d0b5414734a5896545d66f171c65a39451712 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14622869832638033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001840271628617099}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.36110939972212, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004317171977659118}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20561129767344294, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00245643585677957}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03579409350172391, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011122812276210662}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09230985292859638, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002925197294466604}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05089418045053158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015644827567191525}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11460227671669204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013793463537122247}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.28536047123610525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035023837864113175}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1614717754951783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018726384968348116}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11587559985347855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015497939952997533}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.28811845556237264, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003818249039685}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 
0.163228640222356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002103524354064436}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.0334951949441202, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0861569471135935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93cb5998e84e896a4b1ab55f03cb6e4b7b5922c0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14592947816165613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021611659038272}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3443812486249235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004671315297787991}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20045136953668377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002718747872622448}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03602991518922341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011524705369005208}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08958291369771845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028952310760534993}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", 
"rouge2_fmeasure": 0.05035265573315919, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015735834442296916}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.1140823083670884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016349867638249449}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2720512676890386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037892985318394535}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1571872470032892, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020963391106268445}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11426494976351477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017631615041811524}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2716320363223784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004008514578347172}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15729367100488104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002267612514707712}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.9326408991617487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05911662841116225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..39b2a29e8439dff8ec9608dbf2893f1e569e7b03 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.04775403576214989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029942578637535498}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.08769716230460457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004942918692076916}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0553051713195741, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030382631257604253}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.010407613906275788, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008600375403691117}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.023456621445988514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001909353475291992}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.013792609834707413, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011026959034726492}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.03800524902986652, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025635931438879765}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.06826516249123325, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003864253776835998}, {"task_name": 
"gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.042863987037903804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023336709067174025}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.03874591139206328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002614949735399864}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.06947543218015861, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003996515839046432}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.043714690958694166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002416760693299736}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.9888155443961298, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16386874187258033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1bd586b15bd80442aba7a549bd5e23e10e4b86ff --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.003193068638626207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009043418737167503}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0025114151662593677, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 
0.0007268593279769246}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.002720852866414494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007709104718885456}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.00047578207955566436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00021103651915959466}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.00041802798878270574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00018654044401303826}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0004211068403029026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00018200691938578806}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0021986780281657314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006286463155410547}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.001725510678386032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005031104040557216}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0018716501706524194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005361379726101486}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.002403627074679706, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006873477461862152}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0018880944258754403, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_recall_stderr": 0.0005483050471105171}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0020513446899695803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005879898596125877}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.9891071758683426e-44, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.981986929231053e-37}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..765d6d3765415d4f7fc61658a6f600b441a9b67c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14714404834539455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018923895784899547}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.35332808198249066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004277854613117222}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.20448305051370513, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024499517599214865}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03290270141635618, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010989269503369368}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08287973270989689, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002833032880998302}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04639360161894793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015361212037212923}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10929554255441323, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001387362394565063}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26469649902339626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033616815736178403}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1522143356903949, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018364801701113477}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11672705515751206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001546854617689576}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.28315180080173324, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037966298013545237}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1627172387554665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020740152357133783}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.8973746751821576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"bleu_stderr": 0.07374608873039461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2019e912c498911e4677ae8201d56bf76f0870c6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1344416388362992, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019127327308320389}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.33053933219336484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00435936977132835}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18885391905418497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025584792794476653}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.02919021495874484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001083579043412117}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07395642387380598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027833547930418757}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.041352212313184845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015256985849036882}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10283551822207533, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014281321188522025}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.25511273674645885, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034307162021354593}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1447818747001178, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001931305342910804}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10637101855903888, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015675521812090556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.26365396988763345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003725007857599503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14973596762001254, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002121716066142779}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.6712614198546765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08029090892350033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..91b4de860992f304b3f74beb3d1651fd693e8a8e --- /dev/null +++ 
b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14045160525988198, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018675726895835078}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3448073202698317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004206053313996407}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19729124660372196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024865996635105967}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03250319865239163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001104582027493848}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08285013633430294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028679116547561338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04614704751240165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001558612721206191}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1076421486644076, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00138815532226034}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2669756747819945, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033509483772327935}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15159285154298055, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018751369925771642}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11165266260752228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015420215686911798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2762348157117063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036302813408504828}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1571758204423751, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002082270276853359}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7895958187086474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04787428909067152}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..27986ac75abe29876585b6eb53463d347c0b175c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1397482550494329, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021614980521941947}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3314206756572503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048258842534713575}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19247676178691825, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027783344646224338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.032034815897723355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011234176352706835}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08021224571691461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00290937059770342}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04492583126972709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015630404482977962}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10664386827077653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016219261205998713}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.25469242016766824, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003747468811937111}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14710040508558128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020844906061187624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11038994072863482, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001775109847716677}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 
0.2637144833686383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004087246305150146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1523907734490472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023066662456799336}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.799886255448494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06241605343512123}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..86e43ee9eb15abf075325325f61750da6b997ff3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04293551268263482, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025300915700631186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08251909123638941, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047552295627311435}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.052109895215743926, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029325961974541494}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.008902116582691397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007882814434287245}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.020355209333257882, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018399757489791825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01203821663060757, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001058756941479336}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03291018535700006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001988611171780747}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06245660895956642, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036021201516783473}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.039428572953528446, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002210624192813922}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03518063964656488, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021155739741956746}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0671088243853192, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003920638757312349}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0423468971865807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002398760878941016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", 
"bleu": 0.912241430159217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1508779538575514}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f5f3ab7c6c2d676c6d030cd5fa8d4727e2f74c57 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002739968547331711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008278012212952025}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0022134599272398445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006273422111807437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0023684978392920267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000684399718764436}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.000376696230153845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00016337393027254123}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00030314025597044463, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00012776302473939982}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003256051958251534, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": 
"Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00013752393773970521}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0021741866307845633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006503820290684123}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0018275205470825698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005304151694392347}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0019272015023736237, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005653230599258551}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002299258100195655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006916877058747434}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0019069310038515171, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005514157386454834}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0020216316744952494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005921597049542807}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.097815638153428e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.018895149426352e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..587f50130c6e6535e4dd4aeb411e6d8ae4d357d4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.15006207382743458, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001987657336844973}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3454637430656072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004018757653016808}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20343816500554804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022810928631204065}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.027682031685392947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012572133596850813}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0647058637194844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023098436259221776}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.037047223818116884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012769054344671355}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10668249292401177, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015581101621468096}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2459446989428375, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029903868489755438}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14427043435851508, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001641927464053827}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11765973495308749, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017048462642842042}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.27281684754355495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034923333313323017}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15967622365586978, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019165619775066089}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.3629401242051673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07163139222356285}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bf8fbcfa50713135e11cb40c3ea395a8013978c9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1491508085903834, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018269519621024542}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3664832216419509, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042114762385522956}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20959848507445628, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002442544487345855}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03343924101857058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001064909350142866}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08498394458221385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027112134192700124}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04746052242779793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001497983093227225}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11001044604391327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013469472913403972}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.27255320287378415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003314920441365678}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15491949613982683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018314818566152655}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11751544431488214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015368040243137976}, {"task_name": 
"gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2910075794259318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037311617285632286}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16551777637641962, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002095777574542374}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.8090419671658906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06243960286799727}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4d197b20a70315fa365f53fdaeb4b78442a2e69d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.15135142269847585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018503020885729993}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.36992385810715606, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00428697893431127}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.21227676754700187, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024708186366684537}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03621955126854001, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: 
{{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010923924587675196}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.09215185931049906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028353381841421765}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05137980859666271, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015384305388393403}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1140477375940746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001355907302607709}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2809365376305075, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033434191813504033}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.16024351750916332, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001828916419211741}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11966566066238304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015275000983393555}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2949558808752265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037725507061813924}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.1682190350645718, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020795057799479827}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9591273158820803, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.044543479812057196}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..234de5252d78e207a8600511b0e0f9a642d5e24a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1520556889414388, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002189742662234435}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3562962414780344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004749166123906837}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20846450018241194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027571228316079694}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03687572904819467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011771782163178155}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0901761315164544, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028796580054493573}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05134777699396957, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.001609365371866251}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11417177031962449, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016577260631727522}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26888326992906886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037215896036639656}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15665407959636418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002098634389081189}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.12014388751084557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017911422977108812}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.28401937552529777, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004092634223313844}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16513176347393482, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023093147777262708}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.0596089820557593, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08930377223048487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..7c0b45c12acd83029890e3a3dc6a31da7efd6287 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.047023957094332526, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027799550540700086}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.09027695540385026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0049774928615611445}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.057092747254164154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003080646206696148}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.00908866042073288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007468809595545078}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.020385013066777098, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016479365749736059}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.012186294567403028, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000974923421275362}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.03498091370776939, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022036086527599703}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.06597700098824039, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003697132762313373}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 
0.041702960752974014, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022604615815070856}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.037710892572660165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023299806643745616}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.071991225804228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0040607831831936175}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0453763496307077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024814034302527927}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.9295235714400739, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17952600712489128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a5cfeb0edbb3c0bccd54bd3913cdff79aaf87196 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0029488555074161714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007930006513468652}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0025816173879792048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007065729972344169}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0026690932382273276, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007150299022018968}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.00030619737889438225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00012483905412694334}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.00024479974951673066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00010164474700168012}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.00026998206537521717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00011079170761441466}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.0022740359120988827, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006037958962838299}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0020309527255223335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005682477636303094}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0020707423522561583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005514160671606608}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0020831557924846085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005457471442594348}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0018757125016714748, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005287065797899205}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0019001191955137153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005013736076589153}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.3118412132647612e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.142891951955452e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..731cded14b63ab5f4ae749785b259bed9cd33b77 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.14761415510887813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018460271861890638}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.35644215533066814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004313928291972872}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.2056827728723364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002442623835139831}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.03437672503730759, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", 
"prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011443426120385147}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08689221376038479, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029525081998479433}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.048478969239621084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016015359697998386}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.11084459960593879, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014045626956033276}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2695965057726157, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003458083185065001}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1546552506639713, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018693227706026462}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.11575748816387031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015363157261191857}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2824907234086633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003845732411905414}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16171184786817824, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this 
document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020719541841424964}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.9932437983297846, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05005508121283258}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57887702b6a16425937b40f16121352bf1c6f405 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1290438512870586, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001908907445184339}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.31582870770710825, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00430942059053028}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18107269784715221, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025610519024537653}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.026404127167916132, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010432396448414494}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06561202026843391, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002586868170319681}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.03724797091171915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014590126800901643}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.09933917048841584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001409747027873095}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2451300217584245, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032869960015931565}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.13967411856886675, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019003725867380186}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10225509404962359, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001564036497709352}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.25220739881743365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003637572454499514}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1437650081283148, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002115707145967448}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.483994717023367, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"bleu_stderr": 0.0583433944934504}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9da2226fadf16818d6ce26ae25054e816f0aa817 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.13513852151062536, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019145000289811317}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3276171253130558, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004253252892642702}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18909993339896647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025443533012134306}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.029483119025250706, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010991930679911828}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07320552894804701, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027333394649254345}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04154990869291492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001535468901684112}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10394340309946604, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014280355461601787}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.25410078113161033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033207296201257777}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14575285141182545, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019148076940627124}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10579391735030412, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015362690884521812}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.25908485264591374, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036064198067757756}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1484681379100036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020754922487701485}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.6369889601746683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08510169629278132}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..85e2c6785f004a838fc2924b20a52384a1eb61b3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.13077969013065976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002174976181761563}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3051790928330733, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004806410522034264}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.17983877354806166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028344653807296284}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.028769652888890242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001066351955086345}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07052778069631396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027420302852526044}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.040167379424322004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014918819374576107}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.1015239172914668, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001644583430262152}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 
0.23886024900651934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037908298592866764}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1399605239628858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002175089499821927}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10201260281036947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017070040757652396}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.24078075446433067, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003985094366002531}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14082010524518954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022782275382841146}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.6288699845696568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07351917145628349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..286736c367f9f97148d7cb9abf1d38417e75b522 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.03885731681664587, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024465188352010866}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.072218223256745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004400630629119156}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.04710206848660278, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002827851220284823}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.008293369893348976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007674824532485735}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.017409607367553894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016283584197287304}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.010750703825901103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009638910126157537}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.029640111134550683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018709888056480106}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.05506280693983692, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003373140482326596}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.03580387652237057, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002145060472538106}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.031047920466470008, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019660483285081367}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.05761387256991218, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003550284822808244}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.037509929346318714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022590153287918896}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.7016713676020814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12854976408337526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..07931f8c2c224d153784e5e411dd784bbe52e18b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.0032875929102344197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012833920426189144}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0003777789691102385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0001360677657980004}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0006709563634213615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00024206325506310295}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.0030017152658662095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011845885037617687}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0003434736517860532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00012248298015115857}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.000609696868199602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0002177019049863885}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0030017152658662095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011845885037617687}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0003434736517860532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00012248298015115857}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.000609696868199602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0002177019049863885}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efdd33e1df97f4024224de53831eda1c77c7293c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 8.293650196551072, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.3291933211979279}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.20985881282206706, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005678392457961964}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6760010324649197, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": 
"Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007109960469106136}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.26518149573262756, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.00598353213568122}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.1581319955963504, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.005136391219179097}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5233448090942575, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008115996413199544}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.20168519353681674, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.005583925104466641}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.2034802697348527, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005608894909532193}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6584424374867024, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007242236248876609}, 
{"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2574325851884139, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.005932612011453031}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.20469461558102828, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.005642747231844393}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6591156896691646, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0072783852307659385}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2586740738981541, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005969487363499531}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a536271bc5a171e110ecd8cf143911fbe9cb15db --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.9708485559775974, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 
0.34463867046580154}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.16692070561892322, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.004836046744236672}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6789384494865478, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006554803195463939}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.22252428799090185, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.005166919209222038}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.12325642385963735, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.004315397965867194}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5132917664492825, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008124723992810777}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.16670313208229318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.004842857551789808}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.1603231731890274, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", 
"prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.004760330118066573}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6553730557520384, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0068118148422351065}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.21392670473431238, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.005120862001741173}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.16247005134061046, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.004797762841990119}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6612423664976972, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0068194387242296725}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2165274990499404, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005151253631478041}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..db0dec17908e9284442de49277014c75dc55675f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.9957771713059085, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.3238947499660514}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.16352590100073616, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0051809522811463865}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6961503957283254, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006309156641782605}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.21337595032188442, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.005144022732419237}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.12411789818339344, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.004723425320608877}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5262901845589097, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, 
"comment": "", "rouge2_recall_stderr": 0.008088441034716209}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.1635128839739126, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.004978071911619273}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.15682105604362034, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005078191461050391}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6706778746147497, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006640724268201555}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2054038225613471, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.005126617953639098}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.15931759159992323, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.005115274381138}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.680473606884987, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006549559772955488}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.20828419170971285, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005133723055482889}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..45089674b87bb532f423cf74579b3a32239eb5c3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.990071211182333, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.2967253826874956}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.15871004605077652, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.00515504545306226}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7034466308669174, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006207428258406954}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.20922245282570967, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0051976113847955795}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.12184134221668164, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.004666177846152433}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.535375224176284, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008001057333235266}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.16228271943343794, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.004992233069443837}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.15189168360996885, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00504068577523969}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6780782501114493, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006547450748075444}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.20121070269350022, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.005165478141129265}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.1543756601976579, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", 
"prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.005083274387419325}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.688408073065803, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006448637747839622}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.20417200076191475, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0051750936785322785}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..77763eae5929be4fba93dda60cf96fbfc01d146c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 7.240744535616564, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.2586062831369915}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.17254149538622868, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005654525592751173}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7169800690761489, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, 
sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0060374304726635245}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.2205327621041109, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.005523926029284187}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.13569869881237576, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.005133251060824273}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5506147150470818, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007924999642538401}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.17392590467314775, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.005274573095809241}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.16613299736339213, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005531569846691979}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6938260684470078, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006371003512990666}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.21292785321625532, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0054688898695651805}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.16847957385780346, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0055567140187079665}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7040155922937625, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006248279598171387}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2160261905713099, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.005483109104579196}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0828c49e02f8aeca7a578b2f24719aacb6b9f59 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 7.501952156457839, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.39271130030770846}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.1907119925251693, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006242576921723348}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7229868834154785, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.005996824073230608}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.23639707996315365, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006047125566678411}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.15227920059782085, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.005751542723918666}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5588359253387518, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007903814632285042}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.18898088840112187, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0058122997504793505}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.1847082573134137, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected 
solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00614731549204923}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7001578119554788, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006345471739520215}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.2293218614443035, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006023695302966697}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.1869491193721871, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0061683914136742134}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7106290738998889, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0062144876381464535}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.2322126006841029, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0060236108326316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..b095afd96d21073db152bab4355a6b83d4cb9634 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a66c4f6ee2a95062da06bab0e434924f51e66f8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 
2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..13573ef6ae890cd752f8119e135b5ff9583c12ac --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5108813928182807, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663061261117748}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5108813928182807, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663061261117748}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b255deab12f97869063285c9bc5983dcd3456510 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5016322089227421, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate 
solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665762007194866}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5016322089227421, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665762007194866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d01bbc568fe9513fdec41306a4c246f7253f8baf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738878}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738878}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8dfa897d99b9719c28b57506c75eaefb1d02994a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.499455930359086, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899182}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.499455930359086, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6036d1764c7dbf60bb32175961ad0239c5aeefed --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1743540081034935, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010323609186140817}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.020414014144919227, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005848929509747042}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.22265270386663025, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004060591384362697}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.035178147983051675, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008727304917432003}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003228525003290761, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00016737474198048977}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.0383496959158395, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019867052472860103}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005612668710216912, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002723517397472548}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018559243344714177, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00048315200547689713}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.20821746365510307, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003757747342281253}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03213626849465759, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007335055493238853}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016991539384663735, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004971662800825557}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.19247194188686473, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037031262038145878}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.029311011598519118, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007339751223464163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..27b358f7b674453f1ff3df59b9c65b40aae1dadf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.174213094733269, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.02197451232409411}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.020403702655232312, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007707685830758424}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.214395939532725, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004149318989451621}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03403289853539769, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0009305982872464937}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003189729472705606, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00021737905154245153}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.037318744091622835, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00206867790744111}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005467793325653137, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00031728961152867736}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018933871870268792, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0006694389007203683}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.202411719381226, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0039026013675624565}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03171656567240596, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0008276871670439616}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01699166069756899, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.000659008917435192}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.18391954436557392, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037126321626407112}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02830120340253178, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007928221584109298}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eebee07691c3f2e1132cd9407faee46aa2bcc3ea --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1491691772441667, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.016463506105742935}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018743075278271827, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006585934745600448}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.20113968945623895, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004069048528819971}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03119020870747495, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007848115949067071}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0026023719861491207, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014730866868978165}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03304396129526227, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019097087061322854}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004589484275527073, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00024840528981741393}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.017648037515718574, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.000612945231693746}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.191148004137635, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003801093134569005}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029387188096479906, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000717400002936173}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015525163139534474, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0006012960856164704}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17186503534549524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003612669608221627}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025633495048222255, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006527133219539642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0884462a62ce1605d2c528cf5bd797108c300fb4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.12911173855917576, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.011212622020653384}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018353557289208323, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006164592823400039}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19679724157173895, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004065759031603333}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.030624519846734758, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008233916974397996}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0025089231703953195, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014314849944617542}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03367952377754561, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], 
sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020381019021143807}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004446131485933507, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002459591462609964}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.016850481271058308, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005360133862246574}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.1837392053172025, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037458785538945077}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.02816065241160017, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007063876575393347}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015093144272162332, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005171501945296606}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16891544570433364, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003653462651005743}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025134463700752695, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006674889527922072}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f9ec1bf4fe35b51ef3eb1482c52d26749d6b6ccf --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.11653255801457632, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.00937865353959567}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01764227588810449, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006403823122550869}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.18185072302335697, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0038480944045729292}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.029037401002642696, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007664957090889966}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0023525543276751216, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014914943547875338}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.028175098812063096, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0016880276568031164}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004018163648220329, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00022597062969223787}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01642546114280103, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005899392695445675}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.17098066504375273, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003570562801659878}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.027024334508787648, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006867914982061877}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.014394564094771958, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005138074352751769}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.15527067254188423, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0034206077082441716}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.023723025806798186, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.000614747502166158}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb63eae733ad06a5ebd67b8dfbb5533e736e29aa --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.10185172940494028, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007051902591792091}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018013100442129417, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0009190145899227636}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.17846455901442868, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003868171470663707}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.028066473296435548, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.000754026689838101}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002344978071093655, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00021411077145171528}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.02798475436220232, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018363804191775573}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.0037514567534632703, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002162496835966985}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.016632311383909484, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0008812891847948634}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.16638495652422183, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0035613541742668953}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.02582308765153566, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006671470754149905}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01508965511135499, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0008651573938015946}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.15460309329405014, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035069550811540557}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.023227601838740366, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006168362129579054}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46926a20c87f76235fa76034366219b680115eab --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49510337323177367, 
"fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..43930c0d37406e62baac76c6c4303dda94a1a3a8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4967355821545158, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166557553076037}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4967355821545158, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166557553076037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ed44976c55cced42ab461c61bdd7ec255fc4228 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4836779107725789, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011659606710151779}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4836779107725789, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011659606710151779}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..97bea87ab701539c5987cfd35028649db2169caa --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4766050054406964, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011653047155927788}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4766050054406964, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011653047155927788}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e06b64a928abe33642ea7debbf514eaa13597142 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4896626768226333, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663330673075898}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4896626768226333, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663330673075898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8bfcfa23c73dfdebf679060d13e4ac7b6e723796 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4885745375408052, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662778026451675}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4885745375408052, 
"fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662778026451675}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9288b8e4e79aefa0c616aad5957021103dd72aac --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5609357997823722, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011578865649321299}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5625680087051143, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011574126069682387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57b395e9c4246b9fdf65d2328618276df8fd31c8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5680087051142546, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, 
"comment": "", "acc_stderr": 0.011557407210100255}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5674646354733406, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559142916063143}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c695832ddd79a5dc974fa47de903fc31a5ba38c3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5516866158868335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011603326108334514}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5505984766050055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011605936624156083}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb5c86700850e7eee4f3f2094a3671276bc7bf95 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5625680087051143, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- 
{{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011574126069682387}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5625680087051143, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011574126069682387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b03852ee0cf4b7b00c3285dc299d6f1ccc01cb0c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5489662676822633, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01160974720073308}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5527747551686616, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011600659443292926}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..534e9bddcd06cd48eeab02e9f65f552bc297ca43 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5576713819368879, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the 
correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01158796354550718}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5609357997823722, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011578865649321297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8ca97b2fe94a6697a24ed569a01378f965d739f8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.498, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015819173374302706}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.465, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015780495050030156}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e526f884e2b4a3216960fea0e57a259f78e7918e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.65, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015090650341444233}, 
{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.621, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01534909100222535}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c8fee66c327555f1bc02862700d429c6ea93150 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.664, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795023}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.649, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..21f9bf64f61b7dc4190eb1205995557733711130 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.681, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01474640486547348}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.663, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", 
"prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149550879186536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..275791af32eafeb7b25f45e8bd61a8bf67ae4352 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.686, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01468399195108797}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.69, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014632638658632905}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fab0c5d10c733b882994f880bf736ab317619521 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.708, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014385511563477345}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.707, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014399942998441271}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d329c6f2994b39a055c349afb0d792c5362b76cd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.862, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.010912152632504411}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.808, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012461592646659966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c62e9251004a7ecf7a17fc838294c507377d8684 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.896, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009658016218524277}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.895, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009698921026024947}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6174c07e14543cd661d3d1cdc0ca8f8f3477a00f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.917, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008728527206074787}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.905, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009276910103103317}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..257dba32b81e01891c3338ddc146b1800eb8c1ee --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.921, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008534156773333443}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.91, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00905439020486644}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..041091e08d0fad36cdafb3b97f8179b861443a61 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.918, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008680515615523715}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008916866630745892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6f31be34ce48d848725dc659156dd12b911d0798 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.923, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008434580140240669}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.915, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008823426366942293}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..87e5d0f59012d0f1f418061fe813884e042cbd97 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.422, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015625625112620667}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.395, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015466551464829344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1b3cfdc9c54e7ea53533f4f3078a240323bc269e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015663503610155283}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.425, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01564032031704011}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90b9d07b219a76d0637c0a5c7b17b4ff465e4e65 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.441, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, 
"comment": "", "acc_stderr": 0.015708779894242676}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.436, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015689173023144064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..86ef2c0aaa6d44938f043841c13d119bc85bc3f5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.481, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01580787426850585}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.452, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ 
answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015746235865880677}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f4cbdb0e1ac34ef2c0d6245282fcb9f9352df4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.501, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015819268290576817}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.484, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01581119837311488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..b04ed2a3001b510003b925831bdfe79650d792f0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.524, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015801065586651755}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015819299929208316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32e85a0685fd503f26488567b0bd9f29dce63c09 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.571, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: 
{{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015658997547870243}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.506, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015818160898606715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5782b7af561fa3f6dd28b84b77f8b07ea7bae463 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.427, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015649789644462217}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.412, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% 
set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015572363292015093}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2b86c01b333fd5d1ddd31b35cfc3ff2df741b8ef --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.431, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015667944488173498}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.414, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] 
}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015583544104177522}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..48b089edfc7a889564b9dd8d348b541dc3df840a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.441, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015708779894242676}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.426, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015645087688113814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2e1da01331dddc3aaefb077014c11fd8fc421058 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.448, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015733516566347836}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.435, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0156850572527172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a725ac4198094e4f45543c273e0528fa01d1187a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.451, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set 
order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01574315237958554}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.438, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01569721001969469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..082302336334e8527c8deda4057af72d6f619cac --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.569, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.0156679444881735}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.498, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015819173374302702}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..156467adf1b86a4d2c7e5d4aa43fc061c7c0c66b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01574000469338385}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.511, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] 
}}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01581547119529269}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ddd4c0eae924b6954d9732396c8662e97a169698 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.565, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0156850572527172}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.543, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015760691590136384}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ed960f119b9a8cf50306373856900875ae3ae2da --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.571, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01565899754787025}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.553, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015730176046009063}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..872c790ceb8ec2c10ceb2955701b03364667ad01 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.588, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n 
[2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015572363292015098}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.563, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015693223928730377}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6b9cfdaa5d8b3549a9963c13d533c39344725723 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.599, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015506109745498325}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.57, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01566350361015528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a225ad3b42f9a247f0a9477376af17a2b568b162 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4719401389631213, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544210396951669}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.49545697487974344, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561954965856516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e4aacb761513dbb95855327616070ad1d9259b10 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697235}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4927846071619455, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156122826464673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed22a708500ccb268f3a24fccea6c67dbb8c4921 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46018172100481025, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011525709570367521}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.48690539818278994, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558466383367182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..387d746cb7b87a1d4abb865a79493637d07bf1da --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011528611805439893}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551049647290312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..823e4e8af8d1ae1caa9a64430c343bd0802428f4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4607161945483699, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01152669031601459}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011541325320336616}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..95b0d9e26afd8f621a05af8d6e144398b7a93de4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.467129877071085, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011537420054210303}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4756814537680385, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548748301487317}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a3e173d67b87d697a1e4c2254dcd4749d7d13f2d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.484233030464992, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01155668204219638}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.518439337252806, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011554566910658105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f8767c44806199b9c46166dbda75fe1b02d28d83 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4794227685729556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story 
:\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011552636515221862}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5104222340994121, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559920087347776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..46790a29b34b9c7fd109309c47ac4ea893e62c0c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01152471548624065}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.481560662747194, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011554566910658103}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2cf1d95d1739870f4581809d2e14d63868e305b2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46873329770176375, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011539803085637733}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4751469802244789, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548139823074772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a032f54358e1afd12f86a031c07f4fdc375c507b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46018172100481025, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011525709570367509}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, 
"prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0115462348137774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fb1830addc161dca973850022f8e290de46c8e6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4580438268305719, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011521653168224729}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011527657726586461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6247e9787f3c478519a60a22a5fab4fdc974f8e8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f412f2fbbae584461703444fa38b85b71301c519 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58b105279b12f34b10bf5f5104fba65861854108 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bf612c6373d4f73f5a37a727e7937c6082f540ed --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..82d29b0983ffa3d29b5be97e3647a3c70cb5bff4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b76eb0a99654a3989d6bd630dd2751b1878d1f26 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6af393b412c3f1610f03c4cf7796b8667261a423 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.48583645109567075, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01155779233130167}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4991982896846606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156241738830021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a7febc836495d897ec042319cb7614e0d2adfa5a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011551591851683338}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.504008551576697, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562060664045727}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a0b11320bffb0ad75d4ca11661e182ea3660f511 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585206}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4820951362907536, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011555016408505476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5b3c7d109583ca7549a6c33e114f4687410566e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4607161945483699, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011526690316014589}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539022035111226}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ba5eeb9a8ab8a560ffe56fadc78c56b471c0de6c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4537680384820951, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011512899199863032}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011530479981182624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..697696244f8d1a6a062972bbc5ecb94e42c4ddea --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4548369855692143, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011515167912227987}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4548369855692143, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011515167912227987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..29447ec3ccc9d6ddb0d1fb878bc2d3ea9b82662d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4804917156600748, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011553628196999314}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5114911811865313, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559378273599126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bacb4cb6836daf9dba578567cfa54f2268a246f0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011539022035111226}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.49812934259754144, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562351329083266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..328e5f8ec8b9b9d5449465004cd68f1637594bb0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4494922501336184, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011503288699799179}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546234813777399}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c5fe1cf8f42ed27bb30e7215baa6d6a2ceb5d5f2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4580438268305719, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011521653168224729}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4623196151790486, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011529552555884575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..19699f5f3a8eacb650a7b111ec36bd3a0979d4b8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4569748797434527, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011519544865928062}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011536599118298168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e26972dc60a6dac0717b1deb3c3c8449abc47de9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.45056119722073756, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01150577173876986}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.45537145911277394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011516282203726655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed41c01804876571234db81763735379a86d1e74 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.516245487364621, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.47653429602888087, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..397662e4e035ddf46a0d382e436c51247f8f86a8 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.51985559566787, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba57645f8533fb3f520335e13af40c059e43ff54 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.51985559566787, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..00d6336674f27ac1e98d56b5a5c733e37e8cc7a6 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed1253b13e932a41179ff5f57820ea9415b00ca2 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3acfc43ba2aeb8565bb1b2f83cef50aad2626d83 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6fa75c5a41c89e92e63a6f5fec78271df2d82793 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fff67913a7957784a86f4a84d96041a70ed6f0a3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6ae4f36abfb542f5017d20faf47e0a4d58ab8047 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c3812b6d96bdcf91973ebe16d512ee48d26f4718 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03009469812323996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..367e9b91a6a5bc659516a74c6f122a8dcf32ba13 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4584837545126354, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ca61393c0f9a655f46473367025eb5a7ff37b3e3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.44765342960288806, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1c4dae8fb05ecd86db7c759481ceb83acf0fa9a7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..96abb74d420a935e569575efd7d7ab6f2acda28a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..da56d36f2e594804de573962fb9d7bb4b8911b49 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0d144d9669400baa4766937ff3d6487db04a95 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02993107036293953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..757574d80ed477931aac85244cbc9ba89e577095 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0ccc6363edd3fb390d6a373a081785e74918d601 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d6701f153f5a47386db2be872b040d20e1bd6c1f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..38bc56cf877662ad159a5c5be10a37a5a04c3f90 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c7a245d540130aaa11579d17f5dcd806b5e889b1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ebc8d4f7dea41bc50b36fae62307cf07841bbe39 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..14bc5d9b20e6972d2c97092c1f3b587cd29d7a2e --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03009469812323996}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029953149241808946}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6f02565db6227a4a4f665d6b42fce305e1d1960b --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.47653429602888087, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5595667870036101, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029882123363118726}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..576212193da63d7f9493de9d8fef2d181ac379cd --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..042673a86ca5797a2f047dcf431f10d3ba13373f --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9334155d1dc93cac3c609e7ffbc2e1d8814f29ba --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029992535385373314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5469e9a01341ab3c2a11351b1a2e370ee6e9cec1 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300523034631437}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8761cc9bdac4c94fc07855dbaaba412220a0a7 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..74f84db72ba36e438ceef3e767bcc9e91582139c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029953149241808943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a743a7ee8f2f4f7969d30d4abadac48aea4e596 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050170094497697}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2f2c3f0d1951a29483076bfca229ab765bd9106d --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915845}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405150083848581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..97163b660d62f7c52df25c8ab2cf4fc2e1ef81dc --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4846093133385951, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045826789783668}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..afa88d501f7a6f90b2273e20ca2e4effe8d4608a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else 
%} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225632}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4846093133385951, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783668}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc5c9fd1d31c645476ed82e728342761e700dc3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049516}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.48224151539068666, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014043619596174964}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cf731e3cb0b798dcae5f69e454f638bd46138231 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": 
"winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915869}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..42c881dd21c6c4249ac5c6f90ab24f604068effa --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051956064076896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a85993b024c648593bb05accd62288f74e572317 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5082872928176796, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014050555322824192}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4925019731649566, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01405090552122858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1c522df442663554354173974791718b314ef5ef --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.489344909234412, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.0140492945362904}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051220692330346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6eb2819b01f91f95268e45924e8111052889d15a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.49013417521704816, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049749833367589}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01405248130604952}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..29923ea09e0afb98edc781a544ab12f7e54a4057 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052131146915853}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049294536290393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d411a625bb65f59eaa20435f11671f8ddd41ae4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.505130228887135, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01405174596179051}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1d2352ea2b4591cdf7946ce30c684391bed40aa9 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824192}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d9ad5fde6d018a328e236352dbc13957d6180c1c --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228573}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405213114691586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6ab036ce4653d490147d4f305778872cdbb57a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405621}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330352}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e8e5b653ee3f7365f9e96e993494ab5047dd6501 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5240726124704025, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014036189665395136}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e49625bf367ee46908b9d70820a2b2cf6cfd71a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5169692186266772, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014044390401612976}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4925019731649566, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050905521228573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eac3ad199ab98d566486fffa44551981a4e62645 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5169692186266772, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014044390401612976}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140501700944977}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7c7d96f25c6c59ae36148aa54f07222a36381ff3 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824189}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..50d2e7fadf430f7531aedb24a779a6eb2948aaad --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", 
"subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049512}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367582}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7567f90a5266b2818422ce55957bf10ec04216ad --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405195606407689}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7967fe7a2135585604812d7030b2286c23d25a5 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", 
"acc": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b209bcd9c10c6d261a3474e6085c5016d71f1fc0 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790513}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..96930f2ff7781e76795b366365e36fb72836b061 --- /dev/null +++ 
b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5327545382794001, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014022300570434137}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_0.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d857173c3fc4c4a15a3f90454673d3aa4a4db137 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140519560640769}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.48539857932123126, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014046492383275835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_1.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd44a9fbd8228c86bdaacd060f9f9d94d54581c4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225636}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_2.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..12607483c0ccef317b25a4ddbcfc13c45a1c1db4 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529024}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048278820405621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_3.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fb888af195cafeabb65d50b9a1e600453bc6a02a --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045826789783668}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_4.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2fb5708b88c463c6dc0aaa8d100d211c3574a234 --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5256511444356748, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014033980956108553}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5303867403314917, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014026510839428743}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_5.json b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fdad8d7549a7595b14786b516de56f689fec60db --- /dev/null +++ b/4b284b21bc4/eval/agg.4b284b21bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5256511444356748, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01403398095610855}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5272296764009471, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014031631629827696}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a780728285ed99915cf834cfe8ef44faad7b3e6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e2277fc60e7820909cd5ea7c090ce375276072e6f5b270e256de14ebe0f54af +size 16413248 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..271d58e95ee80ef1d64cffbbbd1838660670ed8a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d4b23cd4c4b9fbdb7a9740e00776630b818713b32e5b178f442d8f18cd35746 +size 10291060 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41ce3daab72fe6c411c6aede594b99da31fdb29b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea3cb8de49dcbe3d2da4776a8c03159a180674adbbd4e4516f2d36ada0e1110 +size 6067012 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..09a5709e817e082bb90ab2a0e8ea5a0c631d780b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d738579be2766124619a7695d2165c159f1146875c865ab0607d96a9a0a8d40a +size 6985232 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a9ab08b2e8bc6debb914080d012536545923de4 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d411c6ef8c82ceee68633866952b3fe59ad6172bdd194b38ee7cc34b82ecdd +size 7877034 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9a55c36934e5446466e04b4d0b52bbac4a44335 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29eb7ec955b3652a8837d84f291de659fb6760b70370ac48656081b5a2379fd9 +size 8790392 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7dea3d32593c3b36da2e2d8546683ef3114c43c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561f5cf1c42565c2919c7e83c3586f917bfc39cd0a20c9248e315cf163f40430 +size 13018880 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69b9fae02c9dda76e81b4bdd3567a7d8315da7cb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7304c2911d5f98e06800f200c95c1e589bb317391ceb89ef2589e354709cc40e +size 8155508 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d65696b43d763acd2d54b56017e2b128b350839 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4b87e850ce31b497f97b44017a759cb30727d530665d1856ef1251bef38baa7 +size 4317699 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..060485bb0e8f5efde3899f682a54af290beb5ec4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f39ebe6db40b6150a0d4c1c7da2da45e7957fcc6f0a80c2706594dea93da47c +size 4935683 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7046c1885b92abee0ef59b3911d2a7e5944973d5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8712fd89dcc5539d461fb3f54cfd05efc99ee24843c9137e4745ecb293c875e2 +size 
5626170 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7687faeec6d1330c105d6636c14caaa4bd6ecfee --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96ffc8c55be433f39a2bf003387f26e4194f4f15cacf479090e0c0ee958fe0dc +size 6331248 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8edf969d40e0fa5fa4f3cbeb79965a019a186bda --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f18dc4a814a5f01c704a1d0324195a9f35c3b96c9760868c909c68c20249483 +size 19576240 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ae30a2401fceded6f422f11549e4be39ee93d27 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cadcf1b607e61f2c84cb2cb7f58bf4283c667e69fb055a848643c6c22d9f0d9 +size 9775994 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f588adb48eae0507e47724e686809684b208fd88 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779370296ef259bcb6e7e14472e4edfabceaa9a98b2d45459872306664c6ff52 +size 5431945 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f4681a2655ba97739b2c3b769318436826a8aa3 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1636a8187081b0bb8efd8cc8f8f97b05709e0d2d14b0ccf38ff2a54a905456 +size 6039581 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f947b2e521ca420e893b97d6d4b6970b19815a16 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c42adf1716d0774f841f833e31b5c4e3bc1a5e7883f405a1f269eb05c4bc496 +size 6597217 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd2cd419cc19c9878b9a1c105b242282a991f821 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:390803cb319f26223183f5e394c92f178645680c5c41684ed9de92cf2afe92b3 +size 7186433 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..151ee14c2e28db4f10823b29c5f40633b43be3d5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07cc1f61ec3940eec6295e250e999936b59bc4d8369bac1bacc6572618988dab +size 18179532 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..052368a5b0e20807095f652117f3e10d47b1dabd --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93ed844a3ddfe4cd8777ddb77a4ba5bed08fb176752c4664f545eaa27e7253c +size 9352454 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11c90e021009799390662faad590147167396fb1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc2d7cf843b4eed4864fafeaf2cd07d288e8e0351237899d61dc6f46d019b0b +size 5475200 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42da0dcd07016befc452fcb900b226927d1cac91 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:020b786aca22073a99cafb547f3874cf7326db1344e0d041d8045e14ab321311 +size 6255681 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ff858bf0ef9e074a6e93e75caa01e81414c592f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c401edde1fd55c201339759a1a0fe703afb5cdfafa58816efc661422c4c4e141 +size 7068876 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4845976a0c0cd8097361336c9710802eacbb043f --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0261f57f51251669782611f9333dc78cd2e43d1ef0f397daacc4ad2a2466f870 +size 7900821 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ffda9c2293e080c536323b959dc111a22ec6359 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48cf7cba9df8b0e9aae6bdb4fc9a0c4cbc55087d52f861944af2730029d7bb1 +size 23345304 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c70e3fa4c2d94f90c77c74d5f33133e617c0b84 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98b27f8ab502245100309447c9cd1d50b2467977d29b0185efb319ed03ad19f +size 13132636 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cee1ab7bba5dcf47b930cc24d96da3253b78ab3b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2919c4aa33a19f21d2836b40ae40d882c57ddd8914b0d7ab8c8da4ecb1c5d02e +size 7481802 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54116c1edaad0cba0f8cae9dbf762cb7b830c831 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db32b4e4d8e35093a5a7c3aaef8685dcc4ce2f3dc8f340a471d2758bd59f15bd +size 8802672 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2c4d6d722d2fc5b2859c6c109c468305ca0feca --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c2b526bc0351e182d7c626ac5f19c05948a5456a3080ab1414d3d4df6e8b1b7 +size 10191709 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89155ffe8270c1aeb439e5709fc20379836a4a9b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:573c3cc3264c77e012ec2b2e11757b934a7215c9b317840d9673d91d2c449612 +size 11554679 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4815770755ccb74ed7cdfe5dde64fdb2ddab38ff --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155d515bf8a89649b7a34733c1bd65b4780363d68bcd3b2faea60ac778c1a3aa +size 15816900 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5c08b2e3d5941872d6ed1c894a9dafff0e58dc5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ecf2798215638c46ce486ce0ca564633b423ac401bff907619b6b83382eb21b +size 27040256 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18294628419821882cab04ca380dedb77f9af568 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53060e48c8b6c46f0c5e4ac24c96f04c6696cc5475d6b7bd5a38fbfc26032c03 +size 38333350 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3594be704235c0fd628fa6fe2b5fc69049d5be42 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606b69176918e54d5a7a0ea0ec4a2aa4ef47f376d40946aa0deeb29b0485473f +size 24663800 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6dfb701c91340455e385d7c947e1d5bf306ddecb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52511ba3e76c6c72cbe2563bf52e09b7ca9f76c733f9c709897c36ef8fbd6b86 +size 29886551 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..daa2d5ef09a6059f24e29809eba379e837559243 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d26b64f1900e14997488f60ef403a2b6098b38fca696059caf0fffb7f09249cc +size 35292437 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..ee377c2d95876c1c6a9f6996a4c4eb1c2f1d78aa --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4d51b8c431e6c6f514655ddd28417dfdf873c266f83b79e57067e5bfdfce55b +size 15761726 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58f6e0e59816b30afc080a1b5335c04dc40264a4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec19d6e73ad2400664b495eab69d558a42b1ce107c60dd236365e7560949491c +size 27205378 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdd7e8897568b985200b0471f6378ec3f305ffc6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7934f7fcef28ffeca6548498109b8faff38e940ebf29258b134de5fa24417eb +size 38626638 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d69252d30d1116110dcc37fe5a910e439d84604 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9048b8cc1f75353b18b21e0aed6a00ca67932e8a96e43d916b20fc937d4695f +size 24811700 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..add303569b1d051527a812e7f2efa4785209cd18 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d45f752ebda27c259e9ada2dfe59acb7d376cef32a3ffc1c415a96d7ae74e9 +size 30021399 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a766a453889459e186c1c49885d5867d2e58aadc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822c854237969cef4f6446ecdd0b5c5ea2336f7e161178beb065aa1aeba504f2 +size 35438989 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9c8a35bb3951c703a5df6e7f1f90bcbaedd525a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc4f923656055bbfd271e1ed4fe3e101aa91b45777cf326ecab50698debb295e +size 15786358 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3981767b184abdca0cfd5645612d6718bc0e123 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7166a4c6c0a2e9860628d8e106f8bbda40f39e57426e9829588daaf68692ff89 +size 27389218 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98a373ac555fac5aa726f4eb574db5cc75f505f8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85f60be5ca996272e20bb34db415fa10775eb80577e01dd3039cf6cfdb2a07e +size 38738864 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4201665358f0e99e96f4145bdde5b161162dcbef --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe60e8585ad3de96f8a6f1419989a79198bc16f2fce5e3f8a5c88fa628442f3 +size 24848081 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef525cf628c557ffcc32003ee60c672958cae71e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aab0ae934e4fd40975aae03a4e04c51e614f1df06e90d99c79575782bfa3a41 +size 30106568 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab9130d8bfb0629d6c51d4a2fd7d1cf1609313b7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9873db15fe03ae72d53403f8323926c876b319f5a7a6c90841a1c22bc5c10262 +size 35577754 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71f8cfe94062e6205ded389388e46d79781ad0ad --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b9770b445a221cdbe0e5f5ff34e9c2d5006d4cbbb05f0b9b8200d588f4dca55 +size 15369166 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..084bf3e17e06331f303eabbe6e079cc26b1a37e8 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9dbbe16beb91a806bcaf62111fb6715b64cbf5e6a54f7342c7cb4e77ddbe81 +size 26617244 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44d735c1b69b5e71891c935923d26b787a022027 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d61f220ad1f74e9ad30d483bedb06a462e5cb3137deffc0572b57c94237ed8fa +size 37633782 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1368bbb9e72f8a2b04e996826016ba7c2c2dec86 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ac5c4cfe06d7b2647468ea98866ac883670954555a778f3b5f82eea57edca8 +size 24192688 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..faca9271d019b767f97345dcd65109958b7a526c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:108193a154dfe410f29c1876259f7605ed421920db46f2fd05e05de0dd870b6f +size 29409374 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6cbc2113bc8b78e757852a43052513b52221e9cd --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e0abe97706f4b8f2a2946927a1f305d338a7535cd09d9c4512602c2200656 +size 34788440 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..895aeaf4ed95896439dfd4b201501e0a34993f58 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1febf9d29f75ad02ac15b97349fde37a5039fcd12652ab9da6aa6bbf0a805eb +size 16519886 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..470e6b69af47c2aa329ff5fe7cca2ce614a95445 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe25797646ceac975a8cd256203e481a3ce3e02062cec34650771c317b0b96d +size 27911068 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c03d9185e125594a39ac14f7f3cb180b70f3c7dc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c064143fa3ee95195b7cc7af6ed22617549a7740316155a3f762e294af7daf56 +size 39410740 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29508b68ecf8b724d74fc15285d812d133d2f550 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59e364f90a4d2649226df8b9be9dccc82148de215f24f6508492f29e6727287 +size 25370261 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad987b2f4b221b5058fc8276cd5b518a59906544 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70bb8ad064589e6af3bfccecf5635b49fe39eac0590d569289c4f701349abe66 +size 30837348 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0d8b85677730a230595cf25b6345b9831be2566 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a40fe2514e8c04a88848b713a6f62bb33fe18ac52c42197012641d2d0a6c5c9f +size 36469228 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aeb9ca9ec4c1451eabd8720094efe8488b9af0a2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a006ae746ad66a237b03e0f2ad24616c8ceb03c7c58697e08d2f6a22bdf21c9 +size 993123 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0fb8962d1b8a0a0facf92731cbeaa5140d7238d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9b66a06cd89074ff1292069e23ff2feef05049cd3cd4017458abdc944a3935 +size 1451958 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6756b81614897ce65f2b369af5e9ad702c3534a6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5933c87afa55a48b145f81710170bfc8360e4f38e998b26b1e12f775bc4b1c66 
+size 1910488 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb5dcffd1f4eb528a9d490479d3e6ff9a4ecff27 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:234ed176b45ba08b8471dd66016f26d9190c4757248511efdccce4383114fc54 +size 2367768 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe52efb137c1478374c7951c8bb37088deffc031 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e1f4326e19d12d00775c33d9e4623242a4f8c3a4b56591db58589d7adfc4a4 +size 2822145 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..824cccacfbab4545b7f666774bc5305269d39881 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e241b3ec4c03696a956ea8bceda0c7c91ba41181af7e7bec4a790e6d0a03a82 +size 3278476 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c11b0aaab89a2ed8b9cd8c0783479af1925f6164 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6946761b8c1362f040a0472d7d440149c05937495fca4dac64d2aefee9b57d87 +size 1203117 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19c6386f94dae543812e5c23641d6f0c7cdcbcfe --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ac36a1a8bb7b76b5d1aa27581426a5963cf57f257f358f754777c8e2ab9958 +size 2304159 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66a56a27bdf9ce764bfde877da6598c95d316cfc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe922e6fc2aeca9ac9b52ea64ba77914afc1d68e4aeb454a98acd452c85c238d +size 2853231 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b9958af587f46bd1037d08e6db14c69cfc15b41 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fa2ce0f3c02a6d5440750964268715ebff47b94e1653ca7adb450bd5cd2772d +size 3398880 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ce22a618df35de5243fbd201cf9325ff97f530b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a564957f6e2be2275e800a59a0b57e2d48cf3c69959baf9c8483f51388d8a24a +size 3946308 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba0dc3ff90f8d67183258aea7817fb9b33ea94ba --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab4cac9ce265fc3874455e2e7ff9311a57df15d021094c928ebfc4058b330760 +size 1007992 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c6e638e63437a59ad23e07c03ec615e8495f515 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9110cbee1a8c373d866f51eb1cc0e98e309b456ca50c0a6e8c35af49091de661 +size 1949007 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e4a307c3d33e3e5b07bc7abcf7c9df6014bf0c8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0217f407cf38c1f74794b53cecfb6e99b45823c22b3a0d2daf9315774fde362 +size 2418646 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bb277535795670dddbfabe1712029885a286c3d6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519f95800cc1597dde9e0b7435756eb0fd399e1b4ba502b42d9e2416d961095e +size 2885017 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf3c3f7f092b111814c706c503c948e6f79da266 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f17998f59460dd70e31d34d3c8333a28d8d79086f54755b690799991b439d4 +size 3353362 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e59f15a127d530ddc5112722442d0793db41b7fb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02f419cac451a1fa5f648facfe995e1ad60a7a58201878a2ad3a4011986e8850 +size 1160218 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1eae458bad0cf815c26dd7025343cfdc58a81398 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be9bebb5426f18df02d3edb1cf7461eab99fa750c5d8aaa6b3692b3f9590cec6 +size 1668645 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce53be2928d72ea68731a1fc5cc8ddeb5c7f791e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1ec5aad3ae200cb227d7cbaf32781f410648cb52af01123237438ed4471c5a +size 2176407 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..909f246fe3594586e6950db8aa29e0768992aac7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:618f6075fc38ad045c6dcd06a38bfbb0e8686c6d8a0ff7a1c15eb4344b6e6a54 +size 2682989 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcbdf4513f4165f5ea4ec9c31b3fdd05b1fb0fc5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966042106d1344c4e2f8eccf9adb38ef6b4e9e3f691e176c47864dd87687062b +size 3186892 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d745732a289f33e09dbc187d61d11d2401808eb2 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d26089d85ede3f900c1c1d72067e7d830ce0dae26ddaac8fc5a9f1ee3c321284 +size 3692859 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab83f1ce70f2b57cec12708622112c15e5b71d05 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b8650da184ed2879719df5372d73df2647578a14dab2a75b9fe8f928f3bb91 +size 1027316 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d88254eb06afee6406ced2d83af38c7c86364076 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd15cf9dd36f905ab11d15c1c0c9f2acaeaa9f6cada1bf0a7e802c04d8dd9a97 +size 1980077 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3617ed88ef6e4ae41a09c471e432763fa0f0565c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168357876a132f4d958979071f2370f7da685908fa2bc131cc969814555daf61 +size 2455693 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7bdacc8480112c3757bd6abf1c6548b38938693 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57787175c973fb6d62c0ca9c798432c832b2b569bbf09c475a76679cfbe792e1 +size 2928124 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca2fae8e33b406ff4dea5651bebf5a5b52188f65 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb82b80fb17b19b1c44f3125ee0c46e8e39ee74d908cba755c23068e5bde7caa +size 3402503 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e570737da337cc0d0bca4ec86857a8db1e11d857 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241c6d1fcdce5e3a273b8f24da0afbed2f2da6773413ac1024d18c5e445a9b2c +size 993518 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc86f884a138401693ae08428faac209e820650d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9644d349bec0e145df516ed195f251c3c0bd6b9b5328e6b717f401eabeb41fa +size 1447234 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab6dac70e549e00210ea0cfee0bdc368bfc3c594 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d170fb17a6455ed4f1bbe8d8bc8b6cb82ea67a92815f2e9529a64edf2d182c7 +size 1900517 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce5618d450e71cc63ba5d6e3711b10bb881cacea --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb51fed3b07fdc6382f4537b509cc1674cca98931297d0950403cdbd254e05b8 +size 2350377 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bcd67f7ca47dc7ebd539baa7c520e50fec502941 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb69afca3b460d4fd17ca918627ab4f636b3b31359fa03b2df11711dba2912cc +size 2800090 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc181e2d3f066669fcd7ed51c29237d1db8789ae --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880910961c39c7a0c276a10e42d72c83de36efe289232bca7a5672afc89d6ff7 +size 3250727 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4e5c9619f2b6c46bf6976ce4f0b3de4520e1d5b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd58f811648895fdb389ba2f830967c9a43f1e72220a2c2ba26953012707fe6 +size 1203517 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4c9dc22a49975a73d4651a6c1b2b9b80fb7971a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e103b6b31691477e9c973256c8fbcbbf428a9e01e573450218c12ef85b728417 +size 2294172 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..370ab486a44bdfc621298687b33ec3d1c3cda4ab --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95e3f8d6879788af4a654ce518bcfcb36ab996608482ea74b1021d6265b6261 +size 2835917 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ed486b9bb6b83df04fd43f2bcf027b6b62ef34d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0d3a8508244a007b36a0c57a5188e2ea86c63884a75f4714d236e7e52699ff8 +size 3376890 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fd30a140c82a587470867fc44421ceb937f1c6b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb9f1914126e21daecf29cc77e99b728709d7842bdba66999685653217b6e94 +size 3918745 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52fa07206cafa047f66fb3edae37e74dc0628e1c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b203a621b053317ae6e1ba5bda2e6682a0dcf74aaca1a72fda80ea051730ee7 +size 1008363 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e7f665d0fb1d679ff2f342d249e215bf8ec9ffef --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90615b588cbe96005cfa94dc5c15b61e57b951877d3ae889482537117f6b119 +size 1939292 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb6476705200279058c23787c83362f5a216e426 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a2afd2ec1bd513696fe84f076286d037a63809bb46a3bc549b9373b18616ba +size 2401377 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5f0c0330562fbddb1e4477f4fc9027bd9f802e2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02d8c291eaa2831ffcc4b04a7664ca8433852670af670ed2bb66ec9e64f0ee91 +size 2863059 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9dd1a199e987393e3c0a0dd746df8994678dda33 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66b40ca20faafc593f14fdea4ad531eba3703a20bceb2bbe367828244771b1c3 +size 3325778 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7bc2d886ab27be2e9a41e158a633770029a426b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08d832d488bcbd2e0c2df26e412d0b559b251acb037b4e06cd7d26c41c1be754 +size 1160636 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34d834bd00fe03c150229d612e16ec14bc4c1cc6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9576d64a6302990cbde8a26e581c99c415ad81f1e1d4a1db5468f997e0a1f59 +size 1663797 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e587ccda9985e218c68182e2c948ef158b85c238 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db1bca30c6f1b046ea05b849eb3507c3df639454d7be16e676c808e201d1bb5f +size 2165960 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c355680346c50b2d036766f0167e16f18025964 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa1dff821f6976046ad4f5ef04fb5ca06e16c4dcc888571aa1c13d2028367fc3 +size 2664999 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51bd77d071ac71b6603de87b7a8f067de4b6d3b1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec4b910db37c4d1ce50634ad424ff4c93a560802c5b6e91d9d412bf38a8a30d +size 3164019 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1aacaf2a023f7a895edb61df3e2cfd2eaf537c4a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ae0a8de042c8b6d7140e9ad437cf3381325e8feb3abdfb4186f78ac05513187 +size 3664216 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0518881283a9a8b1d4f17225d4d46ed661ff0b50 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70bcff0b90a7bb3033bdf21bf47cfb954c959e9e966bd580e157c1f495070c3c +size 1027652 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d5d18d1d436557d1c6be17d8375b4321babc82d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48d01ebd95e1586657717ed7848bdfc6f325f6bdcc0ad42e7a8be235a2d1b2ec +size 1970351 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..681f651b7c8df3a782fb1a8027fb12a0ac8420ed --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7483df0fb18e8916552eceab331bfa1f000d13c2f6e837e1bda9ad79eb62f8f4 +size 2438438 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b81cabc85320446acfddb3e1472010c46c9b6b9a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca5491746eff565c3410ecf3ed77ce104fb50f30d81ec4084ff29eddbc537519 +size 2906160 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da108138c19351c91f771e5c6c3ef2d811de3f1e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40bbd0e9378f72e1624d7b191af2c9d9cf20be87a973cc3194e08da870778961 +size 3374920 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b81c5d7afa5f9181e865b2928c4344de88cfd3d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b4f6de5fb8d04033092e9791b6d3fa182961b5ded6e848cdde2a440259633e +size 1169343 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a774edfab90ae41ab373aedc5744e502c8cbf670 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce1b91e4ad4785f4f4c106fd41ea1d3de086077dd9acacd0616aee03ffc736ee +size 1698921 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..889c8bc4658cd48e2babdf21d4beb163b118ee47 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb4b4abda48a8b335d203adc541f4cedd1b321a4aed99c3aee4943851d5cfef +size 2217638 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eed80b38b6c7b33593ec785d86445ddd704f9433 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a111f6a006d7ea6b5a6fd9c21a605764e0e08e5a1c3ee7bc77b57e20da89664 +size 2730980 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe485ad01f7f20ef2dc3663e1799c95b749206c7 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d926d07546591c0d0bd5b1998eecca28e81fc1b24a787e18de487e8b307a1f +size 3248458 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d00039466e3a93c81cc80b71b4170e9a5939c5dd --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03fd33120ffbe96faf44ac34dabdf0f73a5527fee56aac75164008119c00be90 +size 3777092 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3555a93378d94752e67d194bf7760712e1a6aa3d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccdb174be1e8b76049caa5d2bee4d049454c4f4a04147dbc4a13ca4dc573cf68 +size 1421159 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe3c0c77f0fd332fb2b9aebbc2f2949d79b9d340 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8436189e45f167b68f379e1b616cda810a1807bc1f8dbcf03908bfc6beea0535 +size 2061859 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f913b9f727115a11c939d54f2ef7612457caaf8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41f9893cea5bd297553c25e145c8980d8d3bc3ff399b3c0a727fe934711e7ee6 +size 2689460 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52aa56601187ed3fed945d3cb319ce80bc8f2fb5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38fdfb2984bb24b17f6e593d8101a9652ce5d6776d1d19bbf231cdba430e1b26 +size 3312619 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..575eabcae2b2b33dd982df29ac911300cc8e3947 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0f674cec2babdb171a041240b4d47966c118de684450eab1e35d8233be1fa1a +size 3939517 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..03519539bc2fd81a2d05400a318a1da9dafa9dc0 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ab624c4be8c5b36596d54a4a12350fb1bb6325f7242fe7b52bfcc9b27dbd81 +size 4577361 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19cdda097ccd629e9b3d9dccdf8986ce35950ed2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba770afe7cb47a1b26bfc8898959c1a2a20f6b28cd16451d8f0e77b088a1139 +size 1186719 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bca66db83b03382168f3ce882e9981771746436f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faaf1944126e5db11655a53946a07448e667b264f099da99fe184b943dbf58b1 +size 2263984 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ab3b6edc5cec6c819480fbd91bba61dae7de119 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a6f5630f21508d322b895dae4b5b77a8eea15cefe8c5e598ead1f331d793a01 +size 2791831 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9cdd2011f78fe6dccbe65d0d033f701c9bb7e637 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa803120c7de12a6caa3e7e7783618f208fb6a6c518023b3debc7769affff02f +size 3323761 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3fcac77a5cda51f1314147f95a7930ad2c33d85c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c7c3f8b794c470fb13e132339243952986f20d58a3a204423cbf887b3e1b06 +size 3866706 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b82a3fc2afdef8dce905ac6f4b6c09422cdd4dcf --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ad6c08ff771c12b891774c2f97f0e096e1f12c0b7bac454b41849666a8c7709 +size 1370105 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a8f5b15610ecb975510d391c700cc102efcac32 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78021db31e5cfbd5bcaeb92d7ae7b33f99f35e3014335613e39f7d2d5d02be85 +size 1959117 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0e3bdd096874a6a199a8b46f488d088208b1016 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f11e9343fd73734f96bc6c1a70b9bf6e02b02fbf4e76e93e9b31f3a213cf6d0c +size 2536930 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..884eeb10099145406b30fdd4c980df62f7724b8a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d55b84380e08393e0dbbd9b03f9540afc16736670d559c5bbdf9479652143bb3 +size 3109651 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b44d22d27c718ee4657bfc617b6defc7ab14a1c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d587693eb6c49c63d00b6948377ac5ebd363c6132037a391f6b3217201dd173 +size 3686768 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3711774d8d5bf69e92e9f581b9923a2c8aebb9c2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75cedd1b8e2963feebb0b01542ae86577d29f71ee4a577257c2340732406f3b0 +size 4274903 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b70bd349026f9ba7a9de57d9ed60fee22ce09af --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce38a7af773324056134b08fbb507ec74f59febfff35862ec8c9ae7a57d9c22e +size 1209897 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39933c2636ac39ed7b9cdb1c5a6051e44e115f74 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f7d1d6d6548fc5354424fbd538b705ba9ce9f7bf9a237569f921608baf3021 +size 1760743 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da2a860086bf17477c32e49c0c6a277f300b4013 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ef858edcbce53783d638e8f3e2ab9d498ae609baaf26bbf76ca9f5555b5dd3 +size 2301257 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..662d8724cc0828d0f88f8764f0f7facf5abf055f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d564ce3c201ad33e8c884cc91ed0a1e842733a2c351cc91a6f382581664104f9 +size 2836310 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28383cd663e901489cd9d5b426d5e05b3511033b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4fe6245b87896b9fce810c1e40f1a6895d49ca9561a3e656ed02b89604ddd78 +size 3375492 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..498d3d757e80aa150866bdbb495a6121391fa9f6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:551a54eea1910aeda4c8df6245127195666a1bdb6c7a56ef4cdfd58357f7cbd8 +size 3925658 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c713bb5fa34338fc598e80f4199ae70f7064061 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6428621d6a45a680195980ccc9f028fe283cd88108a84aa63dab2122b6368407 +size 1216777 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7a7b410760807add41b94e0ae85f4e52a3dee3a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1fc614e364349d113cc86bea3a7e9a52fd7b47311cd0e6f5a35fab0c332626a6 +size 1670632 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73ea957cf711e5a6abecfc2a39a4094fe76597e7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12bd2be25dd01d0942618536c6c653344b3a2166238b3ce65fbe435b5cdf3ee5 +size 2120084 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2215c23fdfa26d365b12e2b27dac105c9a252dc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddbca4049381e7313144edcb02aade16e1b2ee1ade9e8c00fe3a99100d566394 +size 2577615 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13b7c9457ff539d2f3c6aba0736df78fa3370ed1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea4bfdb83d6368b97ea30cdee9d208616a2eddfd27a5ea2c5c8789de2efbcdb +size 3027517 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..323d747007e82ad67d6891c0cb5b92b5d3ee0e31 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00bb5026777e5dae901ba70f638ee1a66f09cda5eabed784799e0e80547d3a26 +size 3479076 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13aea93dbce1ac96ceebc99ea92cca665d2f58c2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a85631253af78fd06dd97fccad0d3654df526f9364d20a018ecb465d2e65b43 +size 1458410 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2c597ccca977f69157019757ff5e95b3e561c70 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2923cc4357ec728ea5ff8d9bc5aaca8de2fd571828f3ad3b972b7f9f95515b64 +size 1960964 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ceda59cf84eaf8962f3810306818846410d6637 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:181dbfad1ce9d6aeab7d6bc719f98e0a491164e06f956e48df1544bc181c7952 +size 2457604 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0cbbdd27053c567ad71a65c68ef265eccc4dcd4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51910c1e4a54923ff374b937c8013f0a5a05602546133f677e288a8cd5f9db62 +size 2963610 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a05188363de6b57bf606cac312716b7bbe0e8177 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e4df533c1d9d2bf11ae4c352e3888a72a878cd7cb51e5586790284e70497f9e +size 3461550 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f62452780ba62c4d2a658fe095f0afccf2ae7173 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f958a026644ce2fdcf3ad9459d2797aa066aacbe1ce6dbb29a2aa1ab00793188 +size 3960767 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b23cb2898c41ac756601173d039527e5c33311a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6abf4e8163f6d539d5428b0610a504fb6b06af8c301efd1ecd7bcaf89d5077c8 +size 1505774 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..773ff494b6763fdbc5c094bb22ce36886c1512c0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead738840f7509e99ac43c48f2510b6cfa8619919baa0c8de11415ef32655b65 +size 2032610 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47b317768146a67bd5565424ac0efeaee0e5bdb9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34d306ed9412a2a6d724e42af8afdc428068a5746e494f44e7a5c199e2c63fcf +size 2553671 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..31a172d50f23a82481f489c86aa4b79217853855 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89786c0988e0d437fbfdf9b518d1eb059382e3cc5f88acd917ca351a68b92657 +size 3083767 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a66f5f5d8fcded6f0e16cca77cb14eefd5960b4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:018ce8dc3ed37f8ee0bba6bffd0052504ed5628d3959a333b3e2d58dc9f704a6 +size 3606158 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ff19b538ba7073e0e0bb5610295b36a7a76dc02 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf67ffdfba705b26557378434c09308074bbed2f4cacd31ed1d2c9ae7e1ea77 +size 4130076 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b728b572a9532c15d2d5539be96f696281129ed5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65041280d86271884547920f3ec8f10601886bd38b7a8a36279be27bc496bbf +size 1202714 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a3c24193443d785b513810bc18cefdf8fa12a29 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418cca8aaabd9e5be98808a96dae4dc1b0a7a9d97de20edcf6c81a5f28f07710 +size 1638992 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6040449c7e6d1074d57412e46414df45c2bc5fe4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dc4c49fdf8e541eb4403a5c2bae8117f35f46531c84602edf61089fd226ee92 +size 2070864 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93d3a32c19e9a98a1c18ec44aafcd046f7d7986b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:3065c38e3ab5a0e5b86683b6ec34679c1d9e634abe9247eb802cfc9af3b411a9 +size 2510815 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2937b518064a64a2297696c1dc6c8abf6eb132f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba3bbc236096ac2f59e6877eb0bcd18a7725151d8411ceaa3725ea03efbeb9b +size 2943137 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af504a11fba2cdbecc7d0303cc9ef629060a8dea --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fd7fced593e7d999171395adb9c277ebe5205482039f63bb3383275e514cbe0 +size 3377116 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..317cf408d8bb7123162a670c50f98e5b3b0e8e78 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93393fabe84562236572e14c25f062b83c0d731cb31785d2fc00f77c9b7b38b +size 1187625 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b699344200a8a3235434076662c5bedb03528c45 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349783b9eb38b3a096fb91bb08764ddc4c0e05c3a7217039febb64880ae5bca9 +size 1557776 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47c0e2e993d02598d18b5da7773d2250f5547a98 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d68ad58c6c8d0fa0eecf1cb4122b7a6fa8323fc39a235dfe6583a5b4c02324 +size 1922149 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2ad9302e84d458783bd6e100b83f7dc0998b054 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:434f752cdfd13a9dbe82d181a9da8f65ec42ebad355652e7f0a92ca4119dbad4 +size 2295489 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..140425a6ba3a32ac224993544fdf94d2c3dea1b4 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c3b93ea92833693ea26d6f315f011dbafa3bafb706134b033387731e8e3daf +size 2661019 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f03a8824ab8c2bd04f16bef284378ad6cc23a63 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f994acae054cf1754d0450894a2a3d9537d99b98b28904f5cae5c51090b12300 +size 3027951 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7bd4904611de6a8a32421b293ca9ad5e65142591 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860c9d0a35659efac31259b7db84fcf470fbd0cbe42441681770d2011d25b44d +size 2351241 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbf34c644e03764e226fd26544b9166bf8a96238 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403bc7549f625b5bca7e7a979e19ce45dc4436227b9a19571813021495945315 +size 3174164 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32753e7448ec9a341d56686ae46e892964d3dd7c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cda7d69d27c859932eb7929ee81f5d2f575c81d94be4761469ad504cd10b121 +size 8017892 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5242ea2ffe51d445a1b1fe100e6e7b9c0cd26ca --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:285f5a05c01b17cbb405e5cfe36f1f2560291c31933df05f7e4b9a8757ac6b57 +size 4831612 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e0378283cda202d5c7a4d919db3c4aff7a49183 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c39aa11a1051d12f2db29a30aced24a90d5cd8f5d1315af1a9501c40565fec +size 5662356 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9170156a551eb44e637f0962fc4bd85f9e7497ae --- 
/dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b03623b9095053a69d4ff3d26f0d20e1ad437781f74eb9836e9a2dd9c5127e1 +size 6494729 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48865e9117efbd43df531705645a7361476b53c8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7e4e006349c57fdcbba9e1d2d11c26d04701de3bb8e527f55fa6e4ecdc6156 +size 2746275 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2137daa24579e9d9532c13abe0eac3a193b0922 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7083b2aa79c69bc5c773af2d0e36bde10ec610ae5a5d99b06a9e5c324e493ada +size 3649277 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9b39a55457c6988b777dac93787edd1f742775c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b13a9819007f75087e27944b9bce282edf5aa3fdff40d1e6014ae1aede1a455 +size 9133944 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4c90c047d4299ca30cec20c67d32e64bb67aada --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:186462375a2a4936890d7b1cfe202d47b1ff12c44a43bcceea6cbdcee7b8e2fb +size 5470207 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c47bdc2a2abdca56fee0e72be944af729f17d650 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c9af223dce224b2d64382d05b32e349ad83bc56872cd995935ccc2a1cb94aa +size 6382713 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8a783ab52ce48abe1822c9595f9e8c6f7306d64 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2072b303b040527c67ab494be9c09edbb6b00d7e550e39d00b38e477ff95ef97 +size 7294826 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..227dca9531c6815bfca8b036b3f4d76af95b39d6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10edee451a725372c518cc97fd7651f43d15713c7998e9a6032c3fbfbba867a4 +size 2842762 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6105bd4f930f06bc77c21cc194562f92f97b75c7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6dd6a7c1b79555e3d55a6446a9ac3b9b015538202f3a8a4b639eca33d1975a +size 3795349 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..379f71e1f331fe1ab692163720f5fdf2bb3fb907 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60af790aa23dee81cf020b73f678a113f3312c34c31524407e20a5a3ba2ed977 +size 9525472 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..edfaa7fc8f5510c4c17eb529619dec89d05ee880 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860d20234b5707628696f735e1752003d9d1673ebad2c3cd648f0b0fc5f7b9e5 +size 5715589 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a6e1b1eb844001c6c6afbadb8c6de10e4808fcc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b763d29bec4bcab607dd8e2fc7d671fc21658a5c6075047d82d5644027d1b2b4 +size 6677656 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f88d477d080adc1ec609a8bc113dbb96f5852783 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9491d54bf4ea2c7eaaed4df46cec7d5550ec67d0de0315dfbd4afacb9ef8464a +size 7639723 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d124e0b4cf205d522952dc9049f785a306adb4ef --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4a278e6c2efbd545adfd85f9afd88d557abcb3d36649a3b05ca34ea5b6f3220 +size 2322732 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce4c8e9b12791e77858cfae4e8e8209dd4a250a6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23534447b3474e8659661224f1af369aa686caa7972c3752412d1abdf17adbdd +size 3110021 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea943ccf099ddd399542757b24edea3eb2d0ad67 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a0484eef5ab9e8ed8cbd8456801040632c868ec4951876346ba182e354213d +size 7818320 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be58a45ecd2dcfafa79cf0d58d3195022ff47cbc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be33c61ee9e7d9a67cdbfc097c6810be1293b6dcf6b073ed21411b5c0ca7c9e1 +size 4696184 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3dfbcdd54557f63662a64999f0193609a2cde21 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d94dd20c4322f841d1ba0d067d96136b3a4cece78b5c01356ea5b75ea9c80213 +size 5491290 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08546ab0efffb6609add00ae046d72ae2c4fc73c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d35fd8d83f8a563c5676b7a45494f2bf9f77116f2e57a0d13fb31b65b315754 +size 6288023 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e30930398eec5a75701a0616bfd8d5ecaf9280f2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1219adb4b7511b3fac6273b1475966c5dcaeae70c5294a3d530264cb8deb0f31 +size 2197063 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb51dc578f144645a2b074cfe0a5ee3d61311220 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7f4ef4e7ffd9f1555afaa4da6a2fabd3740f5aff703cc19b09cf9a729692c640 +size 2832013 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f0577fc1d1adb7d63f0c8cda2e0ccf5e2a5e4bc2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad2060b79b64f29164d0bf7f3099715b10416b342d6e0adbd93f73f90b3453b +size 6961774 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3a361cb4e0b2a56cf57e13755b32d41d05fe7a7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc9db4108a8f76937914364fb6548d863027a690c76b9d08de78f544aa2e4a5d +size 4115565 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e99d83a8df5d2d3ddbfba959262b7cee8f496f2e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a54eade2178948133e952c149ab9221d9ec07427f8b2426039fc0705fb418d2 +size 4759577 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dff2183434001625b6bebffd971e3925edcd7a04 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba15491ad9536f26f49567ea1c8e066e02a6a4197eba1c07e910a9576a259ea0 +size 5403514 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..edef76e155e7c8c6eeaac044d6940fb45fc42879 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e105a71f0929da093b5192ef8b3f5d93658f3511b37540a016101a5007a47ad +size 3641270 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4eaf399cee73d6f1b24e5b48f5759b13c3f431c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976a7803a74553fbf8760625f6d89de88d6075b0402e780dd566b91ebf647ee0 +size 5656251 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c83a95de323a72e1be360f73862048ff1737382 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75edb16c8374b6b9b7913061c102e6fe89cdabd14824eff2b312031ea226b215 +size 15386722 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45ea7178cbeb1827b999190d63af8934326cb08f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e279fcfbb2362e5333fbf7d575084424c8156ac27e849a9809c6b603b4561a5 +size 9727534 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68e1791da8246031cfca68ab412737dfbb0e13a4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908c42b84d839039e5e0d715d78f5a42ee257ae0f8b46350ba9ef8b1e912b8da +size 11768266 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2cecef038957c3ec8f46297c86e8630e658f5e90 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5884c039a3bbdd4b64738a04b257dba9f5b45368568d15d2e357ef964429d42d +size 13789890 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..747b62b2f416c2eacbf027a342d5148e9e3f4300 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eab1fa20505f04419d1a9508a5cadbc2e72a66e65c5169ddee187fe300ea58f +size 3984659 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf317bcfab6dbf109de0c802d929d4f8ee7fa810 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba3aa815329c27789a904eed74957557972aa61eae8dc7eca127f9e833d496e +size 6168074 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb3461a538ab6fd70ab2c0207101237631808278 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8181be0441627a5e5ffd2a9f4a333bbcc1621d5cb20f1fa6606e8c5c09a9c817 +size 16743634 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8ff6c51be473afee0819884f9b81e70d0fce871 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eac26825e1c3184812d9d93ebd2afd18257dc2c6fa3da4852ec9ebfe21f45f9f +size 10572887 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_4.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1cf69a4940fd3f87156474c543aa71a61b44abc3 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b879d86040e9a7c9e910f6dee5a3404cfa9c459cb235455856f67f78f18bc3d +size 12780644 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191e552dd2d2338502dfd02b2e60073b7b16cf3b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91dc42665bb413ae4fa2f7daee77a42ec359843a0eb7ba5a82175c221d30f69e +size 14969428 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16fc574cbea6161cdd7b5580f939c43ebfc2f4c6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd22fc63b76647d3a21a07824ab45dcae7e9be250c5a1ff4462a5eed7d6f8a03 +size 6260363 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2fcaeee70c5f5594f182bcaebdb75ecddc24dd3 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c519a3f26da16302bec1d5d5870bbe244ffd36d36480dea1b0e8f50485f84a +size 17001252 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..023f07a5da2e3855a10d78781c403eddfb87fe75 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baa35327b0c9453ca17adf7343f1ce51a599a3b376c558e088a72d76e3aeb72f +size 10737905 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4619299253d3b7822d755df4963b5d0b7650760 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a73ae343418e99148d07d362b78a39c473ac2776b56b7ab5b32124f52b7e556b +size 12981773 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..22c0f1acb3fe8a5a79a51be2832dd6d78361c6eb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05fb4a88695068e0e20434d8649fb30309c9ae30e4d9660f86690152b236de1f +size 15206581 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4f8b24cb7a4e0e604d776b2356420d0c19b9d01 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e72036b894de580acc57e93d5a6da65b4a6e71860a52971639eabdfc28940a +size 3664223 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab5f280874b6cc287e61484f18793aa94b11fa2f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ac1169b5a7709ef1da025b694f9b36ad18bac3e3c9295b825794ef285444bf +size 5687827 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..805d70b39d6c94f44da2d5350b69c95ac10fa413 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0c5e20a432b2db04ae30241435756932828556b2ca667252c5b386f64cf7203 +size 15465638 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1f2f1f07f1c61cb36f38d18b89abc180a865e49 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcff3d58db241ef3a06ba37e1ac3da3d2c5b3c46c0e5cf8ff4951902247b246c +size 9774978 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e0acbac33024a538f0653c4ff4626afd661751b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d010f239bfd5ef7a22d849578ec52ed21eee0c432f814373d0bc51e7a03b3db +size 11823719 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..816cb83dbbaafe22e225e367430a2d4cab938140 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:072674aea6f7daf10f355db7b8b26e2df1916da92333d8950b6b8729fddd2ed9 +size 13853495 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18b58eab196f7fbd8c9fe211f12fe41c51527c6a --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a83df4185b2d72a1d2faf688e05cdbc646b328f2a97f2a1a3f9488e9bd9df7 +size 3860904 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0cabc9c9130b0802f5851bd056d11a455f0a891e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb8af1324db3d29bc4b6f996bb23d15fe448ad9e6bef8881d0b7483547e9c16 +size 16282534 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c01eaff147a96cdd2a608acb55f51060de2037f5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f84ca77d2dfb233cde41e502ac6040aefdd7078e3e11a7e506cb2b3dc2b440 +size 10289329 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..35e5cb327d585a336257cf4045fc0e536ce67cb2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a00320ec1e0e4864d4ea52dc8e3633f74dd428fe9fbd87cac855c0fa2ebc45e +size 12443865 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e478909cb9edd446be0f1490f37ab96da7b57297 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f20bb0e1fa83bd6b91187d7f15e49d1d5de77a156344bb0569dc469cff6b35 +size 14579319 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fad440ee425830e35ef2eb322e1b6e21763b5103 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:491d86d7b200de05a7167ceeb76912a762672be774ceb9c76746a0cd4e8113c3 +size 55156 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..056e3d70233eb8d7630c11b0df5850f4b3e8cb3f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:f0fecafa6f1a8c4a662c97d97bbf1332adb114c5528a1ad794222fbfdd338201 +size 77972 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0c47b6ae759e149927c417b372e240a24986e17 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69483d9582d7ee864f95ce21efc3175a5c62d2dcdb3b3cef7553eb9adc13fc92 +size 99590 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..981da99afb0ceab64b708bd0607fdc5bd2d8d793 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990ac5f677ea0eb27b8d89f559e3cd0962bc93ecbf89e91bb81f0a043d0e5778 +size 120749 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4949a737f633d417c23c9c70618b99568420cda5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb1ff8f7a875910564898d5959d7719722c4194c8aac2243fcba84f0a96d9b4 +size 142871 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b279f1c5262b7f70676619c6ef0f99d6c013ec82 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac053b5e0c870fbf2664b3caa01c891dcaaa6e235b98726ec31c733c0cb4ec78 +size 163710 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a687f146799111f55e322d50fce459902ab6bab5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5652224d13ae566670439a9d934cadc0a0e662d1e12a47c9ad0eb0ee5341dd9c +size 120804 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86a0b5d17bf1522d1985bd4fcd3293e1b631154f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de0050d37d6b057bd20ecffb46e3df607d0b0a6d7545ca52f8d2e6c9652e467d +size 147052 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5fd5c32c157c80d061e8b7693e83253b6e5a77e6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb379a032b610535539770a59b70bf3280833f915d62f79ec3f5eec5c2498bd2 +size 174258 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b6714b77c91fdec9694e16e86cd950ac28061de --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ae4e8c638c536a2aa818d36e7307b5b64a5c055f5d796effc6c14f0ce34ed3 +size 200159 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..730baf81c4a7ab6ab1979a29623c9b1908aac90c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b370210d7a19ec98f3945b68e375633fc4d3a6faaca6ed336b5b8092060615 +size 56299 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b590f3edfb3ea225316a47fb5b492d51b06d7a05 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89b0d9f0fc97baa19223a9fe1d0b48aba554a815528ebc527f750fadba08a572 +size 102071 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb953996932eb4347c80093f3a5a8903a54308ec --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:733588ffab7fc5079eaee1c295c039e784e057b2dcc450eb0e0c4c9524d4c926 +size 123883 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_4.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..38bfbf1f98407e89c7deb47af4c12acef702336e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6a9f3609bfc023eec80dc9741feb77dba255bda3f39b6b260d0b669b8f3b979 +size 146662 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43d37069a64175e0b57530b2843fab14d9ffe3c8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00c345ea11059c0024325f2ab2d3299866e4b0fe3e87ca2a75d19f26fc14bf6 +size 168144 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..521ea84e97b15d8a46d9f92e766ebeb07693222f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e0e53ba9732ee69d73ac235d7b8d1e34d5f2ce606b9e986d6b874a3172945a +size 63917 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef8f7c7895ce79694da835afaba207c93b8de2e7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eae3c9430fdf1c2b99f9590117d6b9345abddce8b8d91f3cf4cb49ba1cdd73e1 +size 89648 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2886c3fd4cae32d5b7fdaa82701c4ce9ece5e5aa --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2c4f416fee6f140e5f1cded0a2e00470f46c42b84fc0fa4370db7dea2b581ca +size 114107 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5edd3a58357bf5d954b1d4b1c65f953ad09cd04c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bedf279b76174f9862f0276f09c700e76790f732e92ce125dbab689ace6f7170 +size 138098 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e761c6aa2eadaf48ac7e3b9bf6e1302ac0d422b1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb36d586126013d3359efd67ac9115c829273124aae76bd39801da412114d481 +size 163068 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d88c935115fdb12ebf37b2ae0cd3541bfcbbf31c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864b4d78a98950c9b71fcc8615115f239106831c7bf0a2f0daffdcbac0e59b99 +size 186744 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5021ec13720d74d0771e8e07c5e7e138e4e71026 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d26bbd935129495a703642c854e6a3f7fd840f9c084895c3cd042754402ce2e +size 57311 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ae50b40df2f2e898db35bd68bc47e7caa94f23a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf85544b324f306963354cd79b658b32758ce6f73a4e0d2cd49282bae8f2a1b +size 103751 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a284f726972e1aa8054d32e4400ca32f51b0022e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e4b622250e1dfad31d821930068b0b98ee94893ea0ee44a7cb5f91fd5ba252b +size 125902 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd81c3cd263fbfa8e2a9eaba3d4ad7b38499d818 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b8ef5dae5bdf3aa184e5d7fd85fb9c74976c16ed34a773467fe5fb3a22e369 +size 149020 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41ba7473a69ec73829fa81536b8d0980a18c4c5f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f22e3555fab4f1378dc53d008f94ec80bd664a9487fb301ef943f076365ec5 +size 170837 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65133e030687bac00443e0d9d1121547b288ef59 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37a0fde7c127e7ba4cfcbb5402041e5ea9c86f8116e64826741e1ac75600dc88 +size 92278 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9cfb546447e33a90738fc336a2acdd30c3333c82 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1402b8ad2db7cc08fcfdd975020dc0b7a614ced4144b05324a1371586022fadc +size 111558 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6873bb25e768a60ea84b207d5b89afaea8c7b5a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399daa308a870b7528cc8a2234f42f7b968c89ab2d79ac7d3f67d85726e8ef80 +size 132034 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af3efe61302c6e1aec1560ebf0ce4f9c93fe3862 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de216c74a1f74767c18759d2659b6bf38baf226bd000f12253257ef3ab4c63c4 +size 152190 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5a5369be535393352445a141cfd99046573ce31 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f08bfc33f7dbf4853bcc3e466d28ce204c3b06b82a98e65642fe8d8c637e478 +size 172018 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9196f0289d908886fb9355aaf941bd340851d54e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1bc338021ee7ea04f5dffd5c74f64704ac4e5b3cf630aa42b64b78c3191b2a4 +size 191964 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea09a4e9a724ad320b20a8441281719d331423b6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06700d81552978bc70aa3684b009d98bf0499f57f155c848070dcb90653296f6 +size 87836 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..d2eccb8a6bd38a0cf3b8ddc083ab39f0e40ede86 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd5013a7dc150d72bd3f62d4fb5659a442c1d25842f79bb4e883444ab6cc7ce6 +size 105064 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..357a21c02fb8853fc864bd94e426573bef9b1b75 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f66f3119b5d50568f1a36c930bcf0916cb6aad5eca937cf0c6cab56579931cbb +size 123423 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..785584f02c39877d8065cb7c6aed4ee426d7d72a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9974acd0cbc2cdeaca3a6de50824c539438cef6e25398d1c455e1ad9d320d1c5 +size 141430 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afa97c7a2ad692a37cda0a7674de167db397727c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fee93c515b9b351fb7ea374eea585c5535e235656e0e0974489b701c19ed450 +size 159094 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5d9ad2e631e416e5ea655718efd72a97d863232 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1edae65ecbe915432d100d1c76335fff17f41562e16669025de46b605322e7c +size 176890 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cec4902bcc78bcf6d6bc48a68cc34ea376d022bf --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecd08a0b78c5fe6df7fbd2a967964b247949c5d63c56ee11707ee158415baeec +size 85159 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ec0ff336b384f3f64795243cde5c7f440f42ba8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27340997c80a8687b083142fc4eae8ab1ab133e883c1cebfcd387a70ca80cf3e +size 101262 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d98decc9d3adabcd4f15ddf81571de8192458aa --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1262b8445d55f409be30cfc0eaf222f734e7f32593c021bf73be0eda1389ecb0 +size 118464 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66ef42cc08c923ed4ae0044819122083bd12e4cc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:838671df36c7c1998f860503cacd644db93f0cad5e3c5bad735b7a5f08bd8d9e +size 135387 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6db5e3fd7f63bb0b8c557211299360e6c63a754c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df14bc77b8f5922a81155823b8f1877c8a060cc1c039e7826c564d66858b942b +size 151933 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a1763d3a1c993bfd8f74ff84664887ea60a32b7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1461db775b776a4f31ee165a8b41a950bc27fc54c4b951b8ee13222546f905 +size 168731 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab2eb18f8c94c71c78a343f8a5da568b710ab10c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee2e8e7209fb5196a5dead98272a2f3fc330911111249f7750b1e9e8cc5ed22 +size 96743 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ab905806557b1b9a6a31675f04ab6094278e9b3 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3d4b52dfc1c43a905b5412b63ac8e5f3bfb3b275c5bba1161a18a5458fcbd1 +size 118253 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..126d8e368c786a9e87126e292a451b5fcaed09a1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8132d0f0936315e876fba893d82aaec48bf91a65fcf88125ea4b830c3620857d +size 140921 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..537fc313ce5be7be8cb9e8496cbf281ebe57cbf2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93bb1a305770c9ef6860118faff2a871ddc0af8cf48a6c01147b129b6b271b7e +size 163230 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..665e5d6f3dfb11c0a520ec9b1715e3b777d2113e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0a7fad968a0e3add82766f47b27782aa6b5ca661d2124ede80017d89565f05d +size 185182 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ae26654d1e83211d2731fec3b99a0cd9b10d1f4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99bba95d9a2ec12342970dd489ca3796f097f6a072218d2a202564ef70549f16 +size 207295 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b24af6032d7480e5c3ca7f55ef48508923458602 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6e846ad1604827877be8b99046749775d8c533c154bd882d885c810887a8437 +size 95774 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b5fe6ae3c14610257fc6f60a8588416ea0a2dae --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81c5811e254935d666738098317539f50a5242e1cc83fcfef00da82ff0b59cc +size 115641 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abeacaddbb9b7d3cbb8e1b7dadc1c90f83bd2d26 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57aa3ddaea31bd1d7e34d63d81ccf53aec2d867ceb9fbe8c0688900990c5f9de +size 136680 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..391ac672e6e6e04d079bef346fe4e4b594a61045 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fcab9211e52addd83a8ff409f81a959b587d40cd67676400d4a796b8ea8d218 +size 157398 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddf495b89642847b952974a16976a14525c56904 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b563b23e870f2960d48787da94f9b740a69ab57a32808c2888dab6917c07c2 
+size 177736 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3478344dc370b53d730e4b1a7ab3425627d41bf5 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe1bbea55786c241ad2f45b31c9716fd10abe1c18eee27e0b9c9fa2ea4085cd +size 198341 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d60ae3d1230cdbae52023ba8a331c34f4e9491b4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac6cf21ccea4d886fd591f75458f724b3f92694909783eda12e57b8529c090c7 +size 3485898 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91b93893cfb04731c025e7fb16666fae5da0ea9c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56f14d36ae0b74ecb17c2e0ada0098a164c303de3c83ce41236fb4e4984f882 +size 8934808 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cb06927227dd318aacafcd7c4c01f59ddd5a5a6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2976628ee8f759d1df4cfda70bcaa14185808a2b73442895fa2daf5e2a45b2b7 +size 5390836 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db79c900c6f1757d046f518cf8f74f93725f0825 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4853fa7c00c78c605d4a0e63ac1c18433fd5c97951a5ac74a3e6fd32445ce85f +size 6311369 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46e50307ead9eb9ca83026ee02380ad7c34ebe4f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77fe5ec93c892e50dd0607857abe11537856aace9a18daaa34531f82c0b96673 +size 7241082 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecca8a4bd1a6aafc1a6bd0090a937368d4720ee8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:bb25e3b6efecd4fddb1c0c0d267f1b6521bd3f214d08b2f4b95601363bf58865 +size 8183910 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72d6bd7451212e2690df9cd9237db08f0379b965 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82cf02af69ec4a40be1c97072e5492f79f68e541b405eba404d14ebe6425d090 +size 3366356 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3684796f9371e2170383738412d856c2bf575f4a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c772d024289e4ac586956b2c362586575960260f585a88204f53ea98f523c91 +size 8720994 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..beda18342e4de3e6e4642e0fa779ebdf12c821f9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb834dd1fe0b0e953f205aae91e5443679bf7a9bea85b54b7586a73e62ac25c7 +size 5275569 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3ac5e4779f271a1ae95b96c4e7a2aee8786e064 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5ca7bd0861e462119e4fde73edf1a154d7ef501508f5b8839e08588c27a5c6 +size 6185108 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1bf4fdff5b1d274fe570d05e9403e455593ca2a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2511183152c08b3b3368aa812f0953b1a533f7c97e5c63d26df6bec7d2dfe63a +size 7094097 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca528edea4182feb11ac3b45c6d487137bb44979 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab0b84031aa86daed17fd7fdcb06c90f7c5aa53db9e1e54f004b6c60cb75688 +size 8009409 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc9ad6dc843b168d913201448ef45236580cc78d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0bf91f8c5fa64e0089a08786848916133863824f27dddf16c6d252ccaa247e0 +size 3596555 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0431ddc4b00b3cebc51ef59fb8faada83b2ffa3d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b71790bdf6c38182fe5d1eb4c5c945dbb7063589b00624787ac1a07ea278cd +size 9482420 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1dba4f0fd8dc864f4d352a12d24e96919865850 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87594ca2e890b51cb59e5a78aa602e504b4b795ef8ee3ac0d12494d44ac56932 +size 5932698 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ad5279446a5834d228be87c01cf6693ea0c11e1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a4aa0bad50fdb1cedd73db05d8bf17f6908ae99a728a0d62ce3843e9a485da +size 7071550 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f06eaf5a899138f0ac54c611672fd2b3668ee8cf --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5edfbcab15de9cbf60ee21a2198b7227bebddf59ac37014f81b37571f8da5c2 +size 8180610 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d7c2edc47bd1ac59bb75a90d6df5b17396dbb6f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bf3389de560e1d3ed03cb349f8d1b9675ffb95c517229101baf211ccfaa05f8 +size 9279776 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c44a23e071650efe04738a3b709f5632fa28e43 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6058cfce81294a1c48015a9392c6ec11f251b01f706ad1e15e42530892f279 +size 4493008 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b59d54a9c121cb64b1065dc48ae0d9e3efed7e4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b632873011c2019a0e8be73fc1bf7320768f576f34ee67c19083f1292f52e4 +size 11159114 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f60af32cb11778b54b05ee7392200b4ba025f4c2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee03e7084d7c6772a0ac2ba4133bba5eef1bdbe80a998f9f2fb46d005aaf0e4c +size 6661634 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f0db7c4dbcfd65836f4081a7894b6050ab4e99c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f8a0fb1f98dc35bd0c6790d35a84c463eb116edf9b783c51242fbcccc4d88c7 +size 7744623 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ed51c5794c31056423ef58405532e333effa963 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc762c96968ae63046382f43bf63ea546a357860a6ce8e7110c1d51df06ce2ca +size 8826455 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea199601cf5b054cbc49dfca2f70bdd561d9043c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08583bb1539005d85ae9887443ac081fe1e9cf54a18f78888b4bbce64a6607f +size 9914002 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..325c5a1a8b88d988b7c150c93eb6ea9502de86cb --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1ccc048135e841babb31fcb68281f7c460735357d9d95d84f510fe504a85190 +size 2935352 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78349d58ed1d12b2ce657dd8a081f8c1783178c9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ddfc32fb4f6dde95bce013d6da17a7fad7ec8750e0ece0384af644e2600e345 +size 7822086 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a65fe88d9c7f2994933bdc2c0850c3cc4fbb79f8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a42b78d10ef0ae361ad722df5399ddbe5c1137db8362ab399a2a0892d40f1132 +size 4707589 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c46d9d71eb1a5c113480f5790e023843d9bb30d2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca4a517b6db19bff7962ce0afa51f4bf1d807d418e0af810d8da7cdda517e44 +size 5498136 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e7fa2bd57c49f11db6e8a4536fc0fcd2a562a88 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76db5949868111fab1521d09f949686a9ae13dd27502deb5503f379cc02d7653 +size 6285188 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..930143bc043f57518b7d0fe15fbf0b5cb5f743f2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9feb42b4caba7679a2ac49573ba08ee11d34eccf3f49fcfcc9e3ea2b4dd98989 +size 7079674 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0a4941da5c666a65756e709d8558dbcd50f4275 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce28951bd1f9d63452d2b2fdcabd08e05c9514e7c881bfa3adb0d8c85586537 +size 2890016 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..685d30aceffb7f07a09c4d75e4e412e85f26ca7a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded0a133da4787016446962fcd45b77ed9e1a4dc419439c32bbbf2bf1020cd50 +size 10411230 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f6fa6cfbeacaf45a767fc97b2d8b2a8a8108189e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87002237d8ed91833a180d83c372aadd3b2906796135c30100c9220be93e7c85 +size 7499499 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3ef2c92bf966e0c2ec54a8157d70ffd26d51d9a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85dc71d5fbb9ccbab24dc633d68140cd503a5eeefe241cc5f06c005db8dc868 +size 9782702 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c34dba282a91480c77fdfe6c79710d75d34e6fc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f2ee2086a4171d025156d4e88a60d51d661227480618ce6abbcf982a12c35ed +size 11822700 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b226921b54959dab100526f5aa5fe971f45b1790 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d119da09bb399161cf9d454556c3bc45a562a3d896c25e7db32716a97080918 +size 14076802 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9be588e456775679a52c9f769d42d892fb126f52 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89e50865b7dbbd6596b795183ad11ae0b06e90eeb091841b412333f6c3dc66d +size 2784177 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de5cd3bf44f0ddaad0f9bf0c41607d7d174485be --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:67a1c6d64aea3e9798c4fb447562d559c21274982f6d0ac461fb94ef5ac37bd2 +size 10114106 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cd257360acae8da5c7d4e76ce7ccccbe0c0ce10 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea371d2510ed3817f845778b54abc0aa643d91475b212625b75e1ac5816f6cb +size 14632844 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7baa6a45bc02a33e5bfbce55e67c29ffa976010c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1974f51d2248b3289e6520abb5968aea929daa117b607d17977acb67fe1f2195 +size 9570957 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd9fecd39bb2982a93c4444396786cb5919ab478 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f33e637710300e42d162895fe9611fb4097b34a878d22c5a0a0b633b40ffd4dc +size 11583659 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4092a4de7b8d2df11f037da690f7c78994421c22 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa295b3ea3da8298bb70b063c276777a557ea232fc8f86c66779b2d35a72338a +size 13794794 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5a4233d988cbea0f1253b848b8b070c04f212e9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30af4a55331420ea07823bff3ce06102c47a1fe9ec2a3db1b8958b143139fb34 +size 2836988 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc5bc41152e674287f2103b9eaf67f73c31c4b03 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5cc18c79082c66f98a72bb4766743f8e7bafb495f73e67c7229b42a71524ff6 +size 10212968 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf5c98d745d347001dae6a7367956f189565b66e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:be60be28ebed5e219b88c7436aec3e894fb6707c35b6d047ac4befb7248b6f29 +size 14759120 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c24f2cab162504db5bd9dff1b7dd60ad9f81589c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df4825f084e74ae442469e6c3818200a82adda676c68e1e5765d5cfda4d4162b +size 9648001 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afd3f48c0683e354330abc77cad35db804e26f32 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aaa4503caeaea68651e71fee076491dd9b2d7e612f9aaa8acbf06a98d33df73 +size 11672165 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b05293058e3f1814bbdba4fa5350d09da2d38ad --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8875a11f8c80aef14ca46ac3a35ec940ea4f8981e08c7628696d8944e3e7e5 +size 13897511 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f0d550a60be60521fcf1aebae597b472b0372dfc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ca5458e60cb3aeeac7b20e94912612bc0d557681861c9aaf4839af7a96e6f3 +size 2819054 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..55815a96b706d702d900ed862e7807c19a845b1f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf635d88b7281499916d7b4f2e589702cd22a25b12ac74d7b69e558fda4e08a +size 10191324 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf0a345018d01962e9bfc2e7ffe6b253b21d7b5c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c58493b2179f236b52c3c85fed8b05314de67e7256ec6754d3778a311305602 +size 14733634 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b51eb26f6ad3dd49ed7e03119dea3ce93e93f2d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2e2fd4b26b46aba1d5fdb39f03205e9bcafafb6518ac3e7bed4d317b0ff70562 +size 9633130 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa4a57b1271c1bd3a8cd08877719341ec297dc02 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dba8da83a1e15555b074c05905fd6e347e0b5637e50f0eb1497966e1c500b291 +size 11658741 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..452750e4e0da17ced96eac4d1a17b8ab8c8bbc98 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cca757ccee6049bb8b54b3ce45731909ea1f569b8060b95bfc667bdcf4c72cd +size 13882366 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c82bca472c5624fe22bf9098a391fbb27529a5a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3975b7467ce29e0c40c478edd480eca06f9527f4ad9dccc9973e8d36654812b6 +size 2878507 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7588e3cb5579222bc918d2e23e45c5b14d72ecab --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7130663865e82d7769e25ec2f4ef2e22e79f08b98e20680df51050d196112c7 +size 10328702 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9883572d8e7061821c2553de82b2402eefe46b5c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfde23fd4fac00daab03243550ceff687a6ee4d9f7a8e8f5d8215d5b34cda30e +size 14907986 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92c2f6894f9ba0b2fb5095077a35876ad883ad97 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b872b2cd7007c3bd12b8b97bc32ca6ff9b58ed9f0f6323d5e8d32e9fa81126fc +size 9732430 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..98828826f14b50c449fc81258a4d0c530d63a74a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9fb7bd634272e60b51598339a0c8c29ea1f2c01f0ae3b7e7a2cd0a3cb6bfcf2 +size 11770835 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a14b45bddecd5ec07fe5349875fc182cadf0382 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1baf87ad0148f3a2011ffe28cc7d3fde3918717b6a13db669ba51825657cf8e +size 14018644 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77fc7330228f5e8bf8ff64f7ca4ee67e2cdf7152 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07787e5388b519f1cf10547017e19b00688bbc4267808fe98fec997e6972d6f9 +size 11730048 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cc082a73269ee2845ff0af0b7fbe9bb9cfda2e0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab2a0f4613761da4ec0dd3155a6b431355d99c56250eb59b738c4dc05027674 +size 11391948 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2b066cd6d5c64c5cdd0ae63d8b65d60175641a7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53769e508a1cdbea3f418f84735811428fda4d92d0b29b6a40e62879a0ec841c +size 8969978 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..023f8223c52d2ffb73b06ec4731b468625e8c20b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7189d5a62fa5702dc69d6d7ce3cde473631a1f96ec0a16abc00e913de54a6a31 +size 5192655 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8db26ae733c3c4c915fd6c2ac9aba47b9483181 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0dcea8ca2c5fb456bc645193fd297cd19c1970a2c3bd5c8dff22d8dbd51b925 +size 5870651 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_5.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39916c3c2d42427dd4fd977fc0a9a39b59fb2263 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452bfbd86b4850904b9172ae5432481842005a590d266ea86578eef751dc92d2 +size 6521160 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6115905ecd95b10d34c61b916963b58597797844 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c023c1bfcbaf9208cf58ccc9cd3e167763ffbe2cb3b13f8a647f24a27a48474 +size 2900909 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a688daf9a7557c9c9c814ea084e5f53a33dc2a8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acafe018fa1ab13e5a7f5fba5286589f03476b03f080f4764388b61d4d5d78a3 +size 3683911 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73e27d27e85593909b6700f6af7aba04d60a0e51 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12a9d423bdaf543be537350829415ce8fc67ad565b452b2781cef2b581d0b0f6 +size 4458372 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a242a040ce4a95f7e3f60815e19b0c514c4db9bb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e455968a80f75599360d116a1111a625a38f77d24e1b065791f369926c2e1ab1 +size 5254516 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b8542dc5c296c8679111fae8a4df4335046e320 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee0187c6c63295b4c9ad9054c78c9ab50b26f47aa4252bf11967eb5dfb37c0b3 +size 6040677 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3093ff44c544d499be139966705b30c48ec893c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce96c13317015da63f63c3f9daa74aa93fc08f8e5fbbfbcfc23d0b4d96690e06 +size 12256896 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f1f7a8478b0a6295ea644fd50f9e0eadb6f755c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a95291730060978c7fa0142517280c75ce286632b766dd760070e8f249f8ab1 +size 6547408 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11c4ee96fe015329c38947b848756ee721e6655f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b1c431de764aaa812c82e4cefff4def99f5f551b7adec2f2dd6559475f5c58 +size 7015824 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ff3bc0cfba38d5543a6f1ec31eba4a45a9f20fd9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f42b1d8b76239002a65e4c2628b35b72a8b51dead2b7df0815f41e8f49b8a9 +size 3776882 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..297acc3c0579e5ebbbf5d6be431de0271174d396 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e79fd99e9878dc601306515a8827a6eb060a12001c975371b04250de287fcd3 +size 4056428 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e6554aa219f2257343bb1b26e4975fc78066490 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4845e155fdcf4a9b43ebca0db55219907c2aa96e5386d27810c76844fc45271 +size 4332575 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db38eeae18cb274507f443e92f90086d85bf6131 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fe6d1ec7b37bae5b3662ab78623e555447052a034a47a8948c3965986ff33ce +size 1864129 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..680025ad6e260deb4170115a8916f97da7505824 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e843a28df85aec41888dc4bbc4b7a761b68c0d800b3e6c65d683a79c190fc2 +size 2557203 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8af0735fbf848878fc250225422a4302b59bedda --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2989a502914889bb70309eac3ba491bc0847dc3fc872fd667ddab4a4dff7409 +size 3248305 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec3792f352a454ea75b715c6a067368e23d649bc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2a174ce78574e7a37f1ab6dea5d9d855896024712406e2703dd6fc6c9c558a8 +size 3930866 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5dad1049baeb5c6b66b63ece2c53682415e1ca1c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8aa4867ad26e4feb49332678501698338f83adc0e63698f03c8173faaa7ab4e +size 4635110 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e81b7d7763510ac67559b5af0de51bcf9801ac2d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31cf134f937aeaef828172b6491b0557bbb88b3e67a3798be4bfa9f2d1b0941 +size 5329371 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38243950be2ce75685090f54df9d95bd9805b60e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f34ceecd8cadbb8e7685701f406c82654b9ce057458816d212e8be772a8cab0d +size 2257372 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_1.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..111061e25e924812dc23b529645532915fa85c5e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64e1491c9202fe04706a42cbc9877bb7f3e4c460e4e7a2fb5b73825b3a96be04 +size 3017447 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebcb2464dad656fb0ee5ab86829447f0630241ec --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:950e6d82ed684a78ff32c295d2e24e023306bcc07c1eec5daa3c9b7f7bc90bb6 +size 3774063 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdcb6d090acfe0c9e46eb81e682764a358f0ddbc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b52a9470ce0cde1f37b90b7caa90bba3332fa71a54035902d2391755004dbf9 +size 4518739 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd0c652f2b30d84456e16cb0ab0386fbf820f5f0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba36a7077e3719e234d26c9b863ea745e8f511f939f95eee81a0819a9e08c28 +size 5293239 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae7dc5afbc2e2fbf4cb7d01a2a16176ac0438d50 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f611247604a33ea8e2d66d312cf31d1efce9442f637a41604572c74aac2845 +size 6054705 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1de484e48ae8ebf90a5512066d33c0a0bfe71516 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7ae3713e3ed3a6d7b13cd22708f2ee21484d00437b4395a87599906e137f58 +size 639784 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..192158c250431c97228b66f15e7a1a51e1521a6b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:62ff0451e62cd0cf8ce113d864eee850c18214eb7db923137428e402ce477506 +size 755060 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2904f2bff7b6c49b6bd2681d07ff9aa2578938df --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d73b315ff3526315c16a30c0ae5b6afbb2be23c8e4c6defeaf4d543f44774fe0 +size 871288 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb247445f6dbfb60bdea31a4b4f10f2e08c8c45b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadda34db78189ff710490454149ed6772a17e7490a019f588f5a5f5a618e4cd +size 985786 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bef37c5edf66cb0a12293c012dec0ac3bc56434 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5eaba60bc8b7b189364ca420043859ca52b42030bfaf51a95b153db1f427e04 +size 1098466 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1edde9815888b7a7a9b2f3d36a49e70582fe0274 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb81c5fad7a4e73a996a3944f291ffd9b0cc59d58ccc71ca7079fc459a0acaf0 +size 1213697 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d68756752c4a3b0a3353908994d5fac48abe640 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff548bc9c073d2f1eb31a6168291ed09c3983811e52eae8ea82bf39994334665 +size 1182535 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..745663b556b13bcff3a27f0e447157d336136488 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:199ab201ed0db4413b0f92c443b021b6ae0c5f8f63f41edcd7c2c0304a63c338 +size 1779370 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da8c1b416a2840d180ee88daf46d91f2e8b670f3 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:073f866d7c429f822c9b56ee7db23a8f4a1bbb282746e642c2125865d4bd9e82 +size 2388571 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..906938651b462c19326dced5a2662c13c104a84f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:086a355a953916d8d41a943c1b5c6e1d36c754edb90672be4154ff3a8b65899b +size 2973540 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..164d6948c0a65ddabce3e10d1bab05e2022cd5e2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99c967062f5050b4614a1254216e89340bac669cd2db61f5fa573b0de147bf4 +size 3555885 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bf9086c1a2ac05ba2bba59919e05d1a95fcc355 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da82dc704507098e80e2f2bec3e9f580581820ed130201bc108cd63a994f5923 +size 4144713 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e03aea920b0f13125b680345bca2bc6c48c54c21 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c7ba4ccafe58fd786c0c0bcc43593590c4e1285084c2cbb374c0dfb74b2814 +size 1328813 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36b1a77529d75be0807184c51b4654b250de7197 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11b4ffcd21af5c95a1b6d480cc8372060ffa87c0f7ed13f91f5c774b0e14b4a2 +size 1524250 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94a7e2862fc1747e18ba6b460c57fcbf9f0fdeaf --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d895880defc07e25701a7e395f4da5f65123e835c3e4427c17205a652712c1d3 +size 1720411 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..20aa9ee22c34551cb92f63ad44fb940a3352b713 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f059d78f60bd7ece655f117003182d8e5bb6134e7f27c288b539de3b798d613 +size 1915553 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3448ef813a5c3b8a84429a8a2e9b0ce255a59a4a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24271d2934a5c8b963a6cc2e37f816dd0d5125133e4cffe84a84e59def6c78a8 +size 2107321 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cd6cc80774c21f0f38777e8a27858cc3e346413 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09764dd75756fd5503d8c385cf57b77e12cdecbe287879289c4c6fc0d8334f91 +size 2300720 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d72880e95b6654a0c8b8e2e71a1d4b13d41a3b8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6880e3539289b0a9e5b28ed9a26f61498b1191977c92058400f52f8081d80b0 +size 1935151 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cf51dad02677260301ca1237a3e1c9d39a995fe --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5ad85c7dfae7ea91789262a9557a6f1a2ac4007733847e517362267a04131b +size 2636141 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..872b729a53250749c19785a554d0881b6d2cc90f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d28b33c854930e13f0b80fcd5bea3d521f3d2b1e76e38c08cd3119c2d98ba21 +size 6698676 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e5941e6e168c20132d736b91d96f9ad126d4a5e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c4529c42c54264b6fdb0394f5e08ea59eeca3ef8399ccec5e1a4d05453d28c75 +size 4038961 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..318e82aa959dd8efb1fdfc6207fb0b867166b399 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea14443220b9a66a83c4f9d8e9190ab8b5ea19da35def0e7e76d4178893e3090 +size 4724290 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..741866ae051efda35236dc2dbe5126c2e1a45188 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cad541a627b981122112d0e97b0235adb5252bc6221e9d9deeca467844bffda +size 5415467 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..027c581a4057a88311738545e2518780e2604137 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:571b73257a40e0f164ec047022b983016a63dade1eab83582c1c45241b51b2a5 +size 1870050 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4aece4098dbb7ceb4b577040349742fed1fdd523 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86403d33840d9446628a0f9acfa88114a9777cc1360bbe3b99ecdff7d4b1c1eb +size 2545405 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4bd0ce9675c67d73f120fb9d876a41c7a3ffd71 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8fc03cbc4f060013c054d4b765e6c4190f9e05230a7f426779c5948e4fc957 +size 6464874 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..874841937b91e0666093d13d9505a86bc200d1b4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d236a9c2b2238466c9270306957325065ee4ba1b327ce798c1448df2491d2e4 +size 3896127 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3251972de8e8da4606b2fb35f48f16cc7ffc3ea0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:70efd4ccc750b9992a73bba8cbeb518a4077744eb75530ebceb689cacb2d3ed1 +size 4555429 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e01d120b8eb948331dd731ea78f90cfb28d05346 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9133ad9c6f20aa576795fb46c0042edf97a67c12c1822f304421829826acea53 +size 5220584 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..acb0edde5e2124337c654b8ea689f69663cc3e71 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54a42cadd075f5ce35d9b54c570a2db42da9adf79dd5cd606e32d3988f801c41 +size 2213504 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69cffac16df252040bfa9c43e93e5461bd6ff65e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aea1d4eb8f0cabd86800ff1e634243d48625c61e5aba78212158d08312cf3ad +size 2974737 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f2aa22cad688e4eefb165849b91494617bb0b8d --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9572a53e0a4f8e4ea8e47664402ac92a563254f3f64d5b664b957c7242479f3 +size 3726990 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b36643dc1d35070c69508dee35d327b1afd1eb5e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66fb801ab5fd0d86e383b8a857ee2258012bb3d7915f90c176a29e87075a9fca +size 4478394 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63a13db88ef8575bce4f8b39e3ec670e4f1295c4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510ff55c8b080c6c865e8e5552db9bd567818f4359f50f1f32f0eadf291fc9ee +size 5230197 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..8b5dc9755750dfe2d03b8586403115b2b3f1a515 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7301e358cc02743f1c19ea94fec15d2cc9dbd09a46f2f496f9602ddc5c2e40a7 +size 5980941 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65612fe714e6cc59dda751ea5a8290c4ae001356 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677ef47cbfcb2dbd105a2477565a6501c5b15b51f09d838c4fa68c61dfe0eb5d +size 2360733 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b212b5a8d90c20c65c25dd358b8e47ff4d2af4ed --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83b05483bdf812cad7d4a1d6fbdc23155f772f07691c4b0358c2f75aa86add16 +size 3193641 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fa22a5b702e56fc1cb33dbb57b146f155ffc4bd --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be9154643b13fe12cd3037fc66a3b721a0b3f4417e7a62a4a5a6a1c30156847a +size 4016833 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9867616cf766968b152d96c07336a7d43282de64 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:828b1feef372ec17229c1b1abb3d536fcfa7368e5528e23d9477a81cf549b37b +size 4839308 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58b442d1e21f36f1e2658fa50015750b93dcacdb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670f7553b5ac2043a7285e561dd19f065170e91a4ab33a336fb6d6d3e744209b +size 5662125 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..966991b87f6cd2ec24c3d3b53c8b251c5ca8bf6f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:84894266524f73b022be0f804af33bdb04acab84a153cbfe518541954841d172 +size 6483827 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..288682ed3ea12be96c7c13c31a0b2f80967894c6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bffbc1cf24176f089c7658c5b0f76a46a8ada6bd64c06f8005ce1e324c8eeec5 +size 1879404 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18ebfde084f92747f003383587ec8ecb67a0bc36 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be35e76b72f8972d32aaaf9c394a5201be77111f24a832f85c1ba5134e1d2d46 +size 2435307 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a48837ebd60684fed822533581b3a546851103f0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68c3cb3e108cd6a8b57e9ee2f768756fe75a6620b17ef68173bc964893d1a32 +size 2985763 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..edaa042cc40d405930f0eee1e024128ffa6854e9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e833670e6d4a2ad0364ef46851ed9b4e9603515dff2baa1993378151902afa3 +size 3534372 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a03c6af92c5edf1eb5246d4c163aeaf8c1e95b4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb4b31aea2aee471449a8622ab69fbb7055f06f23d0396427ea614dd9b08897 +size 4082866 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13821346a53ba5639a6dbcd4127f7b2b6f8ca57e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b97fa7c093bb69513b7796bf9d58f520917e387b0bfd0d0c287b66c4bc6910c +size 4629879 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..f10741b3f581ed234b16c5b1941eb04c8117c34f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5993578ab42a01b649d0b7ebd93ede79d6ebd5bfd4c09b218a359f8197ef84d +size 2370864 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa40185006acd267a50a7899ab80c4e7d478edaf --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af890d4c458dc4ab12bfa1d05e8ee8a8bad6cd0fa93445cfa9a874d7b622ce3c +size 3210419 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1aadf61271360f4ff0fa47b947d1904f7c189d7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71997bc0aaffe4af396dd2632566b3af4b1adb502a81cfd6b37ef3543ccebf04 +size 4041203 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e76bbb9ca22cbe22982301cdeaffe3d9ae347cc6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade9b05a3119af15bb16168f908dd5cbcb5b47fc0bf6d964634aeacf97067ad1 +size 4871473 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96552a4edc61e36f6964ba362a9a4c498e55fb83 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24edda0e3a2037224d4986d3c3781fb5c3e63c8d34e97c304e1b3ea757a83414 +size 5701709 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13653aac6c938656c5c95592709628fc832c3a63 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0840a621cba09eb015d370815ebf27cecdcc0cf9a9aa1aa86de282b63d5a3603 +size 6530947 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3043617495b170cdb92da702f617d9088ea83ef --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4141f8751b068f1fbe14fe30f5bfa5e4f05568aa8d46135d096346cdc65fb855 +size 2343448 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b820a1b413fb6d72f6485255a817a092f8a6c41 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a0362cd4ae76fa5b55705a6715195a80cb9707da3509e87d10d23a545046ff +size 3157829 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ccfc29f5a58d91b44e2adcf3dc9f1524d2093222 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03571a2eecb75f94b25703da5e61a797f63616539107eead1140891b1fb620c0 +size 3962535 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..432e467701546b954ab570b757a89cd1aa2d3166 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c14c9a2695a262e5d540c177d8fbd70bd92001669700f3724d56841b3b855193 +size 4766245 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f8929387692222e33d92628e4162fb7ff70a060 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d329b9c02f127441df5dc5da945fbf95bd3d650e02b6361992afde1e38136dc2 +size 5570331 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6479b882af7c531de221d01b3dfde3ca70c784e1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05590b68245e4c13fad3d6d57106f71f588e292c3b8675750158f83100af7e5 +size 6373336 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bd884beaaac23484922974f9a53395fa24f05df --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8db414e4fcda7d06107f1dd3398eb8307e5613cd277d2bcb9a193f1a6553a8 +size 250578 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b81d17c680bb22d0665158474964336e52e761e1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e055cb21252e195b4e195c5151ba61d91e24ed87c8337727a4d92a56c8f43e0 +size 351320 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68810812dce556f6a86b2bf409a164adda684135 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8a3042812e6fca81b9ea716ed10c2c64bfc75a86204e71fda320a9765bbcc1c +size 449624 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10de5ac7445242ff3bc87685aa49e72ac27c20ac --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2f4c877794772be3451c9c6c6922b851d1e833b988a52017397d2d6c0492a53 +size 552976 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53c06d8598e34166bd62cff41f8bbff0905a888b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a188948cb61e6093f97d0b7139948872ac73dfc765a6762b50de34ca122bdb2 +size 651592 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7da6dd2da5d103611ed3bae1b53d84c59bc75811 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b91423b9b28c0ca726a0bb2b5a9ba9decc9859ddbebd904485fc38360843ca1 +size 747495 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37ffadbe7697029931cdbe8ecb3f865d05fa05e8 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83be1d5c261f0415e65d85bf8e7fd7671388d6c7da6413096d0079ff2575be7 +size 292896 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c206ccd95d4a85d88af30fdc32ad083fb00f1ed --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b4719d7bb3b48cd499eb0efbcd750f27849001f704aa320f7bad371a0953d9 +size 535290 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eee178e4320c07649d9504d420e508bbca5ae7fc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d65e6d7a6d720df8e6362f121a0f4bfc40e0fd841ed9095dd9ffece22962dba +size 660270 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c5d68f23019886c2d71b5d2a8beb747e3d9bc65 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee6880cb788424a36c76187336fd225d9bc4fedb3cfdacdd73c2b9479ced1960 +size 780481 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30ee81927c87997ac4de8338ef1359cc29807ace --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da150bc65b858b6f686be389ec37065b1e2aa1a5c7f6f150f63e790cfea7cd6 +size 897993 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0096d5579cf3b6d0ab0c10fd66f5d6fbbc313a50 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8780856cc09cacc5d78be823c80732dcf6bc397643bc5e4d4f3b97549e883d6d +size 258341 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4b701c3144a57bebf6741b99687398b2e3decd1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5bd52f7751535a7515181b923850fc2eac14f8068fad7c7e24e9c73d1e059cc +size 363043 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ab9835fd4faedc0df6e5438be1c71ad44d970b7 --- 
/dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05180c68e8f82169c82b538e0c8fc4e786ef6421ab8a7dd1ca48bc3eb5635c12 +size 465274 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b9570879bafea6333395ca30d90d699626dd78ec --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffba214086f926e4a6d7bb6a40c0862972bef315eab1c15e9b84ab7757c08ba3 +size 572517 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11bdcef3babbb8114dc5989f237bc5231c924b26 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7959ee3f263d24d3792a2c1ab8347cb27363548557e900c9c58a58b0dbd7a0fb +size 675000 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b22843e2a202a0f01a25da7264799bb97d62849 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50e66f8653ee56cac3726d1199169e3750fc8bc65194ae870e392b5df64cc7a9 +size 774762 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..241a8cd0ce36158f4821f8d5cd908fe68212e731 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e45d50363213d138a4801067fe3656303b06fb5c88d3e37e7225fb8a2ba54799 +size 261138 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47892a2da78379b30f4bedaac349013bcf6656b2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf1647a187dbece80bc7f0306b8227a06ab28ab8471aff60d7213637dd44e71 +size 471883 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_3.jsonl 
b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..04e8e793a4b26dc7fb6d88d906c37aea7377bb0f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5321cdaf0ab7aa9faf72a60d89f752fa2d8fe111382a54694a77dc18935167f +size 581072 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d03b3c0f93a8e59a4511b1d183bb3e6ca329af2 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20fd66b62151b6aaf2135cad08d6c309e25a355e79748b9b9321158e7ccf144 +size 685498 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5ea1abdb096bed34c514c8ddfa7ecdb08bf8760 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b00fb0e2a900c54f7a742d70809262a8606bc56c204b631695aa9051055f17 +size 787207 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a60c86180c78678a354f0aaecd65b6abc0d7cff0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15277c235f4ea4ab5abd6efbde5bb4e07c3fc62126cbf590f9b4c7066314a7f +size 262198 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b0b59f725b3cd166102ff751ac9548aaa1c9f62 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada75fa258c2a8c1b4a0c1cfb232c677fbfd5b83d06f8092cf0d840161b96d1d +size 474655 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cc2012b61f175703a6ed1fee8b50f79fde625496 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5cb86de5503cdd3fbf8e94ab6cd5e1d5d487ed3db7b414b2486a82d9cdcd8b +size 584676 diff 
--git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1944c5dfbac3f33a7b4520d2a2718bbb4fdd49c --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9880fa22bab36c220c58cdc14fc0a3858da08fe7b6c27713d012e65c217c16c +size 689940 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ddb546f2fedea1588f62543205d3d3c104f26fb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef5235cd08b840474974d10907ba2f21361de7ca822f8e41f1bcd17d3dd71a2 +size 792488 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ba17a895b5bf8fa86a96fdce42a710768b08058 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7dc75b6c4b506bb6d1d85504d5367feaa8dd806a75c175fbff80182be6cf185 +size 1039227 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54030319902001412fb8b1bb35238645a3cce656 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957d906ef93af881a6e4b874eff4eead9ef622ef70e3b12bfc71079774b0bd85 +size 1300169 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afa021cf8d093e3304e0a73b9847d91ae026edd0 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d857946318e84c9f0d86f96a9a1005b6d04bbe9bca4931d3bd064a3490cd2ee +size 1561244 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aac6b8a4957bda12bb8c3713b906c89811465c26 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e791203320cc5b6cb3d746b1ffba557fe98c87012f89026e4e667115b3a5ce1 +size 1822744 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..719d0a7e07ffe78a761dd62dc91d1cc3fe33f6d6 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5324784b5734e69fffb3c614698d72397031ca3b0ad93883c413df64cecedc20 +size 2083003 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69daeeb791a6afa8405894a324c8ba0281867dc --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cfd9c5738e5292f77910e61c9b162dfcff04d39a9d1c7f481fe66bf80325156 +size 2343772 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e245f293005aad140bb18e66c2ed064098d75a7 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38a93b006d8627048357bf8f028bcccae590a0c17ec1e69108b542f48880ed6 +size 948111 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26c99d8e5e7b202f1efe66c38a592bd3a589c592 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f97eea9f816a989faeac76e4fda627ce8c2bc8ca0cd267723aab1728f27f8269 +size 1180522 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42686a72a6c529b0998211314474ee68bee86a70 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf482182e1fe97f9a940972b7fa3c9a9bb2c497c82fe761466f5b135d41496b +size 1413042 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a35afd8ff853ab70550a1989cb1c270cf70d8f45 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72876f4ad114f3a1b32b83999c3125871f81d9b6f732146c3a30d8cf43c85098 +size 1645863 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1f5087d3cf6bb1114bdba5701b18aab09d1c9a9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d99ae3fa6cdd2b4509fe969d82b1d744ab3a46b93bf11832a8ca3f51e48095d0 +size 1877766 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7bf0eedbe08e07ae9798bc2a35466095d733532e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a5aca73f34a22353564126d1852dfd16149bd01fe38c13c695d55491e2cc55 +size 2109873 diff --git 
a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b763df7937a395a76060fdc4a2c7e57e75b5cb1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33aafaef2020052b20aec9c072b07c9025eb634aa50441844440cf1c86e8bb36 +size 1010089 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc1480dc4c7a57f2d085c5abc4b656c3389c6305 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ba72e4b4446503060a322a250e3db898f695ebe946e2750321355488d5e7f8 +size 1243176 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbe9844ef3c8d331d52796102445551c418e16e1 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4d14c310429f9148e89c7f442e255092dc2874d683bface429fcc7a6e15e0b +size 1476374 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b464e0aa8775deef722dc9f952d5f26959be2173 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad744185b66d3b886944784be37b96537d3f8a828a5e5587500c70bc9e0dca9 +size 1709959 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c66ef3ef41213d3ff57a37b0248c1ea3963435cb --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c718e69d17aece6a88d81d10e01ddd9b7b015e2c7e02f24696f0ec0204797fa0 +size 1942306 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3985f5c0d7443a2687b378c030099343759ee079 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de6e942f3d52d1b97ae65a632f2da3a646501eaa5522219385499783b2247a82 +size 2175120 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df9e69b9e0961eae3b96c67f61b40cd10e03a8bb --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59bf716d2032abf69807d97bda66ea301af74b3b8daf5406233ea8c41ecc8cc +size 969466 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a4273c49bbe2184edc9f77b09167612393396a3 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dd51f317c6ac419fd8b1656895c581b751e60f59b822d3398dbf731af9813a5 +size 1205073 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5e8192b27a4602a0bbd0c8d21e69eee74cb9851 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ec62bc2665d8b09892bfe842ebd5902e732020f3dadd8769426bf4733b2d1b +size 1440819 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e068941ac7e027d4366270789f624c9bf1dff61 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ae9a501cef357febfbd74e2676b9f81932553d074c8e126334ac9f2293113d +size 1676946 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7de66be0e3aaeaa498f31af73f6bbfa861acbc40 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6a7a6d21c53b9724039b9cf810a93f45deb8e1a5299efb4c877a8ff02dc231a +size 1911922 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b81e957dc9048ad9e48a0f0395f754ffee4d0fb4 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2dcf0ffb2636e184dc78bf9ac446c6d61b832741ffd37b0da3dfb62d13c7a7a +size 2147260 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_0.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87ab384a085c1f8b6d0979b7f474a82f36bfe24e --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e229400473d8cd5addc93b69690aa03166756b640699ca5e1b1e23a79f50133 +size 1016500 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_1.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..513c6b1b0d9dc764b9f17f193e2a01ae10a5d357 --- /dev/null +++ 
b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ace74efa010090e1fd220e07f39509a06201b9318ea3034e15ff62b0e46cd7 +size 1257118 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_2.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4d18b7a8414f512431d452baa4c5a3eb3cffad9 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb053d7fa45de89c3ac2d161795a637eaeb770f012f192884147b7cdfb607c0 +size 1497928 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_3.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68ad5e3defdaf0a2c0410d025b43209c43200814 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e705c1104025ca6a2b5c47131f827e7d06d5b5e1802958bfc4e1aab7bf1056b5 +size 1739112 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_4.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f75c1b73099ab84ce2f8a873c208f82d27361426 --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc097bba3e35c647ab273c6260b36e06f7d68b8fbeaefdde3b51319ccb6fbec +size 1979101 diff --git a/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_5.jsonl b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7368037a3c0bef2557662732159a077ca6ebcf5a --- /dev/null +++ b/4b284b21bc4/eval/examples.4b284b21bc4_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d90ee977c36777f980ce54248f55fa4d1b0db5c8793a81d722d3de8a69269259 +size 2219523 diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d8f8715fc2dbec93287f9eb67c5b7802ce7b051 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3423886062648571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03277534172219839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07116912691303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015925050224480028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.2998523601292701, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004615423133559915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1078155721409226, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020097112571708245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03290155614985229, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009635800950074162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1430151870039024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003133828558045969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04998894903569846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001254978454654862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06817638471397719, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001440134234671522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2915042735714293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004516752469805975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10384529013890176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018689379304153127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06758374630355668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014755310584898722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2855613282697172, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004322222242728911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10254842199703165, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018748895711891628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..125091dbc98d6d2f6110709429c772384c7f2184 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.43549317288896894, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.029438186163177924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07179524471867899, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013752072779383184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.35855038856048876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005049635846993475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11204264924342298, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018816879797699437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.032801419147362856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008197872630256377 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1723852958712864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035877516735487143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.051345397484036256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011584347174914676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06711558592890904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00122163800184802 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33387823961526397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004537882130758048 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10481665765175784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016825296642133658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06830552553162243, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001292240550330291 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.33957294457499176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004661961230072316 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10653343348961458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017665097203702323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51847f3dc7eb3f978a329fc87cadd88b4815a3f5 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4731158648079456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019398006063203924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07373712610103833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013232763617533117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3838400605745808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004971353366559517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11565039220118124, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017909510986326618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03418002818557043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007999127016124942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18966804432678097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037070132910163134 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.053828506115298144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011299593484305154 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0688866789263329, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001197716696827759 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35416301817745277, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004408869173242153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10799852330681028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016342464145243179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07036220911566307, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012542262936579098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3642675981143951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004634298128673839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11029915615802689, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017015090669507955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", 
+ "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93d700915a2dced4dfdeec65a8cf62d5649cb51b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.511033492138013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.016836817368392938 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0711394556954671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012159991412829965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3852018338546001, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005067790324136812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11234455309812195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001658068318944836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03260081169248259, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007319564987104789 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18947442868920766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037625536404537635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.051724489676439236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010463045011873814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06636573472304302, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011254814997386617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3527863813991901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004403124502538447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10462160106353212, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015374663592284314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0678291021786637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011705423632374366 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.36350690371493805, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004671323997080324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10692341362108594, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001592257988435529 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9221ebdf3e9d46b83770f06182abfbecdbbd51 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5255384435057461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03418580070894041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07289393265058683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001221203882596679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39210555607501246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004907654599821545 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11520145885524227, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016544612830049232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03333523314795858, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007290358506575526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19457648128168623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003616303249775819 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.052942763106877684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010297898091120672 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06752250125401707, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011086146136256522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35896707400416245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004279012704408649 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10655328646588215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014999277772535491 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06949836967046885, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011669918564828742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3713609435685992, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004540169352829801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10971668045319283, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001577124609883823 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..53e9ff66a968d2321e5e4eaed57d9949319777d3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5823615010118224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037398869921054644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07346269739111895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012187681049390564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4042470714837263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005065518163952181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11639260450950206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016495247557604747 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.033940599163428696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007391477550345831 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20304234761076909, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003764961452207978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.054089458597439195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010444166935940533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06792933252591714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011100797550413972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3687266029366595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.004388331338140494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10741878654019481, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015017472778460627 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06991029736225668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001173008019162227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.38168151833425595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00467124723267976 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11057542829413115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015808583588389114 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a3732f528cafa83f2f3d4aeb6bfdf8721dd44ba4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.1593816005881288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0028463012802740157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.15106039306191837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018614327334657758 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.1293668266283262, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020451981284945325 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.010415064972944536, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005832498460949334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.008381836594921248, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005769442404081656 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.008447632785522565, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005005390644606267 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.12882295031952942, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0022244289545785443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.12997760982624038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015539465574021857 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.1059430520968093, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015859113615098958 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.1354477122353216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002537160964118554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.11805775221027685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015629478566191073 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.10719224487960032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017665884689823603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.012484298141528558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0007013360733034469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..95eff326e64bacf4b39304c7627cdb510563e658 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.20808691686715283, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004313398059878784 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4306521086221088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005516793126900347 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.2328558522551123, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003900111659742201 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rouge2_precision": 0.08181585393792677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0027980013134964224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.1900831484868744, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004381497415255648 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.09334851918279623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0025591605276398923 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.1667225810957368, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003461815683539065 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.3693635934874014, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004943896249868083 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.18783561618919367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0029446786453837417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.1831100821976502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003960631509702234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3774482577828289, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0050625094090084085 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.20343772508503727, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035163687455997497 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 2.2982278621261973, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1086928153245287 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7719678723d9fb8cad11c03c28e280715972831f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.44907951348269975, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005884356170552851 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5373597445020493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004966707028771244 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.42431308600342277, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0041620247128580454 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.25059145216834844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004580817767902642 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.30420539737108304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004424545920380444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.23334399501205214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003511605630791461 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.3650320462167926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0051681070707463765 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4461965468151835, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004628706403782243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3447121578849147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036720178726937637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.39789891202282635, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0053996273811364355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.48008802419414803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00474893114047731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.37564451182458136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038397764850582553 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 6.486000451731127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1760272135645943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..77141582dabad41b815846eb9f79eeb7c3390949 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5092720807541621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005761992782925212 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5373968350772912, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004897334207979844 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.46569868214556953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0039167763038128744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.2902018344321373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004714799020672444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3073000070344161, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004371879453082279 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2613220581162864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00360096423581038 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.4098376883485247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005103095930707823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.43837950032145484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004496609058976664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3742438056269717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035350046513184163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.4492052702368639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005309452409480573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.47692163068169213, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004649900767270472 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4099104459367334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", 
+ "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036009459176647974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 8.592270404743596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3478687773942717 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..40a744c4b82cd4d34463dae0b0ca017ce3b53e20 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5265934478163272, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005660483553931701 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5179234781275224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004868884431211911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.4679567762670693, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003781568636086802 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3011205615910547, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004723582948062983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.29876898581754224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004360553531900465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2637473868679139, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0035396559441471915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.42896446744335276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005075014781599364 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.42738457538098473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045810446815337195 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3813739343787495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003540381010340878 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.4663362213990527, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005289062452425001 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4618244358692011, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004696080984340952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4136944072158051, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035754128454684865 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 9.748227001648862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.30465771546881343 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..65b59d6c733a47984fe4691b254faa96bd136353 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5462360493800595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005571448210438329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5208556355840253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048319172639873075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.48448665725831064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003823220765053316 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3165259707878258, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004816977889561416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3048098435714976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004476193519328458 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2771416755952798, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0037021003433152576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeL_precision": 0.44640610204674275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005009545091352138 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4305349369311683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004579985246877059 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3962223744993141, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035914467869508364 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.4831117938397976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005150478425110863 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.46459910587278797, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004669405569917108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4284939067785979, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035897376609210704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 11.081728167045133, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.32013487215713093 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3eab86cdd3b2f5525fcd3bb58a989c026ccaa569 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.08806482552055464, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.013753939759917743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.02425622875877401, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0005394400535560749 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.2187977886143418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002563987829673595 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.04217421071874153, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008234598917093276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.0027418010825137774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00027769817357513153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.020309096397945828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012796421259529498 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.004552592244101363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004257311158814129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.02376285292107096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0004993494816843593 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.21647307133945715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002496433700435587 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.041384238656935184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007632069889173777 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.016764423629076094, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00046755444259852875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.15365295702315013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0020731095703446366 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.029066972734033053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007033196865904391 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..99764e25dee457047527a8e817085e41ec2fb7a5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.3977041826217982, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "bleu_stderr": 0.06476957892774847 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.08085175511474767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015420226394619978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5398549553723303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004633252724794029 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.13297509465908738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021274594259975206 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.03501440648141779, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008855447546426636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.25426029625002416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004268932324515644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.057734516186082864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012735245538216635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.06952000055759049, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011181437346579404 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.49228292845501337, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004312668887865199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.11558954246536195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001566596618343675 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.07046525801920696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001409866048243462 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.476402513292327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004517149863331296 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.11581462372515029, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019451873798571877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1e6a452efb31d7036d7f07ed502a71f48138a136 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.8053876932339799, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0692633939605246 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.0953355786305315, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018482318125930552 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.587435498341808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0044160353671122265 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.15238609977285375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021050373299750613 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.04635353043989582, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011008091437590098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.31351855748984986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0042413072919440875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.07451843059126674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013538507376262556 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.08027003022288653, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014089308706206984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.5250978321957749, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044081811988050555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.12964573051803532, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015563770094126131 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.08462110334448762, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017266673212118049 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5256991157959195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 
0.004362166200252256 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.13493557268343806, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019537513800864143 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7cf22366e2edb3badcd542266ed70f460067bd39 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.9133849288269293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04864213509199311 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.10564975683529444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002213084642274899 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.590100549099999, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00427885948704963 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.16402787501046193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023163623544869127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.052782129610134225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013492091587997011 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.3231207925990761, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004256840681994388 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.08217622838318879, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014861030323633745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.08753763146321639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001717519449731679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.520111059713261, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004351158002319349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rougeL_fmeasure": 0.13744565617599114, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017570257873037582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.09389990009122845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019967608416885307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5308296127486624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00420828920591214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.1459665149392452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021389387676408454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9c9c058f7b9d01d6898b173dc5465d4c1e3ddca --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.869050618620975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05010088619712935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.11579207158486948, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027602287304337434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5769641854763163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004366449526449767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.17069557641057814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023989321755758107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.058968068407052966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017140614292096468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.316819162619288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004169288630207608 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.08649984259029919, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0015781356762232315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.09510940810876867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002143422239902668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.5023698294671003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004376544901142601 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.14186920549480303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018314233516765332 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.103240891734108, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0024728914583356716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5194520315283727, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004312781932079048 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.15217682563526053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021887647918235626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2f19f88fa01ee67a8cb7ddab714d35149c4536 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.8835135529548273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06110654983373539 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.11973455465296313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002935618771876205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5670989994254594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0044381978706761 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.17411810979908632, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rouge1_fmeasure_stderr": 0.0025066282022309615 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.06166786996995915, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018761458565112188 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.31541818233678315, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0042975296956600266 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.08905956074973984, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016585801215981953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.09825615481667112, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023871232471909232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.48839020581913895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004392795682637931 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.14380871166547562, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019567050216977417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.10703554238169732, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002636419782384992 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5126878599494855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004346737958136646 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.15569229150500444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00226672543918796 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ceb07fae726df305bac865ef2cd9e909c038f03 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.027292817639688032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011146302412515304 + }, + 
{ + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.18687932979464777, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004479317229289314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.04522889442637627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015737069173370269 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.007585884325607279, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006241186679155009 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.04765534435088357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.002842447724510974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.012155444595988949, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008835894771083748 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.025041692580921926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008766166267586572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.17988054876134196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004148939076846254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.04203637506264007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012876809656108756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.02362548960154832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00092997017312708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.16710354998948618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0038167321468611055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.03939048904516523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013252431846109201 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.3002948882840799, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.014715720186430872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f340a1e41cdab679c20639a5438d63cd4e19e599 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.18576332777291357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029496821484972765 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6724955092023494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004391893376238201 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.26592346891155666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0029378841494143377 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.08325319598690542, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017911862931407451 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3291202433540325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004356879936719757 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.12050338190039989, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0019700080370958755 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.1397921610751568, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021793915810409932 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5409023321464792, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00450531183061488 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.2022817877685023, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002109071664874411 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.1579221423717992, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026163266317572958 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5798479579428988, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004227813926608466 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.22586865290383526, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0025693268375682216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 
2.3174067250834143, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1095420616112965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8b9ce03c5adfa3d249beb439c060452bc5dc1e5b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.2102247428366051, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003936648833938165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6673737234050499, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004469275663366487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.27845169302997186, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0031924386042951483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.10221556389262312, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002526456468616791 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3481713656770689, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004502228925703636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.13465046885050352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0022013696056236006 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.16004846098807926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003086567397610655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5368811685423543, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00452435180901271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.21334136742049684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0024025933828259196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.18189899956371253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_precision_stderr": 0.0034717018098753866 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5853292242838097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004328558737604698 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.24099516029436527, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0028365419353728666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.8194042853676433, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08001070612632513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e592780a6ee7e02d3d276ccf28e50fac3f0145e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.2214105510553713, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00397852038777946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6470607779127537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004406369519857878 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2895578361728388, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003271301388482767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.10995468701919953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025990883422240055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.34262834959953414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004384132009223851 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.14344396360996736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0023093129067051307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.16816306750707308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031335861563326884 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"non-explicit-description", + "rougeL_recall": 0.5184873981262235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004441211075979107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.22148965895132458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0025138893644262995 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.19252227804988106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003532058484150838 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5689593244437532, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004202078292368127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.2517396049055751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002894872783585922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 3.0104483725602478, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08167667230226254 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e8e0fd6d9271294412b94a7ece7d548abfef3ea3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.23364828958458597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004386673691400771 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6182500596257372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0044847981647693925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2938502007499035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003471270572664401 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.11633321819129998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0029245832979215612 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.32548790345098827, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004370629409800285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.14528102357273256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002520152138311905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.18086003912409268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003662637084531143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.49642367395924064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004453695802686763 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.22758047906242576, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0028386957066170496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.20359868681678098, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0039290266599936495 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5457174954752654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004285590260813723 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.25592564085521324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003081730914660469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 3.015666054890952, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06037288426011814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..364fc23080f5badc6d75ba80b92c571292a7534b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.24397794795287123, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004784117876438954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6048647467371622, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004654471513157018 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2956946057982067, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0036965817626566813 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.12634347401889862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00332524580607351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.32128383108894604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004439902579230902 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.14969245864050304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0027172096623066288 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.19006924520843738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0040021127107316675 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.48595046650876467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044648101430210824 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.22972535325752128, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003016005286453144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.21334505184004257, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0043020289632197005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5333276897372095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004389519168542818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.2582096378416781, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033136409122061722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 3.0967052765116168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11202147855167385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..c8aef2cb8b2d2c72aad4ee391a83b87136cf3f74 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.018405096713391255, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00028575863010679416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.13437768144660597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012216701727659973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.0314250857721497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00044204337073046926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 8.223475482087916e-06, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 4.15512150984225e-06 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 8.637745180723215e-05, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 5.0776197761980826e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 1.477135016534563e-05, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 7.496818160211679e-06 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.018401510662055136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00028569714825004946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.13436090726457042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012213468546436975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.031419100488651396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004419332549890634 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.01264230643724104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00017903922143252607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.09751365313301781, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008310455100700755 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.021716982601686145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00027816849062495334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.0019246988010293315, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0004673515089684523 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c70ec292f4f0774afa18a1af90895cf20f7b2b44 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.12318551179378552, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00265250718788714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6558902866499821, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004947345321902676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.18551745196543523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002702377330670695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.05344001076519088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001521538259378062 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.31589673451602196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004512575556417795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.08099215111185744, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016729988751392074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.09390021055693351, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019816232535704525 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5356505485725407, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0047206486739923195 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.14267940599206763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019480493372883946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.10862326603227693, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022972669057671545 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"very-explicit-description", + "rougeLsum_recall": 0.5918746015837786, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004841757394783012 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.16432822281253456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002361497951165923 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.6637023692480732, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07113198418739454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..645da5a1b37cf32883d4a859ddac4275a1845fdb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.1666446893829218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0032707753020478673 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6729567321222975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004305012822265882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.2377267859934131, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0028511858947845655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.08075845494399067, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020915699463510836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.35411093297637475, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004506419345004118 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.11449533899405696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0019357009062000538 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.12643098669316738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0025705410191854284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5356644614517825, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0043565730389747955 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.1809798890103334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002186958670887679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.1479261397803003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028341939795893155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6104555998853953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004304349996355649 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.21219482060830694, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002515047834126115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.647805827243435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07629701303117106 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f010146c11b9e2fa18e2d63c48989325aa556123 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.16348222985039676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003146872550209196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6750999942571687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004231543295192062 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.23731957047929436, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002799505256926228 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.08053104845339831, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002115424621313934 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.361316044646828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 
0.004498247573306673 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.11630716134029079, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0019838199857175193 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.12237541399150458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024629041187135487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.533219718242574, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004361069891506727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.1784341269505221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0021166909618823932 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.14659271750171685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002803961686626933 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6147977963499615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004227824501478167 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.2132548329175755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024974068122956754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.7650889809369867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08861961620174373 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f03a0ce225f6fa75e75af2af192111cbe1b15fa3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.1597629414302839, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0033033901216669354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6823730805286056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004113105685624019 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"very-explicit-description", + "rouge1_fmeasure": 0.23104727016762644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0028026542146094534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.07889136187736123, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021839099115212703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.36533837647073053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004382971120376491 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.11317180592447942, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0019371364795214888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.11952172602503129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0025993702279697504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5378383490680047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0042111474429222555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.17336408160930467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020829759705918555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.14312638913068235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029110532892333936 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6230515627846621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004122693884939311 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.2079495191882183, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024838961950283595 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.618154777450835, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07956567939253502 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..640f60b5edd97351aa9e78363826af2a4252de0d --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.14955217982650695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027686157067858337 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6935554112517629, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003988293582524454 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.2261504763429586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025955875570617386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.07346044534208657, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018168270456380426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.3742165028406123, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004409012138004924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.11088772044085522, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001779337380104819 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.11122194025511048, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021369513975720436 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5461500634585164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004179927239919983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.16925176967777375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001942005609299536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.134551132386729, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002499557879854736 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6321720185352039, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003993926482242108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.2039003781849349, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002364317599492143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.615370951594709, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07817523000072259 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8ae11ac42b5a5f60824863071a5d84b01410a7dc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1646736204497174, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001944375965208406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.28908804331097093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027751587359995016 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.19551467871244005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019193758332128445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.033596976210297794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007783779535836992 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06189607439452786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015755697883312382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0401546744710191, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008933850730994426 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1159536094809458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012430606328199091 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21237606965032682, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021178449235415907 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.13948475018165474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001256562390447179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.15307248616009633, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001797771989893273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2697257608869243, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002597333354461373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.18192377197057139, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001775025387207881 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.7901688600266696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09473494560163237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..290b5a327387b9f071c62bdf036e15314f4fc787 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.18024946393550403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023365148116592273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2858663200405377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028577355675838095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.19965649147362877, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001954107478734798 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.04273282245742925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012020669128774719 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06727475322952328, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015736861230501358 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04582510801169016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009888177785209124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1309377911100276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017619039502037427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2128779538670664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.0022235556075845303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14528886846186012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013520944350136044 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16786109069022864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002201832250644331 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2665923708356373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026751422180246536 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.18583509127014775, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018160956867022732 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.1642687288297267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03609118184667803 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ef07faa9fa5c178d29afa95daa23dcf4aa267f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.202956767711254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002676187150983512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.29159713975018653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028298980612215777 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.210239996789526, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019255181063177478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.052380042676589786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015473198207191262 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07214674692201985, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001657658589837987 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.05097602932656367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010520475342058347 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.14900247000453432, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002165214590894441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21545359034531394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00221033373558327 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.15265446172906474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001367842815236307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.1905565529645814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0025620703427353705 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.27385801709096713, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026819224687665406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19702119815111355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018063906023379486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.5286650412737517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08014901866692561 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18db68c1e7e2c89ff0b41da99627b762aaebeed8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17231067692889554, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027956027935857304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.243172171476554, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032858246005163156 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 
0.1757806794088883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002256613151873409 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.04314429570273796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013561190191307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.05984664442402973, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015879641807220839 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.042273822347749235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010073987728216936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1269442197755899, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021738281830229454 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.18134356725730427, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025688638483573395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.12839540685323708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00161856743478718 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16152443369273212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026463298498459026 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.22829827365100974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031167189145814398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.16451248533027701, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002109355534586791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.6775327228175545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09702171839636106 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3a17b1567bf5f52e6c915f717f54f849c8a6e0c1 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.05610293921952886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021934282933142807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.08033002095461989, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002892648072265376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.05586170258027261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019271475347510442 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.014518361578080535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009832377227663412 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.021124036864236993, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011841595966348834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.013928620381908459, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000708425052711742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.042762546108040304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017529381402243264 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.06135293147334968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022662727518870454 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.04182384962792516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014295239104789405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.05281014844060752, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020838090836962377 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.07549636242613741, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002735416186516671 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.05238777661289263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018093330643826854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.4585803694189491, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.042449805324812594 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..23cd5cee0f655d146f14fc181dfc4db7f030da58 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.01015187837282015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012118368481013803 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.012597080447224141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012614668952345483 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.008568589881060913, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008425364496636647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.002070478186206016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003593417665592502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0031561784431225486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00042801548350165436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0020632447863588753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002724530157499788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.008016061817972518, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010514974373518058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.009704734876679203, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009871790542858633 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.006424819886854842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006247554238591577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.009477961645303942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001148548084981138 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 
0.011862110948524358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00119353729888407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.007978769124555215, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007798817433535373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.929822783994029e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.454754027890242e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1568e21e4bd97c8124bc171e58f4548446814360 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.07643821536141272, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00140218679541492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.12653305762501957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021916544609098315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.0881869594314063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014841987809875977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.00991125872816128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004535614337776502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.018159423646898012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009381758127412718 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.011775575519769068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005303096103283162 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06754639878793735, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011484776442586243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.11490796429365001, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00197654999272724 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07869794174000905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012488046435799164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.0710963976259558, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012842232965217812 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.11843767033976603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002056362191261535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.08220909707902971, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00136877406788079 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.5845357736513314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03072266241415454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c38e3fa9fcba327c963bb04773aebb75664689d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.13347948856316436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00190203694307572 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.21739906685237975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002808744726060509 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.15296390666938464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019322995775618457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.02419402358312838, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007370518433472602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.042662063621771204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001398060091307213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.028480082558715494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_fmeasure_stderr": 0.000844119944335723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.10050917437006718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013279033129049013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.16941427074493215, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002206464398514822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.11635297169383609, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013711527236179993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.12419910398685438, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001754709084087944 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.20345233530607656, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026368733023479575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14255020342356567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017856582246053023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.6714920428051223, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07493178085444238 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba252eadea8b13e3de26739714ab60a0f61508b4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.1554086212544319, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002067211936039912 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.2544322573521774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002810922596892263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.17865057487457384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019832213339117484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 
0.037714132074904975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009374766307825127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.06361946862872042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001591682210752721 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.04342414284752008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001003062599440997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.12334722471039417, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015067280447067734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.2091785533285848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002327818378218966 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.1435215312656172, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014687451140009584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.14357511331593745, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001917549831974918 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.23666987395605485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00267335114823891 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.16537912003845628, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018481029569662097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.6275735827595286, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07295788379483721 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..45006b09dca985e203e48d5ba380ead5aa6c718d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.13637703415923158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023528280402658097 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.2101854175375818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003134663411489244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.1487528972942182, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021573858174289036 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.03442882399419072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001085658422803244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.05416049539107172, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015569213296257193 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.03703687600072442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009586366255869859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.10957513679331089, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018197153562858968 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.17454640678104377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002643517915284805 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.12065017013720424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016584003927085527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.12614818245454176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022002630882711474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.1951306252568764, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029575901472317177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.13759161249237567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020057670100919424 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.696717061739458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10004414748713579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee668e1e66a4b065f10d313d742587e69eccd4d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.043639000260860615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017644092166845472 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.06690857706863987, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002513457134572579 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.0460595175107433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001696044544780373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.010977304219473067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007111284641033093 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.017770704280928078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010731197359821348 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.011604393750423206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006238601062608368 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.03598281883079817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014505603927416834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.05685382605643658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002163845582643641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.038179249417304884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013818818185200492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.04044545116309038, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016519202271058225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.062070973864341086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023531363135469723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.04260911072473102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001575090983128174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.45244912498308465, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04522890822214683 + } + ], + "config": { 
+ "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..325442335c53db1fc50e6767460a4ac94f21a39e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.0070189779398907685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008113662184147494 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.009756903947301174, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010491271958210223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.0070051523902687545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000741818765951434 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0018834961678454644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003137323371258015 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0025948535559862265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004145432652310791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0018773141620031116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002924665783736417 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.005576197279787843, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006409300377598502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.007827196064222617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008343635995047826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.0055660827324147755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005823170803562475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.006493130647527894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007525468046300129 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.009066194317283121, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009684333729492796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.0064846745016825785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006821624022857435 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 5.251000373867168e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.1017248657586459e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..37b0135e29b616fa46665aaae430186d5cf14d41 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.11663618257355224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020010876313614093 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.17750836507313886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002420755888491993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.12655297110315525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017223356296797345 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.01645523569334117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007281743616814746 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.025103051974347854, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010138593670258253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.017208857113077958, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006040791246522063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.09528496641012617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015116946024390716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.14985724685972096, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019870170906742244 + }, + 
{ + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1047005055608947, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013001810997754814 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.1070586643291757, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018157291974967367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.164451689142304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002238195137110574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.11665980577339671, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001570131237321446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.6707040144799241, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03623222494925353 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e55034a1d7e37854e27141baaba5e70a6a538c5e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.1538305631637577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002118137735001832 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2534164331356793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027384226381832084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.175746351793284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019277453775472107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.030776430413685576, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009835988394066894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.05014575490604494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014072381348858034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + 
"rouge2_fmeasure": 0.03420510386300404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008827127864692935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.11549602941828332, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015598015652852524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.19577461014108763, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002133713428500096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.13274190810557143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013509301692574175 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.14403643967460825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019822677694410597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.23804944003176418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025835044144481086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.16461109581502553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017958278888830555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.7421869697677301, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09072404254874503 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..97b5a9265d01540b334d0d37af3e5893259fbcc4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.18530745879024665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0026984526710165596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.27372904579227686, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002825062477476198 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1952096816483372, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001995746852085018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0457473960215223, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014209292446018405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0647678078130054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001616150713511116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.04558123756721243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010186767133771723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.14324298206218397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020861451808508974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.2174336811042295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002306599328778544 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1516146243997318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014416996193126431 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.17310048096253244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002540880696334963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.2564277940851817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002676735414961789 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.18225288248589572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018597154608296434 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.552370721219516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.056456617033753535 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6893f042c486f8a872dcff60cafad4a81f1f6d47 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + 
"results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.16655273761856884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0030693164785274346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.22239594480968503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032442335330023385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.16311913030851707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002334208239517657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.04426024560468626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015913171237693896 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.05557686275427169, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001569025220107771 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.04051754398323109, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010700958454151245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.1312478956516053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0025056286449919598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.17774125142496916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026625038105766233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1277212531982069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017662200246993489 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.15605200382139137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002922128458943474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.20818681432495342, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003071577904437599 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.1523177208194968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002187292192524935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.8903876098590975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08033021470087338 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b721302fc58c4b6b3c0667d33af5c1e833cbfbcc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.05445104964004579, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002443079649660942 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.06600911297746138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025520971880013655 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.04834308907312041, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018503707685198511 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.015529728785536703, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011838329122922067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.016747064756245943, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010305687534927282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.01222026826951555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000704325225575219 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.04394332257272588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00201438218193949 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.05360636578416732, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002104351294157986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.03857646230433444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014595106562382601 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.05112939118161437, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023090030630797185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.06163586533175658, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002391851276162087 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.04516151790202869, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017340898610069717 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.3208877330233361, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03494147074048822 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d839d2fede44ca136634023959115c38a879d90 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.007010026455738011, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009017435483523394 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.00878867615147148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010157525862754392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006459190800150568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007207860950963499 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0017497370302994892, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00034838975656659773 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0017941002253548174, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00029001602890075666 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0014447433493777688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002371563408671573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.005424480425576281, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007093535726255014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.006810607613407806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.0007726864270681841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005000926315966357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005492449452936106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.0065291661542634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008431825007789647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.008127880829555213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009349057417684032 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.005981178381897325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006652187976853396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.642808156949046e-10, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.0958818603819076e-09 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a04ea560bcc0474d88bfdfa709e04e0292bb2c70 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1463073350919793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001849976731473478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24687692205659445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026245416604448834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17104966216653122, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018388410541572498 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02868070817665277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007455100631059865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.050970339370236116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013773850589296726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03393757157227001, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008395819605445009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1134975332744349, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013046086784248739 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.19882847469371265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002160779528818444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13431002294362815, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001329252784566218 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1341572569615249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001680913399139742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22735010800139077, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002429214690741554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1570085078363825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016723834304551517 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.4552750330972206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05646997136025405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4051788c590cc0a7c36d38c9a6a2c792a2a1e236 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19104818364291273, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022212767005973058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3165545579856358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002818835296854103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.21883314563862602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001968035833203396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 
0.04737338234449901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010449765003741809 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.081573363995006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017297723073854117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05423182118294372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010465141353409224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1352656471757331, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015360042939327185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.23168675674466935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022572191515858007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15600647601501733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013358035921943341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.17973117727215357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020830282164021902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.29889898054403574, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026791790500151685 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20602296990458846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018407304080033926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.807870663958701, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08388649431195123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..46982b7aead4fd0284246be5d8eccca95e63c4c6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.22091940935835946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003036544780507811 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rouge1_recall": 0.28592540020450335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028268458929152375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.213319730404769, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001947942927707738 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0625668775391695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001863481800778566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07636442527094099, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016705370533154184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05619324678157442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011378174233996204 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16524622277086715, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002504977446892061 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21312172953880804, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002230572391304009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15707830961714367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001424704542577375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2088533869488438, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029209986594073956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.27031759901723496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00269091513844755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20133540483239756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018432855451969799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0758182963274967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07269865886048295 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..99553da8824ba6c5c1679c081d0fd3d2013df41a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.21388175415191102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037807201763198858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22109179833036308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00317048229219639 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17690311487463797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023173541475324253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06310561248802647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021350241076741575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05983363177094545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015915281660158853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04775424867054453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011657814934947282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16656647324261953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032154715784240336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.16788106886771248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024768275501905303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13377606619329538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017521905460361976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2032660883469589, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003645991164048245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.20988404520976878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003033569663633178 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.16765421945999662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002202375233222233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.2676357062273516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12290136548273946 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b2a87b5768349046192aeab42c0361a3700bcd0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07217489085431607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003030440715398035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06719640775054124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025953918642397556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05381265634572307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019615926895963105 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.021665967083708774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015098116987345535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.01860172806438717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011134284906677621 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.014607128479951145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007893617726734669 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.057955761496930415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002549676328226009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05288705994275681, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020802147270799964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.041897511040161206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015198623386063986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.06845672227370601, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028945029688266562 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06371644081837961, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024692144864177654 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05091825334557307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001857333758746474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.3207302415446247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03951812995159553 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a0e2ba9890b3190b5b0186c3ab526346123f263 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.012746310259293975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001498370110228753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.01036719903184163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011041663951146658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.008600982975105755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000871586481245996 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0046837043167768724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000851048694914921 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0032041325852383415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00048168285623642387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0027143726441978717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004021894165602589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.010658331542732118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013171585867806077 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008285275201527513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008892336234217646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006953217638177468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 
0.0007188116843350712 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.01225616498642844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014511703007000516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009942438567258063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010638347073576458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008236010332581721, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008379564793264335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.2490869778774144e-09, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 4.579875964652809e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dcd97431763cd6514351f178a4a5b274865ac37d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.10754772441287547, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016958455385689898 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.17816562206177886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023545814766439887 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1242136099389791, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016788768510989007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.015527269147006644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005591631732568687 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.027484473420738165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010336047843406612 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.018163595558952392, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006156401137280107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"rougeL_precision": 0.09256520381779812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001314822114051182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.15709670414112983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019776606072259206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10783351661135016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013120711524955912 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09949748623454367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015502911503945975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.16623114509571557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002196915069674266 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.11523321457897195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015382866944637125 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.9470864610845786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03249798513588096 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1e8c83e33688a01ae29c6218c4b2072c1bdb0d5c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.10903330571413737, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018189243088536072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.16131809640403347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002537055300872018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11703392994984627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017519304821591252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.013663796459003277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rouge2_precision_stderr": 0.0006556047677219756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.02186335600606571, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010597085086764856 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.015006978500477466, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006493711106973352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08286088730918939, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013035354194490178 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.12527349544952351, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001902974449846937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0890622266980778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011999850822209978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.10274455693959166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017065171168041343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.15212866545305415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023755694730805688 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.11027318467498079, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016371611741646582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.0004698111272452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.061814579441424315 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..350a014c875f0b29a99388407f597c698f5b2054 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.09690581008488926, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024248475400055754 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11326850686300136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002336894022718304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.08849504604578734, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017680512462351341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.018300502482703724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001212414444454405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.01947166619266159, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009643040730582587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.015011655554296155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007047911945416481 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08087122864775938, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002027221626489267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.09562923022814045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00189683385928486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.07359670365804201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013609375455583069 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09100233183422309, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023243523379837595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1060183257047995, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002210133291756926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0827072308194733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016679545909741406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.2275559960359324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.055602030188377644 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f241054c9972b40d23a2fb0c127d355efc126275 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.08727403229824231, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002754747362815702 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.10038330296542393, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025674911073591644 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.07581549153450809, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018567329894756907 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.022062717078419312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016202827842642569 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.020684116383698558, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010865171767872877 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.015436277594316166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00075744970662451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.07371123650658079, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023770899696980465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.08513597314680194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00212812432731865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0633376000334696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014673941274429858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.08218273785211457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026325580514724206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.09419184346305906, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024294175505879165 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.07094827851387892, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017436711590059298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"bleu": 1.4434859465762546, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07239736975664855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bb4de162926fe85cd6d3fb0155fcc3f4c76342cd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.02122229087394324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014003737821723116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.02652989886804251, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001520146131752233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.019677720071019873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011247541319417723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.004726681009885704, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006457139194226688 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.005360196615771528, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005369786057079111 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.003915565527437388, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003687016348744532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.017594475833257826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011648478992631079 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.022309854870142254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012646138680947798 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.016278889399882698, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008990662478189324 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.019818945547900205, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", 
+ "subset": null, + "rougeLsum_precision_stderr": 0.0013230955041361396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.024765412779247827, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014222742525507647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.018303637479182004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010484815874735873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.050241534910141376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00739529390509812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a629f5da0c123327961f9a578b63eb2388e87e31 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.0016940318129783684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00043807301732911447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.0021116213008436154, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0004157235401925989 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0015455675621237376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00032572665041987446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0003818107289535861, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00019345670461531905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.00026797247013965285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00010331614803977232 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.00024341445452042402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 9.580449462072127e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0013961176573495614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00034646394432675024 + 
}, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.0017757153532734638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00034186622977328717 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0012780986139275944, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0002518502465672378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.001588174661722081, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0004292160855573092 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.0019250318967419596, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00039002719041384033 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0014218801930012729, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00031119324514029866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.862729574232818e-18, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.585352301614098e-16 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b19145a6f891a3edd8e152e72dbe679b040e0c7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203928 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014683991951087967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..243a7c6f8d73e1d56dabf151971820dda8891723 --- 
/dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014842213153411244 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e092ad421ceae307ef1fe8381e90c2c6bf7014d5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39d84af3630d20e5a7c4907176f655e6a378c50c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01506047203170662 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c94142c92aba9383ac24590b9e66f05a63ff8f54 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928373 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732958 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..671cd0316a3d935ab89e69af55f482892af42da4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014976758771620335 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01483050720454103 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0637c79b242e20b07e93759e7e6b53b529336dd9 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb07c50aca3df74b3dc30aa696fafdfdc7e2e6a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0a0561b63b7d77d93e690d43372e51dd29d55955 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015167928865407557 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015100563798316407 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5995d34b520b8348bbb43ca893327102f33e77fc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015167928865407557 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015167928865407557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1284f04b6ea46bb76d9d737028db0a39eadcb060 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015129868238451775 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4ac6fac5a58ebb90ea8e5b99d05f4a9171965c3f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055235 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811478 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1127739642dc2bb3c229dd675e7d2d542559dfa2 --- /dev/null 
+++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015167928865407559 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a2d0ebb8977f30e181e7179bec411d766308f8cb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff136c30aa6731aff7c7a618e8c1c9b31d7c78b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015195720118175115 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01506047203170662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13b6d6f96e5388c7bfbdd364b8e2f0062d6c38d8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015090650341444233 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d374ab6b80723c9b2d703593fdcc8b665189b5b0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015029633724408947 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014842213153411237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..37420da8c4a04cdd210315fb6b4bc4351d395ff1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928362 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014709193056057128 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a6d4b279bad855c37869598fb0c35c9ad5c6e01f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014842213153411247 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.0149981313484027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..387182fb71ee6ff5deee71125cd862a8a43ade15 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811482 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..93bbdcc6ba156e51146b7247250602a2e56df2bb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928366 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928366 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a12ae14cbae5811bbcd1311a4fb13701a8432143 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014853842487270334 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014794927843348639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..14bbed8d7e2017a97b003c4b958fe87907f7f82f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014853842487270334 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..14a827f9919b370e6ee06c4e51bfa17c30665c28 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01488827258820394 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..859463bb4c5470a061e91e5a5ce7615d253e2013 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.356, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015149042659306625 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b600cc91be44797483918ddda1e0cb7a4d4b347 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1aa0f8268dd8ef43626cf26ce16562affb8b6be --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015167928865407555 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014965960710224473 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0cef61577ca8865699a3439d99ce9f951d75a03a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.355, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015139491543780529 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402709 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..389fa4d6f84d8dc02a37c19a88f4e4d5c63cb43d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.336, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795027 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01483050720454103 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..68d9e552c3cea6ece07f12adacd05b17a8008675 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014955087918653605 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01486539538592837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b20a7f07606a5011bf705bb8773bb9b93ea6b11c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.357, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015158521721486767 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..6ab7e28afde4add1989ac1a7f24acfb893ef7150 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01467127282297788 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01478291360099668 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3b0db293cc6ac09df5361e6c3c8fafefd62a632c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229873 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928369 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..efbca88891bb70d5330df0e3e6eddc10a1f652ff --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01495508791865359 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + 
} +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ea77b2f3153b58e7100c5a5be58a411afb2d249 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.307, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01459328489285263 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..70180e46301eb177c8556ceb55c6cce073788864 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015008706182121731 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0065b6b240f185cb879ca60ba85409343070fa07 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014910846164229859 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..316488a888f92728e968c1d4084b086f54b37c46 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7481ee04fd89118f1fd6263274f19a8f2705a5b8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014865395385928374 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014794927843348628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..51c8a1cbf8c30ac685f4a5dc8ecd1b15cf79250d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 
0.014721675438880215 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..64340a7ab6e1dc2bb5e2dada7185b9c12a9981e3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087964 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014553205687950436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6677175ce761290b727b017d5ea402c7c8f051d7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01455320568795043 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014806864733738857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c9f71055bb884e803f4fc42dd43bd6438b18d341 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015090650341444233 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5542682a1ccf015f517d0206bc941a272a834807 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a4f8c0042f9138a8c493eab410b476c530279b47 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01479492784334863 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996692 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c14c702afe56fc82fc51323f2c2daecf43a5aec3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014965960710224479 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977881 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..69b01c4beae6d93fbf257fe02f3f0b95c4b9fc42 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732956 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01475865230357489 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cc25c85bad66773c758baf1a6d9f7dd30971906c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738863 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01486539538592837 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aef49b09f170470bdbc473a611433f52dbe306fd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014987482264363935 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..10266392ab12785bdd46bb8520a207ebf3305d53 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087973 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c3b7d8f3aa259ad837e91b627ba40de86c7eeab8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014632638658632895 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e452dbabb800cbdcfec564c6aa810a417f6e7dfd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574885 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014794927843348633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5c31c9b2af55d70a5f2335de776fe1ad17c5af8f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541038 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934645 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aa9f1f454e29ed05fbce5d2b5d9af663830c0d14 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811485 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a2b2c0dd74f146d20d660dc39ccf7c337ed6f307 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014976758771620342 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ae4515f5f7bc0e26861795e2f1532317ce1f83df --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c3c78d4a9ea4b7dc9a2fd343d2b3640a09ba9e88 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348632 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014758652303574893 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d5374d08e78fafaf821fe687efedd21beca52225 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014865395385928364 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014721675438880213 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0544865a8c695a6119ee328e0a6236f8c0685dae --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.329, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 2, + "acc_stderr": 0.014865395385928367 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aa5b302da0d3148bd7f88ecd3e57438e2b36d48b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880219 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6e767ec79ebeb962d5322752e500abaae6a022ea --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821476 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.35583333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013826518748493324 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..2472419e90b0f84ad32b084fd9769b23dda74460 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01363087184382147 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013728421539454872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a7ffd3e2b26fb7fa2cedd60e3a0d4758d3331509 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01357953127780092 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013672343491681822 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb0b3e98d81fe0775e1d1c40a068bbc909649d99 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013596836729485164 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3433333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01371263383046586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..33db337668142be044d060ff2e4580fec20fa452 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013664144006618271 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7b965ddc4896d0f43d2aab8cfb3b1004c7878562 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01346230971200513 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013553211167251939 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46a5660cd2d9929a3eea6b8061b062464f9efd7c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932877 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013508372867300219 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc9d3093b33b5dfbe948b462f75c322bf63908f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bf0cb054ad137b119f2d1ce55860e4e708b103e8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300219 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.30916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013346684134591948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b242b095c1dddaa484fc453d7f86312c2540ecc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31833333333333336, + "dataset_path": 
"anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013452948996996296 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb60134b1d9a55b92afe13aeef0fbf316034488 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01342456883035645 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.30333333333333334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013275870057740436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d64b0e2a15e4fa4e40eb7865646ecfa5ddb0913c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01335659633120026 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013395739415639082 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_0.json new file mode 100644 
index 0000000000000000000000000000000000000000..9ef25938a9d0d7b32418e218e7d6518683937d80 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225603 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..615a6e52e73f1a2d222e22f35383218a968e79b0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0baaac2bdd26cf3f741bb5593264b5b37ebc4d83 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013526454480351021 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01357953127780092 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..786d3c9d5fefd393a8de6e7fa49cc6f78961da99 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710526 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01353542204341746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6193c284ae8f095d62dd160c263ceeddb340a2f7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013424568830356453 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013508372867300222 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3752d223fc190df76d5b0eee3a6db6e9735dce6c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013376268790982096 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851555 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e162c95a0123c54727d25ea4ad83b28b649692 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013480882752851555 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7f7aaa3a1c1d1944059a553873ae64975e135e2c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932887 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01364760294240639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dbde5b5227aa3d36b764b5d263b438381cfec118 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013471620929769144 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..273836b7f41a4fe73d3092dd95d398473fc4bd6c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070709007 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013579531277800923 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b7b822525d4b3ddfe433bc724042000d1537f380 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406394 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013696658778002515 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5caf408c717cf06e86158337f2a2d388a3f19cbb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618263 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013680495725767792 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5316be2ad13bddf63fb9707ed3f44e5a2d53990 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013696658778002524 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..012b98bae928663c62660218c491205f35ce7cdc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + 
"prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01364760294240639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..312aa95de0441f81cb4f79e00613559966ce0a86 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013526454480351021 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0edc88fc6985801722d96cd61388a9e40b2234f7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013688600793296936 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..33012c90f0813dbde7e6fe3071f4e5f9bf5832f6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013443538681348054 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01355321116725195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8a20bcb4ed3387bcae57e8b2cb8e4ab194e211a2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013336721143136467 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2eb07f4669e1870dbdd0b123e1ae527dc3b4bcd1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301829 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301829 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..511e032d162b2ff73335e1b2886a3f891db24752 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01230492841874761 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01230492841874761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bdb330eb1448d18750b4d9767039c9501b93d9ac --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01266819862131543 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01266819862131543 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56f4901352d3304f2aca216d478f9007fbcedc06 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012739038695202107 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012739038695202107 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e0701bd2574ba23094172b78151bbb81fd56c3b1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012653835621466646 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..625499c35b2cced43eeda7be87a3cfdcb5db95af --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587092 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012536554144587092 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..435d9934107d63d36a63211acc61f651afbe94bc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012739038695202105 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3046075085324232, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013449522109932494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5f80790864127c32ac69b124c33060d9740900e2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2593856655290102, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012808273573927095 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6067fdc638c051f94b02a99c4d61ca3da1d8e771 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012668198621315435 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7ee247c5681360cdbc4140f2109d6cde276338c8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042967 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2738907849829352, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013032004972989501 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..56f4584ab51078ab45ecb578b9eeaedf340c1037 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2773037542662116, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013082095839059374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1dad2312931267b4a2983a71dd0ee59d06ae3a5 --- 
/dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012696728980207706 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2773037542662116, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013082095839059374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8eed930117a8773dcad06c2ad277ea56024f1838 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012368225378507135 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012915774781523224 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9756f9a8a36680246a0c034040f87bf12fd68d4f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301825 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26706484641638223, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012928933196496345 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b829fec29fe96a970eb16c3bb7cac6ad85174675 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587089 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012794553754288684 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8457f7df0900f475da1fbd9ee858599a5175023c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012445770028026206 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012794553754288679 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e86eae82347a6bb84d3553a41b4854e35fd1bec4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { 
+ "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739432 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.25341296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012710896778378607 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fb38a0d4f7cf969c9d5fb5c16241d2d038545ac --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24658703071672355, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01259572626879013 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2593856655290102, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012808273573927102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1fdf440bc6ab5d2e567ccf66131b30407e7415 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301829 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301829 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..42e1add3c5d8acdbe2a528fb2ef91da61c1a10d5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012288926760890797 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012288926760890797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d37cd68d2a865d75053d50f5333e7db9f1479b1e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012624912868089769 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012624912868089769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..345cc9b9f28d319699eda6f3325e5275608df0d5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25170648464163825, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042961 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012682496334042961 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2239e27913b1893b04c13598bd91b4bdcd43a66e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01249146853239057 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01249146853239057 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8cc2ff178bbd01e2b377449b928b81d6c32f267d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012581033453730113 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012581033453730113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..61615386d063bea4c0a18634fcc2e26cb9f717d3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012821930225112563 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2935153583617747, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01330725044494113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6e056d7ef7050c68924f17a7de160e82808cba0b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012551447627856262 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2883959044368601, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013238394422428162 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..075c80681b0ebfb0679e1a8879d93997735f7480 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01255144762785626 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013094469919538798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2a369e6a96101a34c964835365bb5af60f4a7e22 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01255144762785626 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2858361774744027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013203196088537364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..56c62d8729d21d9f0c6f34f6cbe17eb0cec6513f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012521593295800118 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01299380772754578 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..263b75548c5ad33d54124801bbe4a3c03cdc0ef8 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01268249633404297 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012862523175351331 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6e0718cb39400b5d1198c9c33acc907dd345a733 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008885233166386385 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008885233166386385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3d6fd9410d401a467e0fb4818c4dbd394c305cbb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00880917174472056 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00880917174472056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2be774e52b5568e754caeb8083c72b616c10721 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24116161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008778027378258021 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24116161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008778027378258021 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6fdb65045c2eaa6ab877056ea9dbf06e6fba5a32 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24915824915824916, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008875238553583164 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24915824915824916, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008875238553583164 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4113135bdaf50b98eb2a7fb91da6fdc7d89ce4de --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008798836444222042 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + 
"subset": null, + "acc_norm_stderr": 0.008798836444222042 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e983d2c91ae2b56d4cb844865becd7808341c232 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24326599326599327, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008804009846865536 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24326599326599327, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008804009846865536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf67e3b552666a04aa4728819789608714c98820 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.35395622895622897, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009812370644174421 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3253367003367003, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009613427708996196 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..de150cef94611679941c6ec4d0c348c00d6b694c --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3468013468013468, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009766326091716005 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.32112794612794615, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009580787536986797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..517622923c910a63d6a8129990da51900ae78752 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3383838383838384, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009709034670525097 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3223905723905724, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009590672908157438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cc3574b1e3217818c99447ffcb7bfcd2af3e0409 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.34385521885521886, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00974666058485245 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.32196969696969696, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00958738669630038 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ced4e744ea72dc81657a84c2bf4d501f34657ea7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3480639730639731, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009774627600259014 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.31986531986531985, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009570821820573587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e275ee3b73013018c64269a4390f45961d0d533 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.33880471380471383, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009711980224301643 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3253367003367003, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009613427708996189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..03cee34cdf5e255c5dc62877cbfed3a9102c9919 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2937710437710438, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009346423298166725 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27735690235690236, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009186490105111899 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ae2c56f9cc0f7cae4e1ae71be6aee0daf3b5db7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3253367003367003, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009613427708996185 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.335016835016835, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009685160765932363 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a00fbfdd36ed113e1fb27db836f087f1b1173db --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.351010101010101, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009793703885101047 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.35563973063973064, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009822854395535489 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..21c91ae4d621f50c1e87eb124aa9005a45de9357 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3446969696969697, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009752321586569784 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.36363636363636365, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009870849346011769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c05b38e130e428d7de409b13bcb6325f90088e41 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.33754208754208753, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009703117820790301 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.34385521885521886, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009746660584852448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a60a1662fe0b97077eee0cf0384ca2309f99d220 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.33796296296296297, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00970608053863286 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3480639730639731, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009774627600259012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dbb260867a716124cb1512327b721d4e72146f09 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008850055161459239 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008850055161459239 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7624b6ccc62c83c6849d2e25d091313fbb3edf22 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23905723905723905, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008751754723580432 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23905723905723905, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008751754723580432 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9de6441b3a5098ab6abd6f02d10c0138088a80f1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24116161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + 
"acc_stderr": 0.008778027378258018 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24116161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008778027378258018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..64269bdcc16e61360d0459b2f4fbed6bc08ee41c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24621212121212122, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008839902656771865 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24621212121212122, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008839902656771865 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d609d475496454198dd1d111fb3171d566bf3814 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008798836444222037 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008798836444222037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ff6dac5bcd00baae01b0af6d634042585c564f23 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008885233166386385 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008885233166386385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..383a5a5d5da02df5e151220daf80acd0176290b2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3501683501683502, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009788295410093153 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3207070707070707, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00957747457110883 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4e6a39c06d8cedab2988a127022fa0f3dd7721fd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3425925925925926, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009738105469984187 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.31523569023569026, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009533589368505848 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6588284b3d9fc1343f15a21e1ed6cb187a2e454f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.335016835016835, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009685160765932357 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3202861952861953, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009574152668739424 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9471b385f562c0ba5ddb95a3036e5f4a6bb7376 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3400673400673401, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009720765494805276 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.31902356902356904, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009564133249441085 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..faab14c17bc71416dcb6a1f78102a68f7dd85115 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3367003367003367, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009697166595752472 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3164983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009543851857323888 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..01205c1cdeaf2e85f5a172e7428cb22d87a827c8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3345959595959596, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009682137724327905 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.31607744107744107, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009540440071928285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0faa854a23fdbdd5b67480d568e653d5fdc4b4dc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5143333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009126478842204577 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6296666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008817866528166162 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_1.json new file 
mode 100644 index 0000000000000000000000000000000000000000..cd3fe4d84acf6dc293c2a3c0de14f9f38e5e53cf --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.493, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009129336317272389 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5726666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009033293159951217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..91fcd16c8750a819a21d0f2035cf23fb4050214a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5063333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009129498646958133 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5886666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008985524690229492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1297842bb501ea80f6614a819b94e583a913a49e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.528, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009115903679831517 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5966666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008957972256087354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..01a9aaaed77575477e8a9cb466148825b4f3b516 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.531, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009112665923139411 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6066666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008920048383377182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3586b1abbba14f2958e041f3053d7fc684ca03cc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5486666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00908687931270849 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6083333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008913348354532974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f229975e04dc808f912d35e61c4f90dcab728de --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884811049411477 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5203333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009122678313140908 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6f19abbfe56be66c58999ba310f0b0e79fb8b54b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.546, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009091509877386513 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5413333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009098980657278165 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87a4a0fc63c2bb495638315a8b426759b201b160 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5836666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00900149831714761 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5663333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009049526374650793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..63073f827e811b9ee1aa77c1f85ea024c8298248 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6116666666666667, + "dataset_path": 
"super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008899620943397692 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5953333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008962735560535848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a2e1e7f1881674b0d01ff3f58e1725a4f5aa275c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6136666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008891174310695492 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6006666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008943269429955153 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82ec1d12604789cf9356d552f2f6a4d34f3d499c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6126666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008895417372116209 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.603, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00893440584870012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_0.json new 
file mode 100644 index 0000000000000000000000000000000000000000..61ebae85cca62d567554691a253ac0dced4f4760 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6216666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008855801251873015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a503d4a19f796354071e8492f5feddcf91d1204 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6096666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008907909838637953 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5866666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008992028793524417 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..af59b16992b62225c592a25f71a6764b812e2341 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008933122315228996 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5933333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008969751860881005 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..45e7e8eb0b174a447beeac51b64929f994e3792c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6083333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008913348354532979 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.601, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008942016171856509 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d98893c187314ebe6f36a07b180486aee3d76e14 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6133333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008892593055774285 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.607, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00891871708850756 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ee7fe4b4b0d337bc8063d4ae2cc6764aad64b0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6183333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008870849530787627 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.606, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00892269792043816 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f0396198bdb02c19620d9621d450e9690ad6d597 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5753333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009026006087500425 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.411, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008984425782182318 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a0acb00942b48a84d3aed30cd9a373a2d55b78b3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5676666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009046234144187917 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5483333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009087472531749428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed7893c4404597edca221ae19a41e9ab0f3c9bcc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.593, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + 
"acc_stderr": 0.008970906255948529 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.57, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009040312075041279 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..466b6ea8a89b0741082e463c2f4c43549bf669fe --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6066666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008920048383377177 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.592, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008974343780026192 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a40059c34ff6f6932ca85ce6991b3e69c339f80f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.614, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008889751171543848 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6006666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008943269429955157 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..452b0f03c66f86e57808c6cf427594f1987ce5bb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6123333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008896822947561613 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.601, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008942016171856509 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ebc0f0cbceb639f9d8d99afba6751912d2b8159a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5276666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009116243039079383 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0076aff6ef6c8f3940f308a0dd203356799c25a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..14d34901fb02fcf06d27e65f3c855943c9a286e4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5303333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009113413981658816 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5826666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009004578551254038 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e56c7d1c296a5998ab94f9e46644783a555188cc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5283333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009115560243539187 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5786666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009016519157880409 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..000f6dd463b7e57d04bb33d8c21ae3f29fac90df --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5186666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009123866148533357 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5753333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + 
"subset": null, + "acc_norm_stderr": 0.009026006087500427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8ca493190806c2f7701b9ecf05b55faf8907c5eb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5196666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00912316564893404 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.582, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009006610887558775 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8139ee6309a3b50e29bec87b0e9e35d0392eb530 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.22456964006259783, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0d84cb0d7ad2d321782cc423c576c0c30f23259c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": 
"cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.27045454545454545, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..69d35411e16c6417ce0a1f902c29e93de964098f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2613756613756614, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ec4827e176e6bf0c42af00d93b2be275efd436a1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2489177489177489, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4d9ab9dd9e70320a365a677328f02bbaf914b6fd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + 
"prompt_name": "GPT-3 style", + "acc": 0.32142857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06297362289056341 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.23462970093697855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d447aa456733c8fa2e5a7a5066d6641e2f2aa15e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731725 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.25051020408163266, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef422ad461fb4a8488a47ca1b18e5c76ba36c7eb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..caa98031488e54f843a17a5329519f402ef6d607 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..14cb72a73d4a47b9f10d8756f51ecfa24303fa6c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.31979092421002614, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a64f35df25c84a6e0369d240ad7313e1d47674d0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2887426900584795, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6b70cc088d413c19de64686295275c2ae0da3e6d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.301994301994302, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cebe1949cc04ae82fdbe50237d51390b600efc67 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359542 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.28651292802236195, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c1e40d4a1ce436dae60fb179d1767faad50d4b84 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.4156746031746032, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c6de1dba6e07826b824ea8c4fdb59d479dcde5fc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a5e5e21954069e78d73a257fc647522c4564dbce --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2794380587484036, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1f239307dbab5bf671cc701cfb02e7fab1b80859 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06585388898066351 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2772108843537415, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3b628f4bef8a38380182110e0bf7c945975342f6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.31333333333333335, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f09e0653100d90425d579e3d0c12fe327e84e9c0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3113354970549345, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3023694fefc60f4ac6446d7579f8012e7dd4704e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.10714285714285714, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0417053005800816 + }, + { + "task_name": "cb", 
+ "prompt_name": "guaranteed/possible/impossible", + "f1": 0.10352728047740835, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bf629ff8fe8f1047e3711dadb6c5ae094950dfc8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.27858293075684376, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a421bf85689ac5c914408f5ab55eab061e8e1e74 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.25, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.058387420812114225 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.21626712849026222, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..91564e77650c3faa5a2a365350f92248b446c56d --- 
/dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.14285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0471841613625583 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.13505848989719957, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3a83b1ea1be1d1d0258a129a4470e30ffef44a23 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.10714285714285714, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0417053005800816 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.09963985594237694, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ff6de33120e61ff171e33ada8bdb1ca90a323450 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.14285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.04718416136255829 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.14384662956091526, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81bea282877afac8b86643429d2e44b5f329741c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644648 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.4046085858585858, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3a699718731543db19ce96e671e6bb9490c27707 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e88f76554467c088a65c2d34adbf7d58550a5c7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2794380587484036, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..42ae6fb707820c4dba9d3d9daaaada3ced01ae74 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2596413657577991, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1b88410e104b0ce0d982a47c32cd337ce299dc28 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3173681664247702, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..26446917e07d60451bd435242f07385387c75ab1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + 
"task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3143399810066477, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6692ba0a30e7a4175a9d18ffab74e3ac7a3fab39 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..83ca63fac4192a00e2147d470be7950ab067812c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c1f3969b365857986f62b93a81859f8fee89a7ad --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + 
"prompt_name": "best_option", + "acc": 0.63, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04852365870939099 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04960449637488584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b6bb6cba5d541c24fc896bf221c669d4983bb0f4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.62, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048783173121456316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55d5d19eaf6b5743c2fc3582d86f7cdf900ed8da --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.62, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04878317312145632 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..8d5322d5bb504ce6ecc2fd1e7142a09fb5082245 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..96226bb9dec9b0828338a694a0de35a460063506 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1f0ebecadddadd338529ef3c12fa87e893409a20 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a4b1746f1843d78be9ecd4dd55bab7cc64e7ec31 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7314816cbcb9b8231f2d0da047be29e3816e3515 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..04983d7ece57b114095969894059dbe7e0c6fbf5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5062f4a16a52fc33126fe9e05339915850618e84 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..382728a1e50f6037b503fccb32fc661f4ddb8ddf --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9447d416a2ea2920b284202ea0dce82ed50a0546 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.41, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bc35ff84c833c49629a2f005d33513a9dbb77c13 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fbed1fd7fb99d2dede28dd7f774523a5fd28a42f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b7868a548e29d7355f06fd2d9138df1bb0e4550 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", 
+ "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c91e2fd8ecc4c44112194caec875543a1cc367f8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1186eafa5e86bd1853ff007aa0617933b670aeec --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20dbaf6aea792510994d669d93b9786d6754bdc2 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b817112e86e3b57df9475bcd8f12b3c4d6416890 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67259c4c3d0ca935b59c63344ce682bd270436bb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of 
file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8754d91a2f2808676f2c2003e086d76850893c4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b035066a020b74776242506c3833a7ffb8749f2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..45c5e53209bb7c382839be166e72aaf666a4e19b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..488a813d924076ede9f3d36e0109fd0d578c2502 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aeadf04c2b902aa6676ddfc45290e873e770ac1e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c3ea6446a88bb7f827a65f1c54a09d00b98ba68a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.44, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bb7717e8fd2da9f2694e6841246fed6810f9bf79 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..33486f22ef1d46acdfeb05a14fe194fa2a30d88e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..91a8ccce4853c8696743bf5a17ce4396b4f02043 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 2.7427887623503127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05495550525224015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.18437430718819275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016081919163123087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.3251983558888638, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0020138546152268445 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.22723646490231306, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015666486705066473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.056050363270124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0008590677097817941 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.10050481850962295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014187911826345318 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.06946399430025461, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009869119968554082 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.1456094405733479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012316335261671133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.26113451142460736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0017752376482140586 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.18067137808834505, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012648308698162281 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.15934149267059367, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014098113104567104 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.2813363867963323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0018136711023303616 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.19654380094534718, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013999134084878718 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0a24c098942f4e371ec8d2be3163e5ce284fef2d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 5.501660579699312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.059490555517424666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3112584786759358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001907448658666166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5491290672753819, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002745962690555782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.3871051829172928, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001953296778601808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.12796428764311324, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012052147710974312 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2325200457741587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002130744747415604 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1605728054943973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001412706883124633 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.21648135721974734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012587276826098216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.38988924141191994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002309000377521498 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.27100478266727807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013681493938654708 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.25574830221838724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 
0.0017155621473719167 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.45162452980561607, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002604535523838573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.31806770177523574, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018173275489533744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..32826d8064237b00f9f06a05e00a438958232a80 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.324394303872813, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10867005579792703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.33094397240746043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018564220392525137 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5631788704398346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002636056309828146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4051242312515321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018087304910981952 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14418060422905016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012689215486412471 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25265308621087024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021790130347415517 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.177921888015793, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014368818296271887 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2297030175226026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001353128997875971 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.39650377304869283, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, 
+ "subset": null, + "rougeL_recall_stderr": 0.0023226005472347976 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2822165267320512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013886080174190089 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.278170389520455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017013879950176962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.47403743423884753, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025306580301756973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.34056029442652097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017186859749549793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8bcc72a33fa0e9ccb73abc6ed1f72592d19b759c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.023795509429225, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1045133219407453 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3281077630214524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018892313549911306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5565110906097839, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002632763132900043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.40121542494211165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001826481389337569 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14595032198106483, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001332168833443709 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25433155482778347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022419507977446606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1797184980766115, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014978605293431552 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.23006830678751594, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001412084408622655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.394784182200868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023087386618001874 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28212439353399954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014319999333914978 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2784043771811358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017584082669023667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4725249414865908, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002566203128742103 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3404036668863611, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017688250393426252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7042cdb204b1a4fd930cba2866d00a80ecb8146c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.241056271417146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06584261856760189 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.32573683579768875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001877843583143733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.548887629470057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026299920384990267 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.3975971671274264, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001836450632172448 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"coherent_text", + "rouge2_precision": 0.14525291203480517, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013572915734951672 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2508152581679459, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002279211546178668 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.17842808573274627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015395035598836696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22958828009634885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014143451756674962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.391229276944502, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002334722588500979 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2810484863937827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014637888600224375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2784229970473634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017664712981204373 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4693077141628463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002584144223809568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3398221063833657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017989152716216044 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4d20462e94ff087052853971a9e749c8626ea0c1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.2465196884878145, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07468916464119668 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3255815725397693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019032883471518422 + }, 
+ { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.546540731713107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025733832831453347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.3966890040503588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001815803597852792 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14519398169659237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013353231399095156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.24987277938742591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022140552438889268 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.17801029501851723, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014940190331615374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2301383140529255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001418389584367908 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.39137629197532353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023042337566443194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28146458367847593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014472139059049296 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2791614306561805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017928244822471586 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4685579869769028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002529144473225649 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.34001582427002575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017807422041094688 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..73864131450fa31320581801e3440c032a7db380 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 0.8753334130290518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0348620294223961 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.12180299903557583, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0011144789606423104 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.2607151270005577, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.001833065327541976 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.16280348551957885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001309447781869726 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.01683964413978754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0005356789210692077 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.0356320865210295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0011364294225061037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.022387020564367744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0006945772086659809 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.1047466751047272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0008761170348141068 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.226582446043322, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0015936684448292061 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.140452983092435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0010520758669989665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.10382518824250553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0009416399281524375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.2242636517948043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0017031410291911053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.13916001785403956, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011384861820446248 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a50d6bb1b65c5c8391169d75b4e013fc4bdf613d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 5.721366546161991, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.059879890672636354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.29934120585636687, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00167831396994261 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5469683642207921, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025082833435656686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.37872230003166957, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017386604468687269 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.12496074032642647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00112594043655694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23529069358454732, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021079015900675175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.15944824307809596, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013532088062737225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2036609095228398, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011853406607828087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.37837767528899835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022449611252738045 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2590184966950011, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013377464931078383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.24947318226454393, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, 
+ "rougeLsum_precision_stderr": 0.0015365533884932646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4560568485237131, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024137023011478282 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.31560678490975924, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016501630567473267 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ce2a73120b128db21cbfb289c56836121bea82e5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.535547229648224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07998247245957349 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3043393031178695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016987372631724537 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.564230482567467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024769176130864756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.38702008716988656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017370872647159401 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1329353387981719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011973152327962053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2547108199869813, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002222655965723188 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17045094780052067, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014026421153863727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21068603957785997, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001216562219549965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rougeL_recall": 0.3978790249309968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002297088221037071 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2694273591107714, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013460707667909964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2565183996017009, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015725570964798953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.47610471055377285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024377798420915993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3262488238812611, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016716617044544048 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4a171f9e10326be4099c4acddd00f1894ffc43d5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.816658602745218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04583016968843249 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30377319756142107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016392467233435767 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5686273285167529, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002510804564764257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3878861536787146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017017932936630189 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13391657548983904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011760375900396684 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.25896717520218415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_recall_stderr": 0.0022974650442523993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1725725568885113, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014322789151160747 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2108908717196471, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011962828612559094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4006206317458079, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002304730683692073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2705133936819741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001354231331623015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25737446644019807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015454274177506696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4820507894942288, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002515972922796953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3286981074887459, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016837415400218188 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..13b830ccd5597aecdab1328d40166372c44f94be --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.979957732229111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0659505224006796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30391913123253583, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016517877163181051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5698197681808488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024983427107223084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rouge1_fmeasure": 0.3884017275288488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017060422782565476 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1344501382831743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011696066678081112 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.25994005857695457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002276014994019727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1732811817482548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014176809818272833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21017112712931915, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011880019144919782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4004756665207533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023172780379295812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2699817385866637, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013509422774341565 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2577902147906641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015458181802897012 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.48387236758259095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025093856044876017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3295357685903064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016750203066771487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6ea1e9adadd432758280fa042ab485f24782090 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.9467832258458815, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.0713556533267485 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3040190125185005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016439994643967335 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5691357371244624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002460959843459368 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.38816794389969306, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001684513494480907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1346910777243502, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011732332291231678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2605212984297539, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002269174033571433 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17347865411768018, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014147739784681232 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21001184801420653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011889650722311973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.39991518737149667, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002307549465530525 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26961120657660587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00134699971864286 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2587096095734276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015374119276061612 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4845073574529188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002456173501378085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.33036410242618386, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016552917619089278 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..df732451c8735277d130b593710ce7f7fcb421b8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020684356751050448 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.0008334849399141362, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0001358503424917359 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.0015622118728288864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00025395262482497686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0020684356751050448 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.0008334849399141362, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0001358503424917359 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.0015622118728288864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00025395262482497686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0020684356751050448 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.0008334849399141362, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 
0.0001358503424917359 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.0015622118728288864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00025395262482497686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3fb638c8fe45dbf13fb992696e499dbfcb9ad0fa --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.5565042233022994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06597968521354808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.08593898665872673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003701585604317789 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.0887838872066671, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003666707231082007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.07140698262139507, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0028371334110872675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.024892824112996058, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011425344587644847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0371764329986735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017241156866971928 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.028616212681722628, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012853116962859368 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.06606011638251028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0032423797227769642 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.06093221366523217, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002567923224143668 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.048987083153452016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0019444092164217727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.07579156901424648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0034585652796348012 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.07390893983809439, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003072194012262202 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.059745065640798484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023782780189300344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef4a8de9118f92c64a1972beceb066933547117b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 4.952955446585565, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14703261112875554 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.1775561426910298, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004354374307061146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.2112636167055307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004916752832733977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.16985011321499935, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003824921158615819 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.061741721519856914, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016305676991503655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rouge2_recall": 0.09131710409720983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00243568525413484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.07112804192230661, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018368523862338794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.13080469299969233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003698218373012542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.14612619148208336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003515094714410421 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.11698574747654279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002667973913528021 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.15254844160782838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0040016749319373825 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.175513921363672, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0041508619445660075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.14129352369422737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032206138029846256 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a3851249a526c067059944aec12941651539519 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 8.035933183438738, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16111218297605656 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.23042023299400893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 
0.004205272460982702 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.2950691553359754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.005135544471290643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.23676635522395745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003969984523240725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.08901699667028115, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0017692731972765087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.13239068456120945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002720773239274303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.10309288089148716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020209463742805306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.16676542398506278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0035028714409443473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.20592751042876095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037381203986915166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.16447844130188313, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0028207459523148914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.19511953619785463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003830263451296712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.24484238681845968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004384904615208511 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.19647057857599984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00337544239864232 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..166b972ff0dd16526a628a85e05a451fb19e9674 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.838539717642748, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1542199121945329 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.27122631402577446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004019032209864867 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3543225457053986, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004967042595033049 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.2846908650582164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003829987590059196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.10895746152226353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0017929065525348804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.16028390936179449, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0027130809496109776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.1257055751954671, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002028879799449096 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.19623445722856525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0033607871400964796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2494589917112239, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0036616398264334195 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.19965383022630442, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027610441762478733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.22931849058794615, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003672760310574185 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2951186979389869, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004285864671929924 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.23705688169423808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032905873586870126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc560d4e8f73537e3bcf3a5abad56dede35000c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.887554411273673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07716295936766063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.29889933654446116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0037338421535844956 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3963217358852519, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004651910319820262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.3191894028386597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003572075569447729 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.12313981096867667, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001764254452955877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1798873724588336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002654894317462101 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.14176701175164574, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019854849759513705 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.21599407087504968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031436009861130674 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rougeL_recall": 0.28082966722369046, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00350910000314051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.2252484886653337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002627200495231524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.2524224681188342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0034299667817147893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.33082396406528924, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004058856021277282 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.26634567471821236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031018728642803 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56d0dedcd1ed121b8235ffd1c305aed18ad45525 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.3868925634521596, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07415023515490869 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.12781588961197554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016392738386146906 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.26686211195039367, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030010672113457175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.16968795558812888, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020520636735140885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.04754240795799523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001014281868832704 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rouge2_recall": 0.09732440743600192, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002035877095135508 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.06289750165250287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013193385142581298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.11843660338175163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014270824435811313 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.24837983245494402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026307193382184542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.15745602394794062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017868944105710683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.11138415788379845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014602010171592388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.23362654109870873, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027185132812130603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.14802696104042146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018361037263776232 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7d243622640b268a7a53a159b3c4f3b272aa32b0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.656637998489558, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06138640019754124 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.22448431958439818, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0015774519438982097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.475937695786901, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002823854240730373 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.29857293628895076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017883517178428313 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.09278798103068853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010182785337770847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2039364568886483, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021426616098330804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12449829406834531, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012625501065363337 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.18571244490415678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011675141052374309 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3980265502028155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023515010622215645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2478615356783164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013407338809586194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.18244634491896064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001437571751990845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3876575760439037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002699802672015727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.24278851384210426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016843478489220692 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4da23fa6cc7f14e6378d00a94042bd74683bc66a --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.856901680068561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08124041054587171 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2401800477102809, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014473576335462571 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.5163816757468545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027490552702758896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.32161895837988735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00168146427395452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10780300493629737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010069472486850257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2413921319510364, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002257493837016372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.14591205568832014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001294685527449689 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.20140298531801745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011006514010170898 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.43694912141494435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002368493512084033 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2704562548681343, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013038206676934103 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.19780238487026972, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013643352859564666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.42666941865224894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027363306527661384 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2651360409168281, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00165326573625161 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..27910da617f14dfdd93a4ab406fc9751b23d0368 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.067980420028392, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07674666618798165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.24119330473410452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001436265091489619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.520447176584629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027841731171524635 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3234918087831591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016866852215600267 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1098856604345649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010230282133462645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24765707981063853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002344687079258296 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.14908085018598377, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001325863965517803 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2028877822846739, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00110597560698619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.44168375010036015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024210245845319865 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2728226991547497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001319038570084664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.19995751457276323, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013597786817193271 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4331317668864426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002771288232621466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2684669151387224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016553181450139245 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8ba8e90298043de44fe37af04a720f8d4b029a15 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.301331414189049, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07282835065460666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.24454894011699824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014458985189821688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.5282893998840518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027239476703194553 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3281997540286874, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016842914784987548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.11241096101927037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010296837842975505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.25292679532334217, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023135968641909677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.15250785714191883, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013289295083122636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.20493346554385022, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011217276272122712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.44669665680269593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023872195322647088 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.27576809325399554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001332292317195587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.20253099841039804, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013836573195200987 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4389119231936933, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027715255759945875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2720277049182356, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016797663432208906 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..96df9356b975f0833e6b88a5d7814a52dfd449bd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.391640493190723, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07696132388022625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.248561550528162, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014842009296995168 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.5361907676530253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026923692664488426 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.33316361794611193, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016721318654132375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.11498051543992278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.0010486619107346665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2578870983510302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022815014259324113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.15567663749325128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001316099001704392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2074081838133947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011575590389617165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.45169489347455444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023905395426844995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.27881193769275936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013297414200118412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.20657408552804282, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014132706167399262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4467577031168348, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027053770792765386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.27704641367351746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001654188879883823 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d399d122132df44e85fca086a5a0535a3c60cb77 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 2.0677053326576096, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06271134039957105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.1095707231662833, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021591140734212957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 
0.20802103992679305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003979629734649372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.1395687520228349, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026646833399520944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.043273009183308724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0009648056549439121 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.084551914403829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019148621747591432 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.05547805696954945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001212976335572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.08666935739550846, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016576469738929047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.16514211361615944, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0031559470820256225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.11040718684668492, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002066401071200146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.09469111770984258, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018586644683897386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.17996098817894363, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003488975907287554 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.12059646907618253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023117453399075457 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8afce024b9b746b4ac6eb732ca8c39217f8d7fbe --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.776339014367035, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.08648488550482673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3689526387645257, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021333449754014004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5958464926870726, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002476964048840286 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4455306771983514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00201463745384298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16443602410248034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001373572653978035 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2723125733003977, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002120174112140588 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19999728868621525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015176461844321897 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25666663991049066, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00146996338508828 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.423240490787575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023419654403491713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.31215943868340223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015312730015625377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3062178505891934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001925082430561453 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4957583957629278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024755678177244595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.37009049888168155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019204542565398291 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_2.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..14440908f1893e2a0e5cdf37e3f7bb666fa6d205 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.956869478198272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08706326939132922 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.35860576976732106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021204385851447834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5877143657823972, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025365074511194768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4357140603298011, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020471026571017327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.1610875918591629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013996599831001433 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2714598427748779, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022353847705593125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19727207437417654, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015716203234345902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25147400899840344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014714723008184948 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.42019871172862305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023315626390538035 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.30752457963236757, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015398007381628844 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.30198591705675015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001941444722568633 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.49589524688238196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002550990119421592 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3671539275155354, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019677103053781135 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..32486f2db166c34499b9379c26c3772799982cf7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 7.009191295391601, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07880823868019797 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3545298481996187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002080164442210043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5832652468397247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025434678918276176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.43148059444634873, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020169750963498836 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16017594753225423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014169660528143824 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2708031087813346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022903011042423914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.1964238350803286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015986177875609523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.24852387761819955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001485017547336716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4157032927811326, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002345334628732105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.30412703782905015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015615998587158336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.299053998385982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019322475227694653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4926202062191711, + "dataset_path": "e2e_nlg_cleaned", 
+ "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002581909625016066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3640982827070694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019717220049021546 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..78cfde5c73a61be1b48b3514563d893f7de2c4a1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 7.000198019935731, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06913407290704363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.35024781130506905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020285351396598593 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5776691917985523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025294416589362444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.42692314967926737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001988659000618427 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15717110741288884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013996429298285594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.266320618349341, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002283184726774742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19301601907405783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015901538810711403 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.24489748566928496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014705299982066997 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.41079255911678236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023718685348987417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3001609211346324, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.0015693837295482464 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.29636957517330675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019035502093200786 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.48931428544569894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025795333772663286 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3613177972378212, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019560581480454974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9ec1022b550dc32010f59c2e04a8ef9b2a53f921 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.9767120337505135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07800248993311952 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3507462370081385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002007947032661163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5785590837053719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024745176796048766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4274281275013867, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019410392555796734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15691758500190972, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014061231147467989 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2657298946460687, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002263553464788525 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.1925812091012645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015855872884769345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.24541799797892744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014553330784770901 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "text", + "rougeL_recall": 0.41122561852623524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002317891502947824 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3006040774381334, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001530792855406818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2960131787945338, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00188423815157528 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4883301232467587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002513006325165267 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3606676599309041, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019100284921251059 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..098e2ff99ee22c7e55aec8ce0a40327c68420d39 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.09769813350875045, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015529769050519056 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.24854008316454318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003822284525903826 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.13852188755221997, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0021293763458264075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01493502693762796, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000687168087320552 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03970637215388994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018488604775761157 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.021404604329843858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge2_fmeasure_stderr": 0.000978061112594587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07997070021861699, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001165505436023522 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.204221486314787, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029072542956042127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11349412170506364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015962350151948002 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.07895320097795921, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012925709790883738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.20223663136840953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032525187208582097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11211976668519616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017813295955859596 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7256315560298398, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.054268166719320964 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1283f0974d5e7f171394edc1ed33c3bc441eab37 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11318588173143941, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016141770639632976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.279979949784874, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037134168551931756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15918378891591406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0021744946842033627 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.013696212725928195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006678764989725612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03434190326540072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016508020659010145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.019363039161219217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009344774229308126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07914360184018943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00103668752345318 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.19748984092630087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002485099284856001 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11153061433705051, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001401528481365617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09056691007854044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012802454821530644 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22609222107906188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0030831336083997113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1276700935379806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001741533787976053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7100208650541717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06307666643323927 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed492088819db19699379dd6aa7274fc747c054f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 
0.1160701827729658, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017439664665625302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2860944708906913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004085639279103821 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1629955337480239, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023615726845261148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.017447949224103376, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008032811020562298 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.04485569053060954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002114380505243146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.024834458548132245, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011433201739787905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08456270084600391, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011815898217163434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.21064825146724378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002957758437750362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11904854566150291, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016169606046750717 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09152186527385546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013729069191967156 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22753083497124593, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0033757963655885394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12880725110187866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018796612939592603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.9602615670933534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06975113829339996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dc63cea0d1dff4fe52ab2b5b2fc46dc878916fcc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.1198315859407343, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021313635031272324 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2845973554839675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004807019416971591 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16536255544133652, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002782591039843851 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.021995282393962437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009659870502554929 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.05477147588190337, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024254506123566537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0308541164205269, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013433559082160224 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0891431800173679, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001518317618261569 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.2133363419019677, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035492022408266455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12329381135423298, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002007770555031566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09477716470429574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017017716066415637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22706163311880304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003999321665699719 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.13115287292645436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_fmeasure_stderr": 0.0022631518633725732 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.213942331424008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06983125587361878 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6b09c4dd5de7e4dce1f6bb7820e330c4cd5e96fb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.03617972980376136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022835108300555376 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.06939836439110722, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004267803204475849 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.044055315863565106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026368956007959852 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.006768492713667434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006816043308723117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.015027042518545813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014854463791600376 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.008953373288301222, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008618162172086633 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.027808549592201456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017616362314983645 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.05275279200412055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032412133339056147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.033494270940581335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019921856030734055 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.029343763312784644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018717600410582465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.05584466177175486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034859449912314064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.035448110651920155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021385233474543013 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.6027229140523703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11533577622824943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..66ffffa56684109014970b765b6d1c112b194f4a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.004288164665523156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019144304746682786 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.0001731252011799053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 7.864942490494626e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.00033237715251661107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00015078706434605712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.004288164665523156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 
0.0019144304746682786 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0001731252011799053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 7.864942490494626e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.00033237715251661107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00015078706434605712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.004288164665523156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019144304746682786 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.0001731252011799053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 7.864942490494626e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.00033237715251661107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00015078706434605712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2b248a61b8f1e0b80bf9b15ec83384ab64fd96f1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.13056123327549546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00211653085075359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.30744582623088057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004345708260459012 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.17862356070756422, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025190444667790237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.031215859261324586, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011455957695466357 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0753592824533711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025928926122515596 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04267694660641669, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014272559106211188 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.10607783393120825, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016604953804639634 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2510352242981599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034186066493379894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.14532363973135456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001945356869811387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.10275001625696571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017792008566591565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.243889829015575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038153653094762778 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.14077407394066704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002139215859454691 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.5074370696814168, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06306583053985478 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0165f594b6a57c6eeb76c4eda546926e4b5ff382 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.137488328967968, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001854761861679906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3409552422180664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004347354842228533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19355866105950706, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024904888130049163 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.032790830434572774, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00109127458636922 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0841464589274841, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028228926118902585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.046583919666909064, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015343154616235192 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.10887517550153661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014322908116765737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.27169151726551716, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003535082608448959 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15350551415370856, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019452968584178893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.10904170474313231, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015480655523710234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.27225025414903886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003786898530655058 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15376034054280813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021049152368113177 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.9294507192370047, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06927174390558496 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ab9a43904b228fa9737e3df20a59e76939b62cfc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14622869832638033, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001840271628617099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.36110939972212, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004317171977659118 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20561129767344294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00245643585677957 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03579409350172391, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011122812276210662 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09230985292859638, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002925197294466604 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05089418045053158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015644827567191525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11460227671669204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013793463537122247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.28536047123610525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035023837864113175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1614717754951783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018726384968348116 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11587559985347855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015497939952997533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.28811845556237264, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003818249039685 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.163228640222356, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002103524354064436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.0334951949441202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0861569471135935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0af278e01da981a5c896cf160c5c2b7d6d0afc56 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14592947816165613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021611659038272 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3443812486249235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004671315297787991 + }, 
+ { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20045136953668377, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002718747872622448 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03602991518922341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011524705369005208 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08958291369771845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028952310760534993 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05035265573315919, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015735834442296916 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.1140823083670884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016349867638249449 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2720512676890386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037892985318394535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1571872470032892, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020963391106268445 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11426494976351477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017631615041811524 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2716320363223784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004008514578347172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15729367100488104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002267612514707712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.9326408991617487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05911662841116225 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4eaadcd041094c79571c1c1d2465bcacafe604b2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.04775403576214989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.0029942578637535498 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.08769716230460457, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004942918692076916 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0553051713195741, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030382631257604253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.010407613906275788, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008600375403691117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.023456621445988514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001909353475291992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.013792609834707413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011026959034726492 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.03800524902986652, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0025635931438879765 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.06826516249123325, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003864253776835998 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.042863987037903804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023336709067174025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.03874591139206328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002614949735399864 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.06947543218015861, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003996515839046432 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.043714690958694166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002416760693299736 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.9888155443961298, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.16386874187258033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..af9c722cda538a78f90f9a3c3f747c20e72d65e1 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.003193068638626207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009043418737167503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0025114151662593677, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007268593279769246 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.002720852866414494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007709104718885456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.00047578207955566436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00021103651915959466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.00041802798878270574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00018654044401303826 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0004211068403029026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00018200691938578806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0021986780281657314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006286463155410547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.001725510678386032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005031104040557216 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0018716501706524194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005361379726101486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.002403627074679706, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006873477461862152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0018880944258754403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005483050471105171 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0020513446899695803, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005879898596125877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.9891071758683426e-44, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 9.981986929231053e-37 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40dbe2a913453b5856df1ff18db24181985c4017 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14714404834539455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018923895784899547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.35332808198249066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004277854613117222 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.20448305051370513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024499517599214865 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03290270141635618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010989269503369368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08287973270989689, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002833032880998302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04639360161894793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015361212037212923 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10929554255441323, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001387362394565063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26469649902339626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033616815736178403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1522143356903949, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018364801701113477 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11672705515751206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001546854617689576 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.28315180080173324, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037966298013545237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1627172387554665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020740152357133783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8973746751821576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07374608873039461 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ef6e2385def93257392d36afcc74e8b301a6168 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1344416388362992, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019127327308320389 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.33053933219336484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00435936977132835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18885391905418497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025584792794476653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.02919021495874484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001083579043412117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07395642387380598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027833547930418757 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.041352212313184845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015256985849036882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10283551822207533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014281321188522025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25511273674645885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034307162021354593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1447818747001178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001931305342910804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10637101855903888, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015675521812090556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26365396988763345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003725007857599503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 
0.14973596762001254, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002121716066142779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.6712614198546765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08029090892350033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c544bb0aef00fd0b7f0fe2a9de26adaccb203c0a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14045160525988198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018675726895835078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3448073202698317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004206053313996407 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19729124660372196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024865996635105967 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03250319865239163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001104582027493848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08285013633430294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028679116547561338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04614704751240165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001558612721206191 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1076421486644076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00138815532226034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2669756747819945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033509483772327935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15159285154298055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018751369925771642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11165266260752228, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015420215686911798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2762348157117063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036302813408504828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1571758204423751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002082270276853359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7895958187086474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04787428909067152 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f087b27de9cbef69bead4963663c017989d5a1bc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1397482550494329, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021614980521941947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3314206756572503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0048258842534713575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19247676178691825, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027783344646224338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.032034815897723355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011234176352706835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08021224571691461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00290937059770342 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04492583126972709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015630404482977962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10664386827077653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016219261205998713 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25469242016766824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.003747468811937111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14710040508558128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020844906061187624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11038994072863482, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001775109847716677 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2637144833686383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004087246305150146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1523907734490472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023066662456799336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.799886255448494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06241605343512123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fe35a49e7a5c816f87cffb64105d5c095efd9eb8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04293551268263482, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025300915700631186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08251909123638941, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0047552295627311435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.052109895215743926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029325961974541494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.008902116582691397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007882814434287245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.020355209333257882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018399757489791825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01203821663060757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001058756941479336 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03291018535700006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001988611171780747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06245660895956642, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036021201516783473 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.039428572953528446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002210624192813922 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03518063964656488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021155739741956746 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0671088243853192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003920638757312349 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0423468971865807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002398760878941016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.912241430159217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1508779538575514 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..88046878533f42e9f3345a467e4923023eebffcb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002739968547331711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008278012212952025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0022134599272398445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006273422111807437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0023684978392920267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.000684399718764436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.000376696230153845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00016337393027254123 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rouge2_recall": 0.00030314025597044463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00012776302473939982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003256051958251534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00013752393773970521 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0021741866307845633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006503820290684123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0018275205470825698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005304151694392347 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0019272015023736237, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005653230599258551 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002299258100195655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006916877058747434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0019069310038515171, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005514157386454834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0020216316744952494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005921597049542807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.097815638153428e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 5.018895149426352e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bc55ef87a080174dbe2d53d224b79c142491ff28 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.15006207382743458, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001987657336844973 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3454637430656072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004018757653016808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20343816500554804, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0022810928631204065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.027682031685392947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012572133596850813 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0647058637194844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023098436259221776 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.037047223818116884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012769054344671355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10668249292401177, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0015581101621468096 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2459446989428375, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029903868489755438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14427043435851508, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001641927464053827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11765973495308749, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017048462642842042 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.27281684754355495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034923333313323017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15967622365586978, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019165619775066089 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.3629401242051673, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07163139222356285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..93733eb0eacfaee7a0d6cd558b259dc413a859e4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1491508085903834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018269519621024542 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3664832216419509, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042114762385522956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20959848507445628, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002442544487345855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03343924101857058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001064909350142866 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08498394458221385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027112134192700124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04746052242779793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001497983093227225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11001044604391327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013469472913403972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.27255320287378415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003314920441365678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15491949613982683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018314818566152655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11751544431488214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015368040243137976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2910075794259318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037311617285632286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16551777637641962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002095777574542374 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.8090419671658906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06243960286799727 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..16c72643fbecb517b19744d346f58968c24ab5a5 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.15135142269847585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018503020885729993 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.36992385810715606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00428697893431127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.21227676754700187, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024708186366684537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03621955126854001, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010923924587675196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.09215185931049906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028353381841421765 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05137980859666271, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015384305388393403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1140477375940746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001355907302607709 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2809365376305075, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033434191813504033 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.16024351750916332, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001828916419211741 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11966566066238304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015275000983393555 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2949558808752265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037725507061813924 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.1682190350645718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020795057799479827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9591273158820803, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.044543479812057196 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, 
+ "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..15519b5f677a02ca9b1a97da4b74ac5b42541608 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1520556889414388, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002189742662234435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3562962414780344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004749166123906837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20846450018241194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027571228316079694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03687572904819467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011771782163178155 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0901761315164544, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028796580054493573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05134777699396957, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001609365371866251 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11417177031962449, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016577260631727522 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26888326992906886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037215896036639656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15665407959636418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002098634389081189 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.12014388751084557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017911422977108812 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.28401937552529777, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004092634223313844 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16513176347393482, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023093147777262708 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.0596089820557593, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08930377223048487 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..526d076414f81eb865422a2423f29ed35a2f7411 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.047023957094332526, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0027799550540700086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.09027695540385026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0049774928615611445 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.057092747254164154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003080646206696148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.00908866042073288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007468809595545078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.020385013066777098, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016479365749736059 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.012186294567403028, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.000974923421275362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.03498091370776939, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022036086527599703 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.06597700098824039, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003697132762313373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.041702960752974014, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022604615815070856 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.037710892572660165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023299806643745616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.071991225804228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0040607831831936175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0453763496307077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rougeLsum_fmeasure_stderr": 0.0024814034302527927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.9295235714400739, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.17952600712489128 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..84f2d9b4fc343bbfd7ffb873d82732df1a98b1ec --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0029488555074161714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007930006513468652 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0025816173879792048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007065729972344169 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0026690932382273276, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007150299022018968 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.00030619737889438225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00012483905412694334 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.00024479974951673066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00010164474700168012 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.00026998206537521717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00011079170761441466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.0022740359120988827, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006037958962838299 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0020309527255223335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005682477636303094 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0020707423522561583, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005514160671606608 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0020831557924846085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005457471442594348 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0018757125016714748, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005287065797899205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0019001191955137153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005013736076589153 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.3118412132647612e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 5.142891951955452e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c42f09e0c5a6b1c0df3cb0bb246d4add416c42d3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.14761415510887813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018460271861890638 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.35644215533066814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004313928291972872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.2056827728723364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002442623835139831 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.03437672503730759, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011443426120385147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08689221376038479, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029525081998479433 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.048478969239621084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016015359697998386 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.11084459960593879, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014045626956033276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2695965057726157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003458083185065001 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1546552506639713, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018693227706026462 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.11575748816387031, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015363157261191857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2824907234086633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003845732411905414 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16171184786817824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020719541841424964 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.9932437983297846, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05005508121283258 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c0c4a7594a4f22b2712e21f6c574815720b0b9b5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1290438512870586, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001908907445184339 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.31582870770710825, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00430942059053028 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18107269784715221, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025610519024537653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.026404127167916132, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010432396448414494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06561202026843391, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002586868170319681 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.03724797091171915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 
0.0014590126800901643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.09933917048841584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001409747027873095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2451300217584245, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032869960015931565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.13967411856886675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019003725867380186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10225509404962359, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001564036497709352 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.25220739881743365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003637572454499514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1437650081283148, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002115707145967448 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.483994717023367, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0583433944934504 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3e38f1f08981313b0e575d270f79e5162fbb47fb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.13513852151062536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019145000289811317 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3276171253130558, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004253252892642702 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18909993339896647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025443533012134306 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.029483119025250706, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge2_precision_stderr": 0.0010991930679911828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07320552894804701, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027333394649254345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04154990869291492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001535468901684112 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10394340309946604, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014280355461601787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.25410078113161033, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033207296201257777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14575285141182545, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019148076940627124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10579391735030412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015362690884521812 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.25908485264591374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036064198067757756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1484681379100036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020754922487701485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.6369889601746683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08510169629278132 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b60550d02142796513fad95b3579f286092dede6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.13077969013065976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002174976181761563 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3051790928330733, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004806410522034264 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.17983877354806166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028344653807296284 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.028769652888890242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001066351955086345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07052778069631396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027420302852526044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.040167379424322004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014918819374576107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.1015239172914668, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001644583430262152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.23886024900651934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037908298592866764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1399605239628858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002175089499821927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10201260281036947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017070040757652396 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.24078075446433067, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003985094366002531 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14082010524518954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022782275382841146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.6288699845696568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07351917145628349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0102df2bac7d07486871122f88bdfdc8d735e9a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.03885731681664587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024465188352010866 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.072218223256745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004400630629119156 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.04710206848660278, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002827851220284823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.008293369893348976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007674824532485735 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.017409607367553894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016283584197287304 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.010750703825901103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009638910126157537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.029640111134550683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018709888056480106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.05506280693983692, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003373140482326596 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.03580387652237057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002145060472538106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.031047920466470008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019660483285081367 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.05761387256991218, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003550284822808244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.037509929346318714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022590153287918896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.7016713676020814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12854976408337526 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..71bcaa83b23f7a3b1c352a56f827ed6e3cf1612f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.0032875929102344197, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0012833920426189144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0003777789691102385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0001360677657980004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0006709563634213615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00024206325506310295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.0030017152658662095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011845885037617687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0003434736517860532, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00012248298015115857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.000609696868199602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0002177019049863885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0030017152658662095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0011845885037617687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0003434736517860532, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00012248298015115857 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.000609696868199602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0002177019049863885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2f029f2ec6bc20e2281f484519675c631250a3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 8.293650196551072, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.3291933211979279 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.20985881282206706, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005678392457961964 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6760010324649197, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007109960469106136 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.26518149573262756, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00598353213568122 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.1581319955963504, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005136391219179097 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5233448090942575, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008115996413199544 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.20168519353681674, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.005583925104466641 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.2034802697348527, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005608894909532193 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6584424374867024, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007242236248876609 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2574325851884139, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.005932612011453031 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.20469461558102828, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005642747231844393 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6591156896691646, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0072783852307659385 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2586740738981541, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005969487363499531 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ecd669c8280edecc361e559998a95c45c7f70984 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.9708485559775974, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.34463867046580154 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.16692070561892322, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004836046744236672 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6789384494865478, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006554803195463939 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.22252428799090185, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.005166919209222038 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.12325642385963735, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.004315397965867194 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5132917664492825, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008124723992810777 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.16670313208229318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.004842857551789808 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.1603231731890274, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004760330118066573 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 
0.6553730557520384, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0068118148422351065 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.21392670473431238, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.005120862001741173 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.16247005134061046, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004797762841990119 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6612423664976972, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0068194387242296725 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2165274990499404, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005151253631478041 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..732d970868aeb3a60d084bda30778556d7ec802a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.9957771713059085, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.3238947499660514 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.16352590100073616, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0051809522811463865 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6961503957283254, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006309156641782605 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.21337595032188442, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.005144022732419237 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.12411789818339344, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.004723425320608877 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5262901845589097, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008088441034716209 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.1635128839739126, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 
0.004978071911619273 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.15682105604362034, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005078191461050391 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6706778746147497, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006640724268201555 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2054038225613471, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.005126617953639098 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.15931759159992323, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005115274381138 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.680473606884987, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006549559772955488 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.20828419170971285, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005133723055482889 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fbcfba438fdf6e3c86e412006fcf04344612bde9 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.990071211182333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2967253826874956 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.15871004605077652, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00515504545306226 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7034466308669174, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006207428258406954 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.20922245282570967, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0051976113847955795 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.12184134221668164, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.004666177846152433 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.535375224176284, 
+ "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008001057333235266 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.16228271943343794, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.004992233069443837 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.15189168360996885, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00504068577523969 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6780782501114493, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006547450748075444 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.20121070269350022, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.005165478141129265 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.1543756601976579, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005083274387419325 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.688408073065803, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006448637747839622 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.20417200076191475, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0051750936785322785 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..603b81c9b75e0078af06cac84b60877470897660 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 7.240744535616564, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2586062831369915 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.17254149538622868, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005654525592751173 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7169800690761489, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0060374304726635245 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.2205327621041109, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.005523926029284187 + }, + { + 
"task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.13569869881237576, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005133251060824273 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5506147150470818, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007924999642538401 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.17392590467314775, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.005274573095809241 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.16613299736339213, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005531569846691979 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6938260684470078, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006371003512990666 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.21292785321625532, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0054688898695651805 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.16847957385780346, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0055567140187079665 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7040155922937625, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006248279598171387 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2160261905713099, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005483109104579196 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7899d9a50ce131adf4451fc47eadd378ded2d7ea --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 7.501952156457839, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.39271130030770846 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.1907119925251693, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006242576921723348 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7229868834154785, + "dataset_path": 
"piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.005996824073230608 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.23639707996315365, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006047125566678411 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.15227920059782085, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005751542723918666 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5588359253387518, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007903814632285042 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.18898088840112187, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0058122997504793505 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.1847082573134137, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00614731549204923 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7001578119554788, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006345471739520215 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.2293218614443035, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006023695302966697 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.1869491193721871, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0061683914136742134 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7106290738998889, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0062144876381464535 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.2322126006841029, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0060236108326316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..031f69424f717b986ea186fece9df2790a54a9c1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c38126024f76ed3baff4c2a30b1124d09bb591b6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..105b0ab64e6bbd6c49f8cdad084f0090e60a31eb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663061261117748 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663061261117748 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline 
at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0df24e5258acd6fcd5421f8c26dddc8379bdc0ae --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5016322089227421, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665762007194866 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5016322089227421, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665762007194866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0f192977e21687455e249e8b3b51a814d872d83b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738878 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738878 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f6d84142663dd155b8b90340a14a95b8d5c0ab1b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899182 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 
0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab7c8a59d762e2bcdd222202dfd005ca4d75460 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1743540081034935, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010323609186140817 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.020414014144919227, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005848929509747042 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.22265270386663025, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004060591384362697 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.035178147983051675, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008727304917432003 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003228525003290761, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00016737474198048977 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.0383496959158395, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019867052472860103 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005612668710216912, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002723517397472548 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018559243344714177, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00048315200547689713 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.20821746365510307, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003757747342281253 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03213626849465759, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007335055493238853 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016991539384663735, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0004971662800825557 + }, + { + "task_name": "piqa", + 
"prompt_name": "no prompt needed", + "rougeLsum_recall": 0.19247194188686473, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037031262038145878 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.029311011598519118, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007339751223464163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4f6a6cf9f3f3ee9ce92ef73bb7bfa0c57bca95ce --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.174213094733269, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.02197451232409411 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.020403702655232312, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007707685830758424 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.214395939532725, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004149318989451621 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03403289853539769, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0009305982872464937 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003189729472705606, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00021737905154245153 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.037318744091622835, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00206867790744111 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005467793325653137, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00031728961152867736 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018933871870268792, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0006694389007203683 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.202411719381226, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0039026013675624565 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03171656567240596, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.0008276871670439616 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01699166069756899, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.000659008917435192 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.18391954436557392, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037126321626407112 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02830120340253178, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007928221584109298 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..02ccc6be09a6e6522a94da5a6fc867c4dc33472a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1491691772441667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.016463506105742935 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018743075278271827, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006585934745600448 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.20113968945623895, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004069048528819971 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03119020870747495, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007848115949067071 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0026023719861491207, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014730866868978165 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03304396129526227, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019097087061322854 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004589484275527073, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00024840528981741393 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.017648037515718574, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.000612945231693746 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.191148004137635, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003801093134569005 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029387188096479906, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000717400002936173 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015525163139534474, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0006012960856164704 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17186503534549524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003612669608221627 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025633495048222255, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006527133219539642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b6af8488a30b072c6d6040a9e1958993e6639f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.12911173855917576, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.011212622020653384 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018353557289208323, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006164592823400039 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19679724157173895, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004065759031603333 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.030624519846734758, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008233916974397996 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0025089231703953195, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014314849944617542 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03367952377754561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020381019021143807 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004446131485933507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002459591462609964 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + 
"rougeL_precision": 0.016850481271058308, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005360133862246574 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.1837392053172025, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037458785538945077 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.02816065241160017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007063876575393347 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015093144272162332, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005171501945296606 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16891544570433364, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003653462651005743 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025134463700752695, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006674889527922072 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9293c4573773a0e4ab5c6191a3be78147e896a6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.11653255801457632, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.00937865353959567 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01764227588810449, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006403823122550869 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.18185072302335697, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038480944045729292 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.029037401002642696, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007664957090889966 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0023525543276751216, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014914943547875338 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.028175098812063096, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016880276568031164 + }, + 
{ + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004018163648220329, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00022597062969223787 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01642546114280103, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005899392695445675 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.17098066504375273, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003570562801659878 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.027024334508787648, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006867914982061877 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.014394564094771958, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005138074352751769 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.15527067254188423, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0034206077082441716 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.023723025806798186, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000614747502166158 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d181ded2d4e291ba3b891d83b41e7043a54644b7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.10185172940494028, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007051902591792091 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018013100442129417, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0009190145899227636 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.17846455901442868, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003868171470663707 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.028066473296435548, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.000754026689838101 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002344978071093655, + "dataset_path": "piqa", + "dataset_name": null, + 
"subset": null, + "rouge2_precision_stderr": 0.00021411077145171528 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.02798475436220232, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018363804191775573 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.0037514567534632703, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002162496835966985 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.016632311383909484, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0008812891847948634 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.16638495652422183, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0035613541742668953 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.02582308765153566, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006671470754149905 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01508965511135499, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0008651573938015946 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.15460309329405014, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035069550811540557 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.023227601838740366, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006168362129579054 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aecce581c1568af082c6d0d131698f578fa7eea3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..af69bc51ea75f48ab4dd7888194d86583825925a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4967355821545158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166557553076037 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4967355821545158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166557553076037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c1fc4cc83f811a292d34a80186b933a310f85664 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4836779107725789, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011659606710151779 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4836779107725789, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011659606710151779 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67c564769a448c861a37b681cae07e9891b2bec5 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + 
"prompt_name": "pick_correct_choice_index", + "acc": 0.4766050054406964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011653047155927788 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4766050054406964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011653047155927788 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e75a6547dd5c33e4b657208a06ced025007f5d39 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4896626768226333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663330673075898 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4896626768226333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663330673075898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5cdd990c0a71ea9464cc33b5cfd1344b170ec985 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4885745375408052, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662778026451675 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4885745375408052, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662778026451675 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 
1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6ff85e37ccf06906c86f7ce36d96e7861845f468 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5609357997823722, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011578865649321299 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5625680087051143, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011574126069682387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bc9ac717dea3e2262640a041428769eb3cb24803 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5680087051142546, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011557407210100255 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5674646354733406, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011559142916063143 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e6241b046473478672b4c069cc286757723535ba --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5516866158868335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011603326108334514 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5505984766050055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.011605936624156083 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8d1112a33a82faa33c2b01160e7fdb5f853b72bd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5625680087051143, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011574126069682387 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5625680087051143, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011574126069682387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..87bb3c5718c1ab762bbf0bbc1fde2e8f8206125e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5489662676822633, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01160974720073308 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5527747551686616, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011600659443292926 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e46935881a6f102d71d565b7c0556d01f7461b3 --- /dev/null +++ 
b/4b284b21bc4/eval/slim.4b284b21bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5576713819368879, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01158796354550718 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5609357997823722, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011578865649321297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..764c0d0756934f4a69c50ff840e21c05a3996500 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.498, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015819173374302706 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.465, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015780495050030156 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..92fba63b26df139c1895b78daa7accb3782977f0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.65, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015090650341444233 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.621, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01534909100222535 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..19c63600d878b69cb9b572ed247544ae8bbf1785 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.664, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014944140233795023 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.649, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015100563798316405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c4481b9678036e6cf0faa335ead521d5cc03b704 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.681, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01474640486547348 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.663, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0149550879186536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bb0861074428f5a172eaac41579897402487ca9f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.686, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01468399195108797 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.69, + 
"dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014632638658632905 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..97ae846690736f9d3b85cf1bded6e8d9c1cd9e67 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.708, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014385511563477345 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.707, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014399942998441271 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..42e67dd07bc4e5451b340ac15939af0d18db2623 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.862, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.010912152632504411 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.808, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012461592646659966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..99aa3b5d2a86fad9f65b0a24f6f4a86616c7e62e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_1.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.896, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009658016218524277 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.895, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009698921026024947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..18fbf6d711bb4d5ffa18dba3dd93aa80d1ba58eb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.917, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008728527206074787 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.905, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009276910103103317 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..284280143527c4a47c7b2701ebff2cb1200a1a20 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.921, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008534156773333443 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.91, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.00905439020486644 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_4.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e5a1444b13a6f635757484722914f7d4ed0474a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.918, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008680515615523715 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008916866630745892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfe973c536eaa24350c69d01d0dcc29045511a06 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.923, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008434580140240669 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.915, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008823426366942293 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b8ca7706e97bc57ef9665bcb146a62a3a38f4cd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.422, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015625625112620667 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.395, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015466551464829344 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cd2e01cbd9be4c730e56eb5d9185e728e11faae1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.43, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015663503610155283 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.425, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01564032031704011 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6dcab3b8ca5309699cd2127238bcbdbc8aa6ca69 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.441, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015708779894242676 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.436, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015689173023144064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9706a089fd203dc447c2fe682fb9694fe36c3374 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.481, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01580787426850585 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.452, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015746235865880677 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7fcd475909203a6c852cc3a8c1de0b3118f1d2be --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.501, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015819268290576817 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.484, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01581119837311488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce2db299983e9f23605be3ccff55127358a38ec8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.524, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015801065586651755 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.5, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015819299929208316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of 
file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6833ce0a82175325842b781696ac62fad2c01ea9 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.571, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015658997547870243 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.506, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015818160898606715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bea9839146e677b434754aeac3d8abbc49134a66 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.427, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015649789644462217 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.412, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015572363292015093 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5c921077c0e1b2ad862dbfb999cb44c81e298e4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.431, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015667944488173498 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.414, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015583544104177522 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd3594f098877f7238f75f5d139420ac4185a8d8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.441, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015708779894242676 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.426, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015645087688113814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0e9ea7f05a81e8f18f0c8e2d7e7dee70e3df76f2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.448, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015733516566347836 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.435, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0156850572527172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7fb96c8c9d5fa2e0cf059b8717897b7238ccf34f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { 
+ "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.451, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01574315237958554 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.438, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01569721001969469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..140020ee86e31053278b4a2ecc157cc107ff7c56 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.569, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0156679444881735 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.498, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015819173374302702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..87e5557888ed2055db9be4efb840037c66bdc536 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.55, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01574000469338385 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.511, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01581547119529269 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_2.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..89d68ec2dd1847534d5775076719cdcb54f523b1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.565, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0156850572527172 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.543, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015760691590136384 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bfa323868f09c793640f0215475636ff85f9d90b --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.571, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01565899754787025 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.553, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015730176046009063 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..44b796ec2f574696db08968047d96c122ccfe0bd --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.588, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015572363292015098 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.563, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015693223928730377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..496d58f516c032992548a9d43b04f3304b87645c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.599, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015506109745498325 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.57, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01566350361015528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8387bda0b956db0c48891079f97566058b20eec --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4719401389631213, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011544210396951669 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.49545697487974344, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561954965856516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..520850d810b9e84622e16407584e3b328079997f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697235 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 
0.4927846071619455, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156122826464673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45f03dbc8831a29fcc9ab89d0924bcbe0819e0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46018172100481025, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011525709570367521 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.48690539818278994, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558466383367182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0179306122c356e3e64510909966df7510123d47 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011528611805439893 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551049647290312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8268fc32a86aedff8df5f512aab54d837062e85d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4607161945483699, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01152669031601459 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011541325320336616 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8c708862ededf09baef0f9fcb3ca1a137d01ff46 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.467129877071085, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011537420054210303 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4756814537680385, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548748301487317 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..432141e048d845f90c0a7d9d38a73a28d6d0b707 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.484233030464992, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01155668204219638 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.518439337252806, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 
0.011554566910658105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c7a5f8fdad12f24cdd88f518b1deb9f004985db7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4794227685729556, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011552636515221862 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5104222340994121, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559920087347776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..02e2c4ca1b2cd1895a7876d5d6cbb76993433367 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01152471548624065 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.481560662747194, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011554566910658103 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..0c7a379c1b815750525d27b83801bd89e38adbaa --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46873329770176375, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011539803085637733 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4751469802244789, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548139823074772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c4bc29e640831313f27bf5a79ec91f5c18a8827c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46018172100481025, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011525709570367509 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.0115462348137774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b9d7aa39eb747e0d582b67a2e6a841175e82febc --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4580438268305719, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011521653168224729 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011527657726586461 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d3904ae37fbb71bab4806dff8b383b966b8c9fe1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd6e010b392f9921b73c1354d2b3bdbaefb9d75c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6f483ea1d20471b431c6792913aacd03044e852 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39c8e6db7ab87bf405c1913fb908e582be258550 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 
+1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d57fc2e326930c9d3c0acfcb531b8ad204715e9a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd6f6aa1b5e7f09525d83612e231d8d8a332777 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8419952e470d45c97e6c329e9e72915fae92263a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.48583645109567075, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01155779233130167 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4991982896846606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156241738830021 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..91e264a33d9881725c238e855cca787145595dfa --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011551591851683338 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.504008551576697, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562060664045727 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd92632ca4b939eebf223d35e3279869c1e0147d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585206 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4820951362907536, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011555016408505476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..82e2eaab813d4713c02ed705ae1830145e2d2ec1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4607161945483699, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011526690316014589 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539022035111226 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..36c5d000926dd587ca0602a79922f7916fa88b03 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4537680384820951, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011512899199863032 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011530479981182624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2dc52d7098ff28a9bd95d6b4911183ea25425b80 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4548369855692143, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011515167912227987 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4548369855692143, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011515167912227987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f947840c5bc9e2a57f6bb05e91795542c286f09d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4804917156600748, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011553628196999314 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5114911811865313, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559378273599126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..06cfb1109ece86ba1c33be7ffe01f98988a1dfe0 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011539022035111226 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.49812934259754144, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562351329083266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd9a6683b486801e4cf01415fd3695510e160545 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4494922501336184, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011503288699799179 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546234813777399 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9c4f812f27ee511e5b32906486e8bd60edc2448d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4580438268305719, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011521653168224729 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4623196151790486, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011529552555884575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fa54db482f72b87619acb642f1f1fea187564c82 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4569748797434527, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011519544865928062 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011536599118298168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..57c5f51b55cd011ec4b2b9bc160d9bb676cdda7a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.45056119722073756, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01150577173876986 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.45537145911277394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011516282203726655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..95200b6dbd6fa39242cf074eb32afa76e6c36d8e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2dc456ca76909c2fbe768dffc17863e6804b0876 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", 
+ "prompt_name": "GPT-3 style", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd4b17d9df9180679349be6bbde968a071083871 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..577632ab6fbab566c32f33884b1dcaf44bceca86 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..ae41f37d8c8630ea9a48f567492f8ffcc62937b4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..268efe691a08f228ee39f0faf1920e78d03bb977 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2d8bf08bdcdc04167908110a8a0c0b15a2ebb418 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf725fd5d72052922fbeafdbc32e56206c5273d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5072b25db2dd3415fdd3a9a7f73c8628c98d56f7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..444f8154e77fa9d35e6ccac636f23c8b3ea864a6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03009469812323996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2f04f4adb3487c6822e101d3fb0f1d8a50145cd1 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4584837545126354, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..44105bd79d7d0da869d94818d083dfcccc94a4c6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.44765342960288806, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dd3903e4648dfd2b9b506ff285be5db597e599e8 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..85477b636f9d696fe23f7abeccd5d63e01749062 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa3a008096753ec431551e23365af10c5a11c995 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + 
"acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7bac7c579b894ab1fc8f229bd0b2b4463768613f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02993107036293953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e8e62e734da739ce870fddef771f3e1c337e33de --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_5.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..71bce27d855d9f32871af7bd322e9153d33c9059 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c24050a59dcb43d609fcf512fa9766fcbe0adc7 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b31656edec5466172cdd545aa87090105cdbc864 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c332b25448ea7fa28ba47a7a1e7dd0eac3b14b79 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12583cbdfd4e81d4b81caaf4401affb58ee3f663 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..20ff0996fbe8098b59f10d260e05c0f28d58fb48 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03009469812323996 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029953149241808946 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e7478d445b923de3e3fae307a10a3577410ebca2 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5595667870036101, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029882123363118726 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c07b45aacdd0d1df629069d81286b0fd0fe69deb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8b7030f7c1dd84e1f6d111cb4b46b59a2ce4155a --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51897297bb2efc055282920c6f4a7dd2d7767cb6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029992535385373314 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f0051ca27a167159b470b1537109315efe3b53fa --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300523034631437 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + 
"acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c44956f8356b6562e091afd7c78bb3f81e14ad --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..991aab34bc297a37fc1eee66cf4c3252e3256bc3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029953149241808943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4d30aa58dd0fa9e5ec685feb7031c60afbab8bb6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_0.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497697 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ba34a54da91089571f60de19b8fd21926594813 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915845 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405150083848581 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dab69a0d74e5c6ec6c4f913b1bf97c8964754955 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4846093133385951, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045826789783668 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a98caba8071c799c5d5430989cd96274cd86dcee --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225632 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4846093133385951, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783668 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0c02f243045770de41e0f5d0ae76d94bdde446c --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049516 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.48224151539068666, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014043619596174964 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cccdc23bc1d0624307804e966a33bfe83572c49f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915869 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0957638f9022e70cfd5680cecf53b5c650768ab3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41b21c1172f902b1904a5dfb25f7e9bde2b196c4 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824192 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405090552122858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..be4d3fe81672f67eb94ec08654e39bd7f62d2a03 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"winogrande", + "prompt_name": "True or False", + "acc": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140492945362904 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e7659e87d22e884884a297d6c1f3f12ce66529af --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367589 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405248130604952 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..df78fdbb95fce4f9f99eff662790b75ed743aa92 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915853 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cedb801475459ef579627eca03b5ab317dc52fda --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405174596179051 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..88929ea3fc1e9149893f4510bac229b935653cc3 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824192 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b115b933266c84a9f9f0258efbd67f350d1fc1d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228573 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 
0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405213114691586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9555aa62b26ab27694b972c921bcb08df20e11e6 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405621 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330352 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..48d1d66f0ad611173f8acf909c086a1773b9e654 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5240726124704025, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014036189665395136 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_4.json 
b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..259777514ae4b6005dce13bad03bcf73e3a45a09 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5169692186266772, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014044390401612976 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb932b804f162c08632797690ae5c103e7f538 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5169692186266772, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014044390401612976 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140501700944977 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..05ecab1be6ed95d5ddaab1a3adc75deeba0d6c84 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824189 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915857 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..682a3eb20715355110cfeea52d761579ea21ee7f --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049512 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367582 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b53fec41a5b5e2b2b62a81123aeb1e6602c64470 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405195606407689 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a7efd7148d3ac05c91f0b80f38a6601033c3bc91 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", 
+ "acc": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7c102f3c3f814b30be552031c55849ca0db9ee65 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790513 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7d287b13b4a26b26ebb88d8818affee33f1409cf --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5327545382794001, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014022300570434137 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_0.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3552174fdc0e636ac8798d8899feec4716146c3e --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140519560640769 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.48539857932123126, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014046492383275835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_1.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b72692a20b3da0903e1b79a1111ef4ec03ba496 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225636 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_2.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..375b6b499e2de6a0685a955626951d45412a089d --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529024 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + 
"acc_norm_stderr": 0.014048278820405621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_3.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..834af2b824d3c41860329d13335c47da727360ae --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045826789783668 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_4.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b6e638208d375434a39298eb09f27811c013ccb --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5256511444356748, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014033980956108553 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5303867403314917, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014026510839428743 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_5.json b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..1eed2c4e775e9a268cb9b5ebd407fb3da0be8d12 --- /dev/null +++ b/4b284b21bc4/eval/slim.4b284b21bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5256511444356748, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01403398095610855 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5272296764009471, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014031631629827696 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12772de150b9fd17c173935daff19e7d9931ede8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da547892fa63ed589c413d41c7dee7807ddb09efecbdb4caca310b97de57abd +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b98a62c404f87b7374982b5674fcb82127514c0c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:803042e55e25b834f7a782714c0791e420434c8b2da5d6b40008d7b53d0027a6 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35aa7bf15ddd3d1e17723183f84d21c80b3eea14 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:579878217441f57322f881cd22e4c2be8b5d08b05a1fe48e795039873967fce3 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dedc6ae2d3b8a39807e60faf01fcb5a56391038 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01980ad4cc58b4719fffe205771b075d48ec0f2e623fda8ed1b9ef594ef9afa4 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9ed19257a58332cc2e195649ddef64eaa701d587 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcf6b4edeadf16a08893ce0bbab5651b96801692ff8d3af267f4d3d0f2303e07 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f9e83bd21bf000d55a562c8bf87d729736e8ee3 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f0f525588cff28a694383fc9ea2cdf76a03ac853f6929de418c92eadea06a6 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b8f4d72bcad03a502f59e349a758aa1d3f89c82 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0067df49686b9c57024b6908127f4c91fac03757e5f93ab4bbc4fce7ea3a54 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24821f93ee5cdebc337034accb6082f457273c92 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a57fc5acb7e6b1e76a1966bc6f0862f306131d1a49a4b05b1941cb19038245e +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2aed07b5120aa7b5f4b84af9def089fdf6068881 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c1ed75c4b1a784efbe8e8c99f9b3c2b734fc75f7d4291f4ef8173a1844cec19 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d4e878c9259a6512b378c17da1de1f8fb41b8c5 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1844a7aad1118ad57f649a26d3fff45e9215b0cbccd1a3cd7f9c7c55f93f27b4 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f83efc56fdfc142f7d776b8915c00032ac9e581c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c9cec1bdc94a75baa2ce61c34a5bf3d18079598d6640d9bff23263547209c8 +size 199058797 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe3cb05e0010128109416e96092312c227afe2a5 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b78823cc06a4c1171500860fd4f8f520ed399844c591c84846b9fa2bf631ebe +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e463e41da27219a9140d5bbc8e5fc3df113d734 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:869dbffe00f14d694adbd00ce8b30721609c58569007f9ae01533569581d6325 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..473e15720cf758bbf400c0037d9ad48d12da9383 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0144fa039a0f620064dc0b2826b4a2c495b5c74cb3ae8f275db9032631cac1 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..014f1319b686c14dd6c5ec024c7b228400f60903 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:407c4497c2ec95ff5e14d150589f3be422882b264b777eb7cf82bfacc56ef56e +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d31a056875cf4a429213440738193d787e155392 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef68b6ebc93a84cc2264b3cbb5f90d8c239e0455792fb942308a7598e579d58 +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..134b26207de7e6ba4b4cf34e62525ac859299a61 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c04c7b0be6b6f7c2725118fd9891774a38e1ab4d51e0cf16363d9ef4ae49798 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15712143cc6bdf4c4e6409b1abc06b986c71db76 --- /dev/null +++ 
b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405992f1cdc0c8f42154b7a7bbad82115b1edfbc6bfe142102c9c3c8905b1af5 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..374163741670bcb77a3b48bb998e9c0dba99d26e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6cfb13060287193057f926228ac600b14f05381d0594795b3161c599deddbc +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..378ccb591aae6b69b9a533189783e6e3ce8f8dfc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d3ff59fa830c06160c11561932cd15b10cec9182038a95307dac98fc113ff61 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae3d947a3a83469870be4e13a233cf1e01e494fb --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e89de517d706f597f6c3e0c80bffb13531ed1bb462154c81f840350a9f074b8 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a4ada8885360e889b32190df85dcaf2dc7de61d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ebaddb32d3e346e6169144aff1270c05312ec3d848a409bf3964d0dd5831b7 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b93ab5666c9dadd0de303f1bd42e3daea6373893 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6309402c148ffa3cf3f177bfc88de885feb0825e39803d7c0034215ec6a0f341 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f6a237b1168c749e4695840638665d1ba74b429 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3209fab13d7a10a3e35c88e9ce3faddd76f1565c69b2a0ff656749ca21805987 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 
b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1942c56f573b6597ec9c0028b4d5a4fd0eec98d6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70f461666447dfeac53543b17ad8ffa78be94592b138018c8a551617e35ae391 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02c0bba72009a977d4c2ae9af45bcb9e03ce6857 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16235f36149043d93813c89f4c837fb6b48d13bce93f52e3ab2e397d3c673d0a +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e57cd6b6130bb3d3dfcefd778128b5c8696da53 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b84e704973b574465ead4591c573a1469899d9e311affd9ed59914e71003018 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fa58f18f6e4e30d39092e2e1199e2b4de551be0 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e049c6d90885f0bbb9f4b3f8636d674e3de11edcebd78940547e8f0d4af2f8c4 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a0b32166934a49b89413694f69802732684adaf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51bbecbdf6054f5ae6205ef30bbaaf7eaa488cd97dae9acf2f8c6695cd90466b +size 199058605 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..580b8f160ca1d0fbd05031f876ab23c7e37566dc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b2bd9e617d930bfc5e4ee6348dce584e09f11c4312c244d756cc49423d9a3c +size 199058605 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..246d1a5ec38a234c670bf1ee2bf5f6384fdc5d22 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:88e963ff79db308ef2202e54585c221b11436e50b200f952d8cdf0094e1046ed +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d6a4f34385cc4f52db5b2ff5b1ab2aea901e142 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f560b6c634e972ad6d80cb2d910fdacc6ec6f918b4b3e266b6c7790ae96f5c0 +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..072a917ccf219f34e054b51a30cd45916fab2af7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad744b82a2e33f7bb8214fc105ee35859116c65e630e4af8d09d0ed1d616ed77 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7ceb3bb778077bd4000dceb6a81099481be396b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4cef404847386080b187b491309112d0eeb6e51472400f32f0740d4a39d64c +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca6149439fb2faa6672170b88a979bcdfd0b69ef --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8039a337e27f36a5e4abb7268a10cdcc77b5980675a4ba44ee9d35163f24db0 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0982b2df19996de5e3516c3bd80f0259bf2bfa10 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b27d0d3a1c5c2e984003b5046b9d41e0d2f8fed7f949bc3311a9a1c1a80327a +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57b48167c8259be18adc615db0e482dbebe0e6e2 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f7d7f15dc4f5bb6f97f24459b829109244ee531db36f1eeb4e0a7f4e8614c1 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..aab9da0d82d57cbe9b57a05d472e3a37042e871c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3179dc4354fc69c65e149a3adf43d461581d7a1c2321b97d1b3a8b766345dcaf +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51abe08881a69e01dcad84bd65bdb3d4f6b6fecf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc9eb417347d4f6b9cf8c873f937fc20621fdb5e1d5f1901cc3b4de513b37aef +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4ba1a5b0ca4281505186111421d178ac7450ef6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f74be8942e673b78def86335634b8af978d1300d1d3dcce529b030714d409bb +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..795f712053848f225178a502d0a6515e82b1004c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:214dccdfc920ea1648b1ccb8a2f8da85d6870bea33a178066c1268d79e2865d0 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..902439699d062415e979605805c5c7bd7d8a3915 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ecfadddf09e0cd3b88ae7f7eae2b3df2b2fa3667d97be789f3a74722f75deb1 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e93f384d617ef90ff143074f34163d82e2731f90 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af7852ad4a9f7e54aa75331a41b51eae0c1a0140af8620ec24ca07fa646aef4 +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c768da9ba8ae2646ec6d6b6a595b40866e07e70a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b94f0c0dfabcfb0f8d92c182b6af8614e7af0b2ae65426ee9ce0a25b6bad5516 +size 199058669 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e250dfa221bbed61eec1765aec7bfe9666fa8b1 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:377d4ba3c4c1a22762a33bba13d283831250116fd37a7c555401075455c6b320 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e957241d4075256c836e71500c28ac613c975617 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56a22eb02fa8d13d295a527ed7523ccd4cb1e31a58ae327e6032b7f58f54ddde +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e489876d8ba2c55aa033a90748d6b832c2e83d7f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9ce4e33ee940e858845a67ca932c07b1fc7be3ca34dcc0f2a3eb1281fa2d57 +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2db3a2c7145d41cb0a809533de2c5f9f6a9078b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcae7f2cf651de135e5bca303d253f6642427b56bed838fe8a27b81ba41772dd +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f42d1b6e823eff9437908c2c60cb751366b199f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679eff8ff4f7cd32016c46a3a9fff14673d85df1b10c15b6812b0ef1982ad1e9 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..458acfbf9a553a6abdf0d9a3d1fb8cb569707040 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb2bc786e385b322a00a2a124fb47fc0a849d87aeb33173097ce4ba45b73b84 +size 199058797 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a436c9e08dfdfacac83ef23a8a812263b88e364 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab8899b314ed303149f706c9f656a25a58f4d5c8591788769d87e66c76a9d9f +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbdd194a68e4949c71cc9a0182d2e5ea69e2b767 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d03352a943cd4f1aa6a9725a276d013d854414e41c081fa195bc0dcb4bef52 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfdc2139ddc9f12d3141d3d244dec36e7b7422ce --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d0c0276c4426d9b940004b00165a7b1f6e0dde5f202659113b327bde386b3a9 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..140a33b8891e2adc9acff62869251b71761af5bf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90bdb0ae40f180e8827d5ba333d0de764ade6f21be7dc015f04a114d9a643074 +size 199058733 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e38198620f5c5594ae848af44566d0182a6abe83 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa88e1ff96623c418bf472bab6230e47b8b71dd85f8efbb0086ac3556943abc6 +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2480a0e8891c2d5242fd1a87c16fa28fd27ec17 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001cf9e5841b35e046cafcd5e1cec0bc9614e51755ef73f9aa35e834aea8fccf +size 199058669 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bdfd8a3cba655f54239d77db3ac829409d49408 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60b9dd97e0610c8a71e9a1657e464dc2a250dc51f41aeecb38f325166151d435 +size 199058925 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..3e365497943e59beac8a8d105749eca973dff26d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eb6ab998c81a69a64c16292cf599cddb2da6a7baa37c72a2d681a5f39cc04f9 +size 199058925 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dafd32f57ccbd2320cf099cd71bcea856f7739a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea07d7f947e04468f1a427cb36f241fdd91a19cf7154e46622f1711981f029d +size 199058605 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfeb946d5d07b9190891d1b576cac49bf08a8a0e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6643277460714fe4f81598a4c82dade3acf0c5398a29b256dd5743e21eb5f5a5 +size 199058605 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a96b3425133e47be1e273db6395892f3e1310181 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8903948fad3c8fba3cd8c6acdfff7cc0851f4f7727941859b39b7ed01d544ea +size 199058605 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a82f1b37e41b07e56843a64cd2b23e5fa1e9b6f7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f43b275f65f31525c57fdd1d6780102154596cd8d31350956ff6aa2f282b13f +size 199058605 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3230c152d720a92e7ca624c2cec387936fcbd4d2 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aede4f06e2f76e88820c222350d8efe79729f5e927c1a594c46b74c769af2be +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b19e9511143ec200242665a7a59400944a8dd5b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30ede294f207390bcb143018910c59ce5a6bfb8b531ac1f7d8489f9a204263a1 +size 199058722 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5486d18a8ad0dcf4658f6483ef0e4dc1f2561e8f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0054ca569c661af164d1cf6343b36ec212bdfabd278910ef600bfa5c78880e87 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2a42b1755b313919e6a1b11debf0be8a60df13f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5033160b10d256f24b7d63b6d30d067593d0f298354a39a75f3f03476ff4f3 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..452706b14cceea351663dc4c042e432edde9a61e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede2f89216a951bf50f3106f9a206e4bec569ee6d871af480e643ec64fb10c1e +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3cd6eacd73bbb3b2633053a79194d0a0515d404 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b4ce9a3040ca9429aa3dd6c57f6d6071cd3e51a3bf9031751ff0d5bcc46e9e3 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8af06444ac6a2d873c3a76ba09e8dc2be02fc8c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed2cf239962ecd7bac88794f14951272227db63104f220e8c9d304d3d7dfd2e +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7020b97a73bbc1193d4ac34e00a7ac37be5f519 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1271ef43794936bc4bfbb6fa814a63872968d576179230ab7377fe4957b648e +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a577688a796d4dc9cbde91e17c1706ee01d84c7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:f0bc5055a10cfb896ec3455f618c814c01a81df48c5102e29735065bf17e3234 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2ecee8c9f48747a676dc4f9fe4713b4a71652a8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc0be4b68cf6218e58d69714ff224a87008f81e1fea166c548f3445ecd25398b +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e7e77f0bd8ed19407891d32a2d66807075a1b85 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c41082024a2f9f728d01088dee4dde225df675fdc44b37a89e427393d26bb31e +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2387ad2ec89e63632625c1c0bda57f99caab724 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82537a324945a1b9fcaa7ac13964aadabf74ebb218b377d3534530c4a81e89bb +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be59822fd5bca8d665e63ef58cd323117164c061 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7bd0aee956353c8279548a6c2a4add822fef8b040f0ed51a4db999a05200b1 +size 199058978 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a46fd2699902f2d3832e50429329a433d2c97d42 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20a07562bda64c6fed651707d4f9a8ff1a4d4feceafb345f73c4505c03b093b +size 199058978 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5622f8ef490d5ff720013e3d5cf1b1fa055bb68 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ffcd75d4b79de3b13504dcae97be94c08e1066bec5a98a999d74773c64c017f +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..51fa76e6290ab82835c994146fa467b97bf34315 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c31ee15c278b5141227b340034f238de2d4aa3c858447f3eac7c44b842cc81 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..688dba0586e494d688b6d7245916be644f234f76 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e9b873126ce7e5921d26bf067e801d33121ec1b76902508ebd5e1332d946ce +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e27a2b07848cb7f41cefe801f8a43b085d0cdb59 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:361cd1e20a25696d3fa37c859af71297e355d8e9ff8faa43d5edce489f8a0ec7 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6d136ded1bf585a44e2f3aed176d99e2915a2e7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238603c58890745d0778896cdc96e03044902f5434cefe0e5b3fe44ff3587fbd +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf2b11e2e99a327dc2536e70c44118cecd7687c9 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb31cea871a465dc987098d845b2c4e1629f7f4abe2c7eda2cacea9f41ff176 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be028051b70de284b8a451afd1c93b5b71b31265 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87884af3e3ce1ac9caccec36b9af6d8b59c476bfa04dd559d9a7d1fd10073550 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d832b9acc97c8ac3c3a0da064ce7e0361edb5016 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f7e0fcdc20cf0919b7fc17b4e30a38d41f479bd5639c27d0724b31ba7e3057 +size 199058722 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dc33f445db58f0fd6b442827c438319f5a36250 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:183216ec4bb8765d068f633eac9eb93a404099de1e7e0050649001bc6b0b54eb +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6325d4058229fe850cef37b341670c15af8722ed --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:777b0f285a4ca1e87749ff64fa03b347cf9696fb5d214e96476a3439b78ef64f +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d71c5d553f6da6874e27ab118adb407d746d9907 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1377df9fab3e004f8e19e02d814628ece3660d58051f488e203e318b27ee48 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af8774c85a05c661b7a7a9e8eafeeca8cf3538c7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fa9a9b9567cd5da567011bafa7797ac898e99aebd4757a55a9f7fde629ba012 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f82398e00dc4b199c19839ed876b758abf78d51b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18927d148ed2d16575fc00eef29752c2ded673d17fb062561d7535ab94db7529 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63d0a153686ecd38350c486ee4bb885df72ee6b0 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3fae5a08c08b4cf4e2132d714dc97f484ce99546ecc26a974a65c87dcedd84 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69e71f6d2ce8ccf5ba8984e7cf18a0d601ef2ee1 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:9748cc2a572c8da69cc76d84a2f48a6ee87289cd78cbaf7484bcb7bb0afbec26 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6952b1ec4c54c4020e9312c051bf0e34627c852e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119a82f07442cb472e79e8ad39c4bc562ba715e250927b5f576d04d01ca96141 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab8282f2a24a7c4b7f62200de8ae3d2ac47004c8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1c5b6a7a9f7a0763db5ed45b444e1098801dd6fbdbf19f1972940ad92e7dff8 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7db3a95b80185f9903a65ea7e15450b39737fdcc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7712382b5d5a0416fa2eb493e2e5890e414fbe4e7e7d1e9f56708b99904cf008 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8513afbbf1a6a4ca44a26ef8a5e43909ca1aa57 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30780f41906ca7e112adcabcbcf230739641b3168772dbcff2ceebb4850dd1a +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76e15a5f04f2767e6b164993128ee60f916d4e84 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05717bd14802919a36db3dc2fba48a034e77b9907cd30a560e79198b823bbfa4 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7cae23135bea81f2226d1be9e90a2da74c31f23 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5284e2cd311ebf5b23390e625441354a6a89f4b0ce2f37295c1a7c24cae6b242 +size 199058594 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2c88baff5493e0989588c65b6dd99c0cf92df13e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73b42dee153222b1ceb8e72c7e482719a3d8dc9648e0ceaaa629ceb7a617a2b +size 199058594 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a01112a1b1f9c47816158896461d2a80dc3a5fb --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4301db5b3ae4ce1f1fe96738d3de587e52b226c8ab01656ffffad9a92d988f5 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bee8772f426b60a8426439ca2c82e5f4d24e8c93 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc8f97bd43ec4a79d70cdd268cd3d1c45c8843bfaaa960d7a30f8bfe3de8d80 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cb84667feafbc0b1954566968051cf6facd05b4 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:516349866f7cde1a6c4a9a9139789dda3ca3d5342b7125197e67f09365adc8a1 +size 199058711 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b608d4a5c17610e0b7355c9deffbe75a1da8ecd --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28a4cbb87ed28aa7d98af446b502ef10ed25ded32a5a350a2d754ee75e7f7c26 +size 199058711 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ed2c2a30b90f6d1146477f3af383eaf2aa09016 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30a0845ed823070bc19d383f2c060b735dbae4782c577ccd5880b93212ec1c5 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bfade57427be96e285d9b12a3b202465ba0589d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ded6da5d3ebbf9f8256940b45af6473ebc6867ab452223ce4f51d55b9a6479 +size 199058722 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2459e0a43802bc2245e9c2c0139d89792d9a3d54 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d419c7dd3d2cd86083a6af9ff47b8d961079b09daa0d4aa840c0c56ae5149336 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23ee76380f8068b78238edb53fafce720dc8d821 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a5a438ccfed5992165a19d5d540b865dfc770fff13c792c1750fbdce417ecf9 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1f3e1bb90d2e9e3cea7cf9c98f7fcc15281a46a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbef13f0b84cd4878b4d5bf217fdad78c0f44dc54993c6c265ce11e6bbf8a491 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79bf67315c4aea2c6b693c0b730882f2ea5c37f6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118f5bf3da45a964d570f198f23e6f61e66c7907e384942f0a371a9fd1e8c3b7 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..698514c080829d877cb03218820781ae0475ff41 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37ba446eca3a7d2194c0b84dad0fb3aab3de4f8946170d09f932af133b6325c7 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15712e81e03561ccc8bd6307d1fb36180e242d7b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0008593c4d2c6b94493bf63849544c5d17ef00b0c0e3d1b5cbe54d256b4a68cb +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cb9b38b1a68b074af65a37deef1b85f639db320 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:4d7074ab68330ca0b5d836152626146bb20dc923917ea810d2261246147cc0ad +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ef0a204344cf04a87f1f6ebb26f544deba6959c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0cf363ea8cbfbf0c77f2420d4a35882c807fd348fbe77ab28fb70f9f645e3d6 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edb896f315b779919ebf88e3d567079709ecb0c4 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd580435a72d351f9243ccbb7768b1f0da3f35dd1724c8a9cc188de06d0f2ee +size 199058594 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40d7b76a0ae9c455e26a0392aa7610bd323de79c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cde3bb36026a4e62c3cd515a535c330e2117cd450014d343b38badf06e21430f +size 199058594 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3510f6c33f54adff51b252383a72b21d06f906af --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd76ae38d8594333e0da16a6afd99cf170a5a3b95c861ad472d9cb30bf6c0f6 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63c1ea140d3e92c3384f4e58bf5af17716269d27 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3675edb27dd58ad5e3541520a2cf5b3492cfcf2ddca7dd3ebdd2e5376f6c652f +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..abc795a4a4eb4de5959f5b750e67e8de96d24223 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef43f9bbde01c7cf1204571dc4dd609f1d76578e9ed950ae31a367e249fcde6 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..194f115f174577b43fa51e3ac9776e97bd570ab4 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bdf77cea4598096ab2ca3c4d2e4cc7696d9189e9457bef61a05b54b7d853289 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3828174be20f90019b033fa4b400aa70a3a74947 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e44b90b89065a9976f4eccdb231f0629332f307e2c32eb566ada8e00516bee7 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45a852a83a0fb5364644c377ae9d5b39741fb031 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9d6076af4f454e840acfc603f84521a172dd9137dcbb93a1b0bd906020e387 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9796c019f4fbbad7cf86bd2691ba69e44b3aa77d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c26380eff5fd52a688a737fa0a8d1f46d844d6f220bf3ac833526b2eefb8d9c +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..804d67a126f8c3c9641b04eea95790d4b46d158a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c33cb0db97c089f2052ea296a3fd465adda1001338caa8188f53db89cac558f +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81174eacde8c61c5e5fe31938075b91c7865d428 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fb25e294886d1de3844f8ba4f3054548629342810d8fde4c2c94241ee9bf204 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0a7b32cd654ffba75f2b053fa691b6d2c6652fa --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fcdc7fb599ea048a36f230583025db54c104bc15b377ba87acd293c874687c +size 199058647 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef9bf346a1e0c8302246df88a40f89a753dd0d79 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6c9ba054ce8d955454dd8dd9280326d37db48f04c56b835b20c83012235450 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecab576a812cd4a7a1a4b5553d35d0f776ad6a82 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7336b4cb123e75a52423e6ed29bb8fa73d0a8cff260e373301ea6a49960dfb93 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09ae826396607b027aa1de4b8b0d06607d2470bf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4632095fa06c92e6300d963df95470a79faa664cd9eb7df6e543ab40f6b705b +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86f09851117ce00605e9f6f9a463cf3ded94a017 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98bb614758392afb184fc07271a697eaf847b525d7e776254e1187c7f3dbfddb +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e5d855b90d3bf9a505e18d62d5715bcf5c18fac --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa3fdacf5cdb7c497eef8349aa3d0c97b1302095e6d3b0ef89c3b86373efc8f +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3938b42e80aea495681c7bdefc16e0f98b090d5 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03c62cd5d55fe6b2935673cb6a9ba010fb19a6081ce7655d18c5781755987043 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42ece65ec612836a5507da2dd4c670c0e063f354 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:3090a5dc492a57773a77c7db6a60fd2f0b661da5984418e13c4f13d0ce9a914f +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cf1e0b5e6d5af91f0da4cf7efc8ea16ba59d7d8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:038e29bcd77a731fc7bcf0564f279907c89e417cb4186569ce95bca856ec2217 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e217a00762e93a55c7ed13e5756a55a0b799fd0 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a5557b409db395d6fca8400643900b9e213db61606ccd1c0241eacddc316ce +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..feab8b3aaac165f99eeca4f7f6bb72c2e199f478 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2731df1bf07efa59d642dc034bf9c46a9c10a54ea667ee3ce853a482da980669 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99d0b14610d31327282aedfaa60c5f21001f3a45 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e4eb8bad7df38e3bb8977e476eae90b0c7bb78cfb245b5ce122d95b8a36425 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16bfbce330c90a888c9807ec207aa54843ae9e8a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e20f89ec752e75e8d10c89092e185270180c1bb43295b4fea67875ed08c5618 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ebdf88a848d919aab1a8248ec5f758960dd89e6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19a550864a1d6de6c30c359f332df73c753288bb4dca69787301d8f198d3c09d +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8474f406ea403354e95cc177f48d937447bc15b7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cd034352bfb882babd96e608d16edbccfef78fab13f506b48197bab26fd5028 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90477e5f0aeb4ff9cd7816d7799ef97f981176bf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bdef658bc109cc614d81d740b1499b486a0998c2967a4073561dc4ee61c85f3 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d689b8167a2844b9effb8f69d6578a6cf51fb10 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e170902f598bcbc7f2eed38b52764d5e2359b8782bd02ac1c0234e80ac8c68a +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed5bec35e91c74a92bf496b85da42fe44032e510 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:536d28116d8818cff7f0a92bcc087ed642bb9ca59d071a257e12aed1865cf094 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73adbc6284963ae4284e3e5f10e0578df5603585 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d16ea4d3198c42ef0e4c7900b7dc6430581bff96fb89575e7fe9d7cc678087c6 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..760db6bcd2b6d95a9ab8957640a266b39144b101 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e944c8579aca241e24fabda2c554f767e95277f7cea649c57ce6c2f66f650a +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b72274d3827f52501f7950034d7a90a817cea9fc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12587dadf910e8d22f4a68fce80c63d15659470c1468e98ed49ddfb6829af124 +size 199058658 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f73685e6611f1e1b7553eda9c51d1ebfe0b16fe --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0de5f6ae0d07c5c5ce5052d50554e678eb3375a7d9dda0f2a6fc2fd8bdb04adf +size 199058775 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a0e5b07aacebdaa7b4671b0428f167bad0a302 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27f950bfd12b5bcb259df8e9dfdd8c1c7e86bb9ebd808ee26c9f8271455abc30 +size 199058775 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ca7adbeefd973dc1342b1ce3b4f3b8250c285fc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdbb0e51dd25f635ca0d3da7c59828417e62d98e2722f0d18b3377348c6046c0 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4bbcd9e98ce9352daa73c6c083ece08c9dd438b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1f8f03d717275e9b0a1ab8a9307df2a87c97b3a3fb3b806216c01632b481ad3 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f563dc301652e0ddc9746d7b9fc10eb55ec14fc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be301d712e380d4b01db7d6bbf3750056a84820b92d22481e1d1f88d3a5e2c9 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8273cb224a224d12fcd429bb215d04ce4a77be91 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e24e14e40d958f890f33267d2714dac3f18cc3b6cd76fbd2afacde5c28e956 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a11a4850806eac1e4231f60545a60394e17c5a24 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:88fd1591098056d1569f523c5e94eaa7df4f93dde6d498b9d44daa83ed12d071 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91d60e552f46d12901e4c9b3505604f99b835119 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6073ff44eb79d74321bc9c398766582175d661a43bca6ab863722eee5affb622 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5635164944528f8b0a9c207169fdb8865a330728 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c302b7af331b72b1dd0834a7165e7d9ea429c6bb1dc07f0216d055261b7f33a +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..522456070b23e2ddaee88d9aa782c08f95b3a422 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b96f860bd64cad7fc57da69329fe9682c23867381f63d88fe80b24002aad00 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e5502722f28b57d234b58d60dd9475c767db861 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d71d0ba72ffd25eeea69e5df2169b284b3f1a020fca3db4408e9025e531fc85 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e56fc4cfe99d095c3164fcd687e7e585b067cf8c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0975acd82f4f4a66990e30374272a4be0135cd54d43eeb0b2bf0f497f72e33e6 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aab2bb38779774e0e0ae49a9c13546fa66b8c464 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e96865557da709f217c347ccc2535f862850d3579a971e4a7d364be9e86e756 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..a74de43e0f14e3b2cd72544e259309151a831638 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7769a89d9dbb19479505399b838affe30de2613ec993a9244e3c6c20ab5abe +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2c617ef3626a30b816de2221e4b17b333986b18 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157ded99daf10d93f7c941bf0258689f865004046502c9fe285eec5dfc13bddd +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e643c7749099b2be358ec9975df974b0b366049b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f049934d237c1f79b8d830a2ee97339cf9c794decd8a8cba43c4628079d41b +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e2533c5c6512b6a701d71c0a4e72fd17a0a810a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e0451be00c115f3141982e71419af2117adf82b2a030950a58de2e8061553c0 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da97975badbd1689218571f66d5bbd7792f40c10 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db72c11ba64dd33073f3d10068851de74cba03b1bb6375eb80e858da1cd4119d +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9168794e6bd4e86341345e08f0f5bf82735d4da --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938eefa6822253c69d1aa70dea93ea517fb464c35b8af3ca321c72ce7124a366 +size 199058914 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d099846bc2b37f4076f22835062be790a058687 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a799ad7b8a1d866048fa670d9fcec3f4d847ba5bf82fd4552bf08d36461f60d +size 199058914 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6d091c5cc84193e26d68e5fee645e92131cd165 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e4b56d507fb27c9ea0f58536bff610920381f6a44af41751acf007cf14c847 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e27310ca9b54c360a0cba7ebad0ec0de91ef440 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cedb0fda43c9fa7a4f1faa5569ddfef6274f74f773dc9f1d9601b97188cbe00 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fce20920638ed5ce8665489e79e028e35c66f1b6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836c67fe2bad28a7d91d6a269f94a5eb7520ec85663c7c4bb9df25cc6858819a +size 199058711 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a7f5abdd27692a2b7d84cddb3ade4be436e1d16 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac67e4db937e05ba0a0def8990574e15ebff2ebeefc36462c640d5371a95627d +size 199058711 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49b9282455dd43e4500aff489d99bcc87ca2ac93 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1e28a3206dfa1e646ec00743c16feaa06a68119df150246c3af99eac44bfa2 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0edcff992d06ca06b8316011123d9c1528a091a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e1f12b91842405e840bc8bda9883cd2d4df89f7d7ee17f86cd5dd84fdbe5ec0 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7959e4e283c44736afcb8666f093d2b731fad802 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:c4894b3517ab44ac88f2184ecf0755d3621afcbb0aa8f92c237e748333f530fd +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01e981348c511b321658c9b1e1d34bd747f60af1 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa634507d3065ea98bf55d657a939292dea0870074d256816c3216a2e4182ea +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dbdd34688bb755ff1caf91690d6b9baf3588249 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0613cf4073af12ada6e56a7ca38b04c9145bdb97ab526e44cbf968b971055ea8 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95fa4ef7a2d111f0d27817badc2c1dceb1940c12 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bf450f05c3483f2e93dabe6d447a1bf4f949290dbf424e5ad0225ba41375754 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cfb407a50999cdac560a7440089316b33a6a956 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2270d7739e1e1aaf016e16d0393616abb44f97d374c047cb6e0d527ac4563fb7 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3cf8ba521189d630f4cfe1765bd442cbca8cb2c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecfa7af4201bfff349690db9bf78d34546a3f434febd8c8563dc391a73232382 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbb7d480e89b621bc3974ec0a22ca4a40acd7b7d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43ddac9675d19b01813b16605ba4b5d7090687452868a4ce9f069fa070370cc +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..1c4a2081cfc85b9188d5200b1c96e343c02e98c8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e80cee4a1bf6d0e0c2db05720021765fc6513a368e3c4bf0211e3cf347330d +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..750cb45a9961b40ce4cd2682e5789b558523a87f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace729227a4d3504e7226813e09feda312de5abfdcea03e77916258df4e2b538 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c28aa718e9471ce9175127d7eebb4e764be39b3 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b489a74e5d1144c0aa0c2e349cb3e5ad2b971049e6b4cbf7314ba38447ea3f0c +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..010477d76ea78a799a71a8d4f4dadde554012fdc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7203bcce020a9e3e4425c828cdaffd5ae56bdc00cb091ce18ef151ce2803c043 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7acc364d459365eb725ac5b7ed7798d5090cd5f3 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ffc0b9533ba19ca020576740cd0d4f8e1b3c02ef651fa4483c70e7e017a79af +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd83bd2e949793b388f9ad14968f74d29c154f03 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:699d4821cc7293359ac6cc1bfa7fe6e05b3f41c71932601725ac0609501dbf3e +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8299acd9dbd43e5394388324c2021de3406ca043 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65f9a5954abb22c885b85144ba9894ca17114f0523d5e8ba35fa955480ac11ba +size 199058786 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58d34e01a35587f9a50e6ce4ca784f8d2c2051c8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:009b172a3244cef319f6fa306d9e2b0475b32ecd5c02363aa07358fabc121d1e +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bf93ea4d903e975f758d3da6e1b72d6bc2e2fa7 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0073d332e43730aefe53d3fe8819256ac3a93aec84f5cf0ebdd348d1d813519 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0a13ea00d4f6b749af13d49e98ca91857e16561 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e574fa8d588dbe42198995ac2a3e38723e760fd2f87e08cd1baa906943c58f54 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9027fe358f900bffd6f1d1ffd4be9a0a77d31822 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5e616ba570b3dd1adead2cf1f776471525f7222ec2b4df7ccf550102af4a14b +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b117ca13e9c12727f7609ec0c00ae9fea74fcfe8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ecf68a9dd0e31db4ecd55d126906bd2d1df65d02005b1b6445313da2a9bc1a7 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6e4ab735026ec6623baf15dc7b761905e87db20 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35cadcd481cda00c2becd2327b30e91166266eb3ea438059af6639c6be72238 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..532e9e054857b8ec92004f605ae2de1eb0f2c2ea --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:9bae4523a1d2679f44549cad87f7a11f5e69fa1281db03e1613ed2043ad0de33 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6162efaebfd4fb9069f6d229cae2b3fc5cc3a513 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99aa976168e92ea28b57834332f8db807eb0e8a33add742c6c565bbac70d5499 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..995a8b61d622d1026a2a415d8dc9adc9bae2c02b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a6a6df460843f186e8d7bff1b7e1f86beaed0cf5ca5d6dff6a562d39d00620c +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3167fc8b8d08caa6717c87da998eed4c88dfa051 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa3e954ac185f44fb397078b3d1c0840fb27a6937917437c1a4ba08bf121930 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f1d3636f21528188920e7a4ac7b64df37f9ea47 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50c568929562f14bf353de83d50fa77ac39a0a8a1b65f61e350159b3f97894eb +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff34e53a036586f6831e76f65f68268998990460 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118b217f3ab33e36fef9ab0e65d8d645196babfa99c22e792cab239d1f5317b8 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a98600f135ca7b4172f58c08eee77221cef74d8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f707cbe056f7fc00bd93762ab936ace8f735cc5ac9649c94b0adf05ac886d4fc +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0e4a5135bf774f199cdfd30880fd61542e0b49f8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c34f5be9d85f3a1a7152ddb854f9222245382d69c07424745c9a750e46c506 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..440ae3d0edc0494d119e762d83cfda29388e4f32 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63ee121fdb66f945868663a1c7be63afcce3c9081e30995d0a9e9f3a49950df3 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a59e133470cfb7cd32841d688230f8be936c1ee2 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91c1f8eb17769e2a0788006d21510ade26f100e065c592860ef3c8265891d056 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18c2e4d8f4a056644e214d06aca7f6eae3886888 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b528ae579fe19318831718e4a3b873e1c20e84b09329633155a923f11a81b9 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf243c68cbc5ef84ce62ff63e81ee227cb14a13a --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc82cfd533087ed10e912735f06fb9c0539b0d4effb9ca5cc0df70bfc7b0f2e2 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..262658342d46351bccef604a86853c581e4bb810 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7cac4d7a20551f3018d889597f5283d06b4d87528e03218fbeae02badc8c62 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af11055ffbd5bdf99c0c66071c2934e45c87c611 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb324b716f80a1a497a93c06c0e07aa53250fb2d6e26f1bab0f65074ff4a67e +size 199058786 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15dd1bb6c89ae1c6f3e7029cc73da41618e0280d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a034a856b069bb78b5ca44b0b7cd4622b57e100189711babe642210633e4076a +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fbbffb8ca2c5164cd1bd29be7d4832e622a1492 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ac6e14625ae8d4d617d886559a3b4f16f843ea9d0feeabf787e8edc8f4aa987 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7c283ea474ada59b4674d05628ca0aa075875cf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b6a5692ec7de7cc858590e459806cad7e121aee3b8c7d9d3fc68a4d41561503 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4b0cb4ef10755b97c2353c9f282eefd070a8d95 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:859e886efd561cf522be40ddf4fc23dfc1d8a1364f3e58cc5c20f45056bd0876 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5772cf376b5717987cc949d9255af932f8267bf --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a09b88e2075a3aded27b47a525d72629828e10305228cf663ed7d4e8169fb4c9 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da0e242742a82ba9fffa93700ba5f751b685d4ec --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5191b540c85dd4b66fcb56d44e5eeb6733fd3936488dab2999fc717d90df4b8b +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69af3689b32e907a2aa54bd2e42959042572cd60 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d898a3a09019fac618df97a54a2052601faf8cb01ae0afcce7086b057c613738 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcd2b8bdffa28a1f8c207730f47315be6e36c7d5 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0935afd6c897496e25d78bed5be2d3d00aa8e253d61a83359982fa61158a8047 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4662dcef1744161ae48d92b3b544c3950d8e27f8 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2b1693a9f89058d855dffc474f37a178b088b305e639ee405dd6b14eaef7440 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc0572059347b8ac87b84b636af7225865acea71 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d02173705c11ce4c959feb29620ceffaa41dbb2406c27c1694239074b103a1 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d272fb46fc29dfc5956ff1606dcf6b8a6b7a4807 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57bf1e2f4dc23a5c45c0bc3e8987836cdc08de6b51119b473e7633da0c860582 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a5e9eb8bb624c98e8bb9d8fad8cf2319491d2be --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed713d8cc172c81755f7933652e8ab9749ff9b8d5409d42c4ae676e94d0ef39b +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12dd97e9bcced481d99394081318cf16f6c52328 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b868600d05baf32b38181e96dca6bb82c8b60367a1441ebbc0426cb6bb5a608 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..59a177e9d1c740abd104aadf153da6f58e3f2ee6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f50af27fae27bb63092b62c3bbf8c2c7e5d3322723d29bba0b002d5549f5bfa +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53dde41059179a1034e477e147ac424a92ce750d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f73cd912227157eb0a32feff2d6f8e523bcdbfbc2614618a64dd9ed234b09cc +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..387e82fe85b2f69577884e4b80d1a375c0d30150 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a7702849d725f6f1b866c2d4b2a293b994bcae224d9f98cef8b84383e08371 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c90acc70f564fe1431edf2f6e010d19c50acc2f6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987bbe3ecb150f05f1e526b7d0a6136bcfed6b901064db7904ad091487e0ebd3 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bf48cbd3d50ad0880ea8c6dcbbfdbe55c6a2a0e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4d5fd70bbdd6bf4519e0672c359300271b7ea53cac6811ba9a1cef6938903fe +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4d7cae7343409d93bc4af157e879cc4441c958e --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26681f031d1ca342307a734d4a3b22a1a6edebccb7d2fc6b30384c79bb85e18a +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23f28d82a3cc16ce20118a08a397a7f0f9a4bfd5 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bff00a42a341c45f3c9863c3e898e8f2373b5e9422bc35ad7ed5afbdfeb6525 +size 199058786 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00eb28b9f90801b35e29aca839d86b03bbd4bf60 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03704eeb4ee9922540dcce9c02b832c27eccb9c57724eb646f280b362403769f +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24cb3d0782ebba9e211544fe895749d7e120683f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6b4b0ed8d0f605cdbcf34e2b43f3de46aa0458439238f8c0393df7833b8bcc +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9b9dbf9931c68c615cbaf7cb16b97ad0dc1cebc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd65f7b880e7b34b73590d27698e4d4a4176be9557ad328affb3227a7dc0aa9 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75c7a5c6a8abd1b55766e2b156dd369089132a3d --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:254adb5215f0d9f302a42500504446cf757e59d8dff4d789b829a7b69d64eae9 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab11441ad20d82de812028d098d6e286092db3d4 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8c10d031b1cec07c37b9bbd18c4d8d1940b1ba437584012472bb109640beeb +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d89484be4e63f6310b91d5c42d6bc90fc5a770c --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a66bab72a6af260b66b52c50f294e87c2328d3da41213e64848acaba4d1c6b3 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed6d1fd700d011b877abfff1949a12542030c703 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d2a7893af5cedb2c81054bef279df15da7db851b3101864a14a5610ffcdf87b8 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5bf8a8656abd46474612a00895dc1b4b70fad76 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58dd2698cd2bdccd365e207acac147d30cdb727f541084c71be1e378cb874236 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a8dfe0ac985e7c916e8e371d5390f086fee3d19 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d21e9a8963f5b65ad6160d1b1f6e827df8f702fa3be64322cacbaa8dcd993b77 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea0b9bac1f77b1b24704541dc1b39cdfbfe5059b --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2739b1313534b788b5a7bb6fe798aa8edcda37a9acea880afc527cbd49f07a20 +size 199058647 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f28ac2d8b9ce38acacbb2719bb2f3d5eb3e1ebb2 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:799ec4170096506d4609507b7af251881b415a9b10b50feb578e0c0847340584 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1812e0b0e31c74a88efeec6310754f2af8221261 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2fb690c1865a4fac77aaeb17d9c2b246ef0fe5ea442006c96cf6312c7db2894 +size 199058850 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98849f81c63e7243b5e50ce880b088e671b578a3 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b22b63d457c034e1859b17744d9e5e5ec45dfd056f150ff12e7d83e9b881b2d +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e1d591a099c4d45b1559124474bca2fade861f06 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7111afbc666205f918490d2f8bb5d21ced837ce5754020198fd5e672def9bf29 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3141e07ecf044c63dc814f690dfda5f9b4c5a96 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d158ee361db163a0787e648d6cf0108a33c93734c5a104c87ed6dcb8092be2b8 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36e656ad598aa1b1b9e869e39325362d9bda58e6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a58699544939e8c221c761111f74c2a02fa5beb66d938e04d79640cf50ec44 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b01cd987b1c738daf72a635d47e7fdeee12946dc --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:693dca13738fe17ce56b235150966203b5fd523f0c7ce5106f5dee0c33b07058 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..000a321d12aaa0c701f4313400956dd1a42ab865 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b54dbc6d5c222dc98b637a9776935b95814d519439c19cbb089de6789be83e9 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14c3548ce4886d419f968d55dd243d424b8d2c85 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c772576f9e59029b6e5c6a02bc135c926700ef271d8044d638b066f7e56b0265 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48e4abc89a4c45e155deb6e1a37d282cbd28bc66 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ce7f7b69c536a39b2e42e3e186d74b01c306269a189cb9586536790eed1230 +size 199058722 diff --git 
a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdcb301199de5001fa1d02aaffd6456bc840d0fa --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f585e00c64b5e74ee2054f67058f5c2e31b70213c491d046e7edb1e5993a5b +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d3c3f490921b1496f73c7c6ec0fb6ceaa035d3f --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ee48f175b2edc2f563cb71a37e273c9093bc0f6bda902e58481da0d028a4d47 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5d048dc2f80324534cd4487972e945797a4ab51 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e3ce705683c4121c32384eb01a84c8be5ed1b65e545464e78a8b1a7ca701df4 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16f880937aca01071a748a1fade0c368d3534658 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be230c9a69df3113d5e3e19d0bc75d897c20e9943f60644fcaf9f916015fc942 +size 199058722 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8919d1843ac2ad8e6752b536234fe27af0ddb005 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc17748e760588b845d8fe0980e5ac2456c9ca8105d3b02b639271805eb5b322 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6432af5d906d30b1c4e4ea29389eb1a24336b8d2 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2488827e01a7bef357b23cb42354ebc3c6835b5c719d64f1c91b025309f17680 +size 199058786 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33ec7fe73624b40a2b8c0f67e6dda9de6830ce40 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:a2ac0a665c5068dda07366c4c1540efd5f253ea26c739e04c73fbc61fa411dc9 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a11ce688b586dc57ac6f7e1ebce7172e3da749b1 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847ce085391ceff980c6aa2b2d3952b1f406808e9813de6c38266a99ce74f3cb +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8daff02f9818f071aefb20543d19dd5caa64c399 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cabf2f8232acfc38d77bec5604b9d34c2454013a230ea9750064828240ff3baf +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2080bf14b7f43dab98573b39dbc87cc66d919842 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb69334da8361cf8736c69258ea5f9835af0df4dc6318c8c06da8f75ee88485 +size 199058658 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eebab9653918f90a9fe93fbadf1c59ca55e26d3 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e2759a5b77e1d940e51895070aae7460c3a5e70dd292957db08600984507ec2 +size 199058839 diff --git a/4b284b21bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..011a1c38e9d24b91d3ade0c04eef0c1548c99ff6 --- /dev/null +++ b/4b284b21bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87aac379dd8d1466c5e5a9357db1d7956804d68451f2a5a2dc4970b4f9bae741 +size 199058839 diff --git a/4b284b21bc4/global_step80108/layer_01-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bbf646dd138f233ccc464cefe147166e30fb7cc --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e9990fdb646a502d0f34f488abe32a70171841cfacd2e82c54ed71ba2e5201 +size 167511299 diff --git a/4b284b21bc4/global_step80108/layer_01-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0444c644fc942bce62cbba286a9ba270cd8aabf --- /dev/null +++ 
b/4b284b21bc4/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b209e1db68e23d38091459e2df4625fc05bb155bddcbcb02aa713cc208eb9cb +size 167511299 diff --git a/4b284b21bc4/global_step80108/layer_03-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e243b0afe8cd0781b0b17365f6464fe965a701f --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a721b3d31fdc6aeeed9bac7e0a74607274d05ff66a1ac2d07ba07674d120450 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_03-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f0c312ce7658e12806eb0bfd4a30f51f4745217 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cb470be5e3d3c00dbc72b2d47f592e2b165cb7c40f1df18d3a75b0e4a10bbc4 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_04-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b82bb6e8a7449f87154abf4aa5d1585a59154ad6 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb082dea1bad965b4f7decb70a6e813268e759ff6ca0e2b808185eb27e99be55 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_04-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7dc0148f9d4aa2726c6172daf5487fa900e50d2f --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb183e299a49b855b43b0eedab659992e30cb77fcc205009a44295398b215272 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_05-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0774c2fbcac3c21af63feae89de3e320c8df1a4d --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5448d69729e126f39c4ea462b8ee563aceb0e65d3453c2027914d3df44d9d668 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_05-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e13cdd6376a3470e3df0c424cbdf5a4451245700 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e3b7765c01c815d16b7497f8e2473450950f7ffefebdcb9d52a7e12a7a01c7 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_06-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c289f8af2470f0ecccf903ef86f95fca994930b --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b997d4b27f498a49933cd80033a1723281e26d1085a72c2c3657d77cf2971711 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_06-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b30e45eb522d6553e238112319b74aaea92ffb9f --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fa7020c8f516eab5ad293396190b4ee655e42f4d9e500559e196b6328ab069 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_07-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0831f47714367a4c334da8d517e99a7f826b740 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fdfa4ad2d61851935f5fe319ec721783eccc2ca8bf487c46059bec6dffeba1a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_07-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cefda3f2a5e58003a37bf2da105e40c42c89c263 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd6881c7861cf52b477be66e7887df46e3c7ecb1664ab2261364c930ce70008 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_08-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aade2b477948fd867328ed61b011f4c22f758918 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee4f1d1036082293fc70b06fee8e8f77a0be30a5ef41b18afcdd96734215270e +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_08-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c253e0e5c54410a3c27a6359d1cf45897e5d1d5a --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc8d269b853cf3dc9d6813256c3a803c4675461864fbaa5904cf2f333becbea +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_09-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2a204d12cd697923e8f1ea988096ec96683c684 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106ce38072c5ee9453e009e769237bc95adbf0e5c9cc48d972c49e487cfdc192 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_09-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fc3ceea1eb75bc60e9326341bb48634e7dcb842 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2398066ad7e1f2cd8e28a9ab7d66735423b9d410178d2b12f3b66d7514c46248 +size 113308931 diff --git 
a/4b284b21bc4/global_step80108/layer_10-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03c842a8595d0705f661e9d9d9f5eedf53c23232 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433fe1d18aeec4785b362ed1214ae09e5f101fc3cb035e948433493ac0a1ca55 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_10-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..197e557080b5eb084b2b6e6ffe6f79c166460561 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe0c25c1f91563f8c7861ddbbf0bd18a485a76bfb7e09e71555c2150b979279 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_11-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ded647d442ac6c10e99255023e2a583cf60e8c8 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59bb07be60b381fa90d2f2ce326aea7ce1ca1f210915e4fe78366d3b0db6ee5f +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_11-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01af6f6767edcdf552ef536ccc8cbf21291aa1ec --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85878cd59819672da2bb1861ae1bdd17bd92bd442212765b7673a55b23127d7c +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_12-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd4ee80746e125ffd7fa1955bdb37e91a56219cd --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1744aa94bb0c2151ffd3c0150a188a55fc9826fd1c06b3b2a56e948d39b5ac2f +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_12-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a753b657b96e8d66dccba93611669eff8e807f02 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b89c3440010646971691c8d14c8313ca3a45205cef09fc1c19cc97cc8c3b3c5 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_13-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4326fe73447a01ec84ef4e7cd9f291e2c5f42704 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed97a45265d2aca9c621fba70e5773e0eb1a51855bfe0a6c4259a1f2c6b27d5b +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_13-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_13-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..ef9fb83988b1a86d212cb4df5064266fad3abe9c --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1c1f2c63203363a723f6a58dc6233b88f589a5dd400d5185f8513bae97c27c +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_14-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e6af2c79d206ab3cc9767cfef5d0547b0c38d47 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c20cded832635392aa304f33b651822deaa94f08964b7abbb78626d5a4c4305a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_14-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..751df1db4dc4912b46b4a267d7a20dc8e09c14a3 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e921b0ad484cfc4a25dda42427b8f81dcfeed2ed0af14a355e0fd7bbae2ee560 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_15-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef173e6e5fb93d73fe0512aecd3913057873c92d --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3919a93c52a7efb90d2ca7e988db85af3d3da0332e224ba00d660b5c8e729c +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_15-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64cd99ab9bb6fee5166a517555b3b7fce8d49b26 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1c5e1657c130a5585d0cc5b9739360f810fce2b2e3968a76e0dfb899fdde120 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_16-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..629297d1a534102e39ac359e629c75ef8cd800d7 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a22ae76be05f2273843918fe3b0557a73f758f1bf9c00759328d76afa2853db4 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_16-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00907509b6d9669b0afaf4fcd1e250f3c6f0389c --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eba6444bd22d3fc31f9885a61a1d58eb16d8c86023da9c48a52c80c53c3622d8 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_17-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65bddf9482422ffe68307b18e631f3addfc4bdf3 --- /dev/null +++ 
b/4b284b21bc4/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03c8247226158979a0a92590baabde7eb41d30c52d8080214d094dd7c59cfbcb +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_17-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0744d034b98619aa7838f5c9d04915dfb2fd944c --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e567870c8c0f8833e307db5854ce4a1b7ebf2878301c9e99a9647008e7d27c +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_18-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8faa3642b2db8907505feb8b834fe5da412f44c --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e333d6f649e825a421a6b767aa57822dd2dd742c99b7f4150e9d379f1ed30dc +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_18-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..151dcc7763c9021551a0e690b1754e84bdc946df --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b743f68b1162ca3a926fe383fafffdcf081a7e4bf3a610386b33219513cd77c +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_19-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e985f11db7192a106cd59c562c7ebcb290f7155e --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59097b02408707a6b2db4dff42d11db4543d01d9e0e4a559bc9c91f2a033caf5 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_19-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb625d02ba2e7519ca4263c09b18dccac77e6398 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fab7bf0e9dbd72bcfa9f6ca9aabaf8699e0d59b1439d88b1d78cc0fb17c3d881 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_20-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4071d59eb1c9ce5e479e3d438fdf671fc80db04 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40635b82e2ed2d41c56afdc641081cd11d8e08ce65863913c05288ec91ea9ba5 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_20-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce0894adc745554dab16b0eedcb57f01d24b9ce8 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9b5d844625ac7fb0a1f7459931b73e9feb81fc51e237f493c6bf3e8332132cc3 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_21-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3ab3ca6576adb7d23d79d767ee99d7508b29bce --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c746723a5e87ee2abb84212db1bd78349778e7e93cb1a9157107b043140a4c +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_21-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f56484724bbd2a05ef4cb8d70d8212f3ae5ba93 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05709ada7dfc811696d039fcc1e3d98683306106d8d3e6771f17f9e076782c6e +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_22-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8ac667ab2e5186f9008b785d6aea2bb09c8c686 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8356d8a063cc7020b65d29029c9ccd73dcd1b1d7395cf5744246a81d00732aef +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_22-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35a96800251bb3bbc34b5f39c385615507420718 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0c5a524a5ded034963873879d0a924e0c48f2d675121cee39396eab848e1721 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_23-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33f8cc09781da69dc99e64ef73933846c0d37af5 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9d288551f49218a9714ea6059d94c178eccc2dfc363c881c4a5e158665ff80 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_23-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4263f5fc59969d1269ff3112b565071b586248a8 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c02844de0e20156b7001e259eb5732d5302a447b6288c13013355d6a092b1825 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_24-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d33bfb52db31c372c371bead80dee22dba951bb7 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16b75e5047bcb8c354e135592fd5f9cbc81550c5aae34605bb80ef6eafb40316 +size 113308931 diff --git 
a/4b284b21bc4/global_step80108/layer_24-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fac12a9922aac061c6c47b161730a8941943ccc --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2211643e06e5cc05bcde8fced998bec2b9ee81714e8ca00ff2cb0086757844a7 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_25-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d71b6a816494424406d876f2422a0e2c0cdad0a --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1a93b40ee6ee85eac5ed4bf53675ab7538f5fbffb958bfa255b8fef488ef43 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_25-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b584c27c719287f71b710fa6580188a46fe63f0 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8391c5d5b5193b4148310e2e43de5211d3a7c2568de3e790ae6b232bc9bb21 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_26-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca2e4364005805ad14d00fcff2b719dca3874124 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe744624c01cf44b0e493180808e9492c565d1cba2b42cf43bb2b70cced37df5 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_26-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a36eb8bba56a149a9fef88428b98b0372bcd0f4 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751b908a351a6f2cdffa6a9ca93dc18db13f193919766b36b7d470e6893ba498 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_27-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..613ba088d2196eb713e3f2c7c7f591a20125143e --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e024f54051dbdf9280ab7fa147dcc9e04215d59796cb4a9ddc58ded9c0ad323a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_27-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b600d0507c2daeab91148f550ef9e484d2e02815 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:890235912162d45002c30078804dd78c54b7c848c7ff590cfde5860d3805ff14 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_28-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_28-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..386fc8d4ac35c549bfc4f2d7533dc2580ab49025 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0004184b996c1976984d9343ef33ea75af08d959e22e52a08e14d0263c44ed6a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_28-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81ab16eaa015d1862a52a0b5ec3fff84bfc8433e --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0a0ba01a5a2ae2c530549267d0843361d8ada82d263c260c1a692b785404eb +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_29-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68e274b16f812a3b8c8fb1910508963bc387e6db --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8447d5363a417ae090fb0d086fad25a3d3ad76d92ef36dd1bec37d480221eab6 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_29-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f385093f36b9ddb582b81b3c226ae885d6cfb5d5 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0b1ea4d3a599c867aaeeff02114016ed707cf7049eeaa27be17680cfbc9ee1a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_30-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1760cd10bf2d672491858facbf63faaddcb47d4 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce2fac1f63fef909eecbc485daa5a616b8a5f2791db990dc070ce7519b8d4014 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_30-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6bb9534193a97581490b5ea4a9ea911f63e44a0 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35c20849bad9213e91a1dc55d59df261f2dad093504eeea4522dc59ef5da1a47 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_31-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..398e3cfb57e775ef913d565b15d87678335e6ebf --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b242f8e4c009a03e3e41759a3e90b33b54acef164173b3465985b605a4819bd +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_31-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b98551930ab28cbebcc7ebbe4af106a150ad5443 --- /dev/null +++ 
b/4b284b21bc4/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06394e5291f63cbb246a94ea2e34e4096fbcc119668da7ca10a15068da3d656 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_32-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d513f30e5c16bac6c2f52c199b5f268bdfe7357e --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0faf44c024f84fed4156522be2efcda15c5e5cd58bde0588cb61ad639ae22ba +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_32-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..584fa9dadb0f1237ab75207c7bb0a47abdfb4c5a --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6647c6c7d922123a9a59334f0d84aec7a2b015cc826c148f5efcb8d9ea8a7a0a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_33-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3028105e99eed1b1a4c7b109fbc3857104bcd8c --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f2816f612d0359d26a045d731b14bb6c0b19dc5df636edb0f111f6ca55b4bcc +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_33-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4df423e2a875e4f11a95e46c03c63119a6a18b3d --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6662da8f59333117bf01ca1d5a300bf0bde245f9a09c06c1ea662e299f754166 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_34-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89a74fbd4157faf76b9775a83be33066ac789319 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c26e9af29b2ef0291054a14b6ee2d37a217f84b4c4d4077076e84ef944ccffd +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_34-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..413cc0e1d8327a43dd3199782bf59d3cde97807f --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d17d174aadd3f6750a66fe2e60856c1eac31ae664943137e46d8498fdfdd4fbd +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_35-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8553714bbf73a9cefd2ca4d7852d48bfe7f4d393 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:65cb22070c61325f8a50212e733676fda74972c9b5983bf1d2480a2aa6a25978 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_35-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6e41340eeed20249d75f9cecfafd22af74c017 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987e58871581cf5c2812c493392aa5bb41f9343eba6a3b865c0339d55101c950 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_36-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91b8ceec54e11db53718fdf6b301db29b4d30b77 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6820d5847eb404212db017f4e3eadb8451cc133a8a3dfbd27bf18d54457dc0 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_36-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5353a75a0a7b7e13f5efa66c14e634cc6deb51c8 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02e0f5bb6084f6fe92431dacc72d21751a64d5d69cfc1270af88e530574f1ac1 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_37-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..289b2c3b16df3ea4e8728efebe6e5a5edc29dc53 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfb9b93f8236f826ddf53d39ccddbe585f06d34fd31cf416b1e15c50fd01913 +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_37-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42cab2200d44abaf7d93619d2f318cfad32d2003 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:834b85352a195a5530b27782697f2a0ab6d9820801f8cc1e0a6abc6e6d6a175a +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_38-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a94248e5c03890d1b870d0b0ecd30c59b28c060 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e392e4c9b3adb8a58a22f7aef8396fce7669644453b98c744271cacf592d84d +size 113308931 diff --git a/4b284b21bc4/global_step80108/layer_38-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8d76fd3745a21a3c1bf7453ed0cf6ff04fc24ac --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b880fcda8695ddb8eec098c9384577ea379cf5bc1818e279d988f7603008d28a +size 113308931 diff --git 
a/4b284b21bc4/global_step80108/layer_40-model_00-model_states.pt b/4b284b21bc4/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccd01e127a5f2104b1899f4abcec6aedc22d5c86 --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5678d14ff9a5048a9d3f2612065ab89c0a6666eff60974833d9438278a518d3 +size 13507 diff --git a/4b284b21bc4/global_step80108/layer_40-model_01-model_states.pt b/4b284b21bc4/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f116f852a0ac8405a0034d3b193acd5912efc7e --- /dev/null +++ b/4b284b21bc4/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6283d3ae584ad2084482c28c0b664e7a152fea34a2f25ec8d8921a14335d70df +size 13507 diff --git a/4b284b21bc4/global_step80108/mp_rank_00_model_states.pt b/4b284b21bc4/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3f9914bd4666b064ad32a747bbd5105865dd5f5 --- /dev/null +++ b/4b284b21bc4/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb4267f66bb04d0595cf8a25693801079646f7a3884a017023055ea9ffc5cf60 +size 51443 diff --git a/4b284b21bc4/global_step80108/mp_rank_01_model_states.pt b/4b284b21bc4/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43fe2f7709d53042520d40d85fa0ab9122d8b66f --- /dev/null +++ b/4b284b21bc4/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc122300a4e449cd5a8e829889234cc2fb3914a485f5352d338364fce046fed +size 51443 diff --git a/4b284b21bc4/transformers/config.json b/4b284b21bc4/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b21bc4/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, 
"forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b21bc4/transformers/pytorch_model.bin b/4b284b21bc4/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..23165805e551928cf9adf2f80507ec701f531c47 --- /dev/null +++ b/4b284b21bc4/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a664c394afcd9adab67ccc03194ea21aed662c8f748e80d8fb9c0e6c88f6ed +size 8781203669 diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..203b12c4b305310053a2bf830d34e388f4f6ee1c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.2871925225988394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02880794237734816}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07031750338322859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015362201736309874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3012155634284117, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004546846231718025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.10707093959955763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019099253416430774}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03252619427180376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009041180535715348}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14439430798437106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030595449553106713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.049917192299013896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012014538250113653}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06757287768017098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001414062975537199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2931917692240097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044525581749204875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10327080072990603, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017814345648663893}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06720725044881726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014407633141713585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2882066628071488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0042625595600572705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10238598559666155, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001793711398293261}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..749fcba083d25c10ca6cf43d6d169203705949e5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4674006237665374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03675015156688127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07708396436923028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001529013971135644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36235242098066, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00504305962540817}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11843411039548267, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001895876971489225}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035725346847754684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008827277116185362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17866943863024684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003599106083841207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05553061893758205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012200850248834274}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07294152863590639, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014064426998360536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3437168036630356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004711682138202734}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11228525846628372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017420331023114827}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07337404630465882, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014496612765965426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3436003452114997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004616609203187745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11267135213673385, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017806522868172628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3ae36dee52edcc81785bb87bb96e9d01fa67682f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5250778439407279, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.025625496064299234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0786831611862841, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014969718758498687}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3956998500897082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005131459404981971}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1222441134481138, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019020155784340502}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03682232020364324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009265440871287532}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19779728011829195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038163400420330356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.057331612844470456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001226447389142751}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07302274174131225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001338647584797816}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.36618568072469376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004622159858156629}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11353522248821303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001703018837429438}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07488527729009754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001411146414679001}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3754608054763451, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004798151749531035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11628410210009521, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017922853862293046}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c5e4813679f30429297c4b4ab8a65d39472b687b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6240971401779115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03840020245332954}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07946921950052364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015734983149428565}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.41443709705557313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0052063482293464285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12413425365513392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018801730074252724}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03727374629297188, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010068745541143666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20933728572588564, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0039729036741121115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05836966723015618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012404189838209753}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07340984582992345, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014110667213403662}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3803567082991153, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004624202286888448}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11463438747216832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016925693943895671}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07570628299965847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015124490142500056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.39213432148715394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004828378800565317}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11802649327609507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017907209912884955}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..71890f043f726bd71bc447ba77019d00243ee94b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6460958847523566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03922360003785139}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07760008602513783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001320690166501572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4207626130570328, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0051343858175420766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12269659017119199, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017597424238451932}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03635274690781486, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008330185169416092}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2129557453468375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003851289229933982}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0577700863367864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001158809400479912}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07107029230772488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001171387599016451}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.38200909854824977, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004464667845112728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11231459598044416, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015694568258752962}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07356474812940257, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012438027129333135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.39729543398826606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004755911479219925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11625392090888889, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016649015391818642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3fa8a2f66da78a729ae5c7129de7e9899219cd45 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7283147266727299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03506630990313516}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0800529618846568, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014030415087213262}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4360056147654019, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005237161872126708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12670702220068714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018353031821599214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.037349148159645094, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000853308549188945}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2217598521275178, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003994885901439153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05951196634046783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011799130523628795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07255538123365188, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012004887370386942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.39357574348180674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004547413362244599}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11485528722019342, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001589365738462336}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07578798351325475, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012955589973183616}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.4104972172441367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0048123434799543785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11989099231027077, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017190774588667075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb0f4eb3e0a11e9214cb0cae02818f7aad40523c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.09435580335073716, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002596237406338737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.1516877183797435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002163617861294294}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.08821581167136193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020280559575042486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.0053159784831301035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00043010712261271663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.009522281054005119, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010761109699630326}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.00537375341399136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004491934033010578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.07882470114338755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020788842929298805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.13940682930714066, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019495456097635088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.07551459506478578, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016129890419721987}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.0777715925714806, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002284643122449491}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.11130068960104114, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018656577889789815}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.06986508795300185, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017500362429631329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.16810516169972559, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03835304479839481}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1ccf05fc55e26aee9f4ae4e273c0b7ac7f98b6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.32500435790610316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006219134826883584}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.46243909963396884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005727997203394979}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.31536194630864184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0048986014551623686}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.15832835758132308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004379479319821069}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.21989028843694963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00443907246545659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.1493719195270224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0034781292314063805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.25838083394802175, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005195219904829289}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.3797932387763246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004882479498294952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.24943460339721188, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003903652479227547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.28371843954863074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00558326351046381}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4042180845615538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005247552299585261}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.2742602935438785, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004363579640120447}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 2.6466191277757654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15100935076720456}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..22d40376531e3e879b17034ee1a5686d614627c2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.616964955469801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00592579769672606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5254416101366304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004868870996231356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5130705309379091, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004347345042742489}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3649050910641516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005264819050488393}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3044911557323109, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00442897115331052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.29741471013327875, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004142260867688268}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5063613364405887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005565487436877947}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4323292288126833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004568270840466541}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.41944130358080134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004138185147643418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5436203304123873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005624136142894095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4628187968326305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004668999589191003}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4498297629937119, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004109002891379284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 7.5083947202338726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24848690178409746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e4e8804eb6f474ba62e0098659006e820099e0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6407710071562776, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005336259744860889}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5360462579768369, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004707261284923473}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5412014771452126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004008094970635858}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.38414086267020836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00510054295724189}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.316436535679906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004429375693886751}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.31908027874923556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004128300468278769}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5276989567128523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005209222935383238}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.44075459787644217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045004795324398335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4439501498676375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0040343196115624315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5658065072497239, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005198931528391741}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.47242630953998244, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004540483814405851}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.47592175675183224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003941128663060527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 9.28684502126115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5074769619297956}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2edb5ff4ca7bd09f24490ddc2331b3d8d8cbbb21 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6472264998572703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005232105513902178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5366498372396236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0046402228185357716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5464856118324216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003910057721192816}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.39097181878893417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005010695619680332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3220332631389833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004380478546023843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.32559945531385337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004008792699742887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5313681099482047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0050483302509796105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.44300639511893825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004479689083403708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4482338075737081, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038829126617500277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.569330259899693, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005059793660411658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4721692944626995, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044818770782843265}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.47944694202870075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038182134462323098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 9.795110512629257, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.43643537944667626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1f6263c8d084db4c3fd9e87b0e46aae885cb4b63 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6580257286903278, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005154899649610462}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5377144440002729, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004751931821359596}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5525906757014365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003963242953623838}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.4024984504821446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005105136616228292}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3256015347638036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004546014562570848}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.3330531711847648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004179019933874763}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5454660436185705, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00507487374817911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4456656435992453, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004577146376256788}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4568474950562419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004002759515298098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5815729850402755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005083872222185427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4737226545590055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004553659067486879}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4862086181416132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00389366720553536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 11.239061696658661, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3211949581138453}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5396fd1d777b7f6826e2d784bb070926b128ee64 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.13642739401224172, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.019132472003375194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.02512636113552703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007036632388253778}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.21863243644000946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002521017541213538}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.042903100190703786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007010978773798984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.002350066060168301, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004125290354570353}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.01850848216392922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001239122346922424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.0034599979122394713, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00029151015532393696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.024907100271538515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000690766936930914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.21742396152099233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024595518988296725}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.04253972308092736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006683881645207197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.017035317660162044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006541825206071444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.15198121220346103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002036913011848439}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.028854953753925324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005590058998891695}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8630ca1bfb23e76d9385c991e811d7954a76f5e0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.2344773114465395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06144744886971806}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.09359962530750512, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025105917511609984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.522665803339837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004850735788452025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.14240938766090952, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026994539955292815}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.04032025283529164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014131816715617135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.23323991686673376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004275513488096086}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.06073259012334097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016141608477161177}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.0769023862248869, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018572847386138004}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4625095347149072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004342267829958794}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.11888572362932227, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001965744751513682}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.08184718121577335, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002207850839836764}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4608387006930489, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004681197366055284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.1245843999084538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024393680091218312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d2364e8466df1822f9af630ab186092344f57ad --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.6773713320753056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.055884247961949976}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.17374754160822523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0050939685408092975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5730195532268542, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004521823239131311}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.2057962810551555, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003760812343326168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.09140223665380956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0032890628357242144}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.3074558179703969, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004375074576693778}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.10509886424101751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002496931131477509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.14133526089953277, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004177683006061348}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.49304734486333374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004509829618388591}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.16838171411089273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0029883563268557796}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.15428774706595433, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004553930547293362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5140723424370337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004433969406651666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.1825157877706738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033370782552026696}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e71ef5ca502ac8d430ae704a83b51589b25f9c59 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.688973732407593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0437940722424649}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.2263783160965279, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006171991779324731}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5624565194412028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0046331481122687}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.23997327415870215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00426770453497841}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.12650521502940273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004187905052421771}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.3116380965308516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004367945574727286}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.12855310203356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002943917792986252}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.18506548497150052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005182255852735757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.47864760075928686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004626469457756582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.19630420632254614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003553339268118638}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.20150484374278746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005543691944915573}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5064505915998677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004511689198493829}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.21346988106806253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003833256489070223}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eb9d54776385aedcf906f63f68239d9c3737642a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.670081659519132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04318517964945629}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.2397708671243343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0064228338007496924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5485806406380264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004720768862432214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.24712078753708633, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004314381507879295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.132848179955477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00420337986829722}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.30940781093399633, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004367428964629067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.13327902883600057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0029460826534397667}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.19425619955945797, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005325133026757208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.46533329832959947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004667920119273198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.20127645493389038, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035809225423920244}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.21110404564884136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00569641155080923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4921812900394637, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004548259054500709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.21840163342192948, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038191209490347515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3e37361543fbc180f2de2df02e77a18d53c21539 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.7150662066576599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04867244279674884}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.23129723699957813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0062156402527426885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5532761590144792, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004718580529051603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.24413807121404715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004196903213714051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.13131503132685515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00424768261815168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.31689123692271287, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004479228620477408}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.13440771646832117, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0030479854784096305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.19041781904289584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005317512197816564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.468350868728647, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046258788947290365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.20077123718465184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003610791227071776}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.20527402650433593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005591111023710644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.49732487082024746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004602620634284861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.21698522412439886, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003796119059919846}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3467e100097f72a2d1028b08dfce1f4093e5e0cd --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.013350416749489698, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00028057458287476263}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.11566374488953345, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001749855882209205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.023310980636958793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000452486408355606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.0004485308245799727, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.901239884774927e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.0036672786059082207, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005157087714575659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.0007679014767691473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00012819459126050034}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.013285581389076958, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0002656851189258093}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.11534380952368356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017118106609243738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.02320710445968874, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00043026318572661666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.011828106724862774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00022725277636703424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.10529252247490435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001497843448633412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.020705297270136803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003655496821761756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.009519886388116046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0030236756925875077}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6a70759dea52e940f0638fd087bc4529edf98dcd --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.18596600590889065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003645906867093087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.7000841391242195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00407563035243576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.25888123233754007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031768498729056785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08857846914965355, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002396373719643474}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.34773296216867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004163769887333789}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.12187611759524282, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002198007938376146}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.14607148785062837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029509545373158728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.582387391289673, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004300891719123408}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.20438070771678543, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002413318992124119}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.1590760662106541, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032411687263806696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6045971668841404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003947144030126864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.22097954689386484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028005845900640268}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.191415616124935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09680100323362864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0429c26caae87bcd99512a0a94cd5f4171873d12 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1980304639632034, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003492199137777792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.7112810316711771, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00376166113477323}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2805422037797494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033268027640946463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.09886442937653378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002345021679529973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.37290220171207966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004228119092633622}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.13915045372935977, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023234212020214113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.15240021506621623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002802864425131344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5764693524969581, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004032933171743271}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.2169472494379018, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025420675625343024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.17080807902531406, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031067593345175896}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6220206309306955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037970602252338824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.24203406785259418, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029372092841247473}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.8569076256551087, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12052575313613716}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6aaa8fba269f3bcb90b673349a1c7577837915 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1933328502928586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003099530624368226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.7011350938451281, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037491005183801886}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.27975069293703747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031284036937637358}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.09519751402031547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020025816969127396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3723729566418955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0042697919686992426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.13862295274121508, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002135737901854583}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1470822171154783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024663239297952433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5599189483902042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004020046406729631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.21407754529660591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023998887716300023}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.1673033495587541, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002759668112872226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6165957546031788, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037550091734433974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.24241624159869068, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027709885961488493}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 3.009544719305422, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12091914294594716}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c3bf7995de449f94ef86e2523bc274c0593f9f48 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.18185566077745052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027072441708704876}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6878760207615766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039399069406699195}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2677925532107245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002981957765622307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08953701779489043, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001710625628350317}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.36720908036084704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004308436966018264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.13292917476889163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002014006244700826}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.13720570266102783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020576199930684665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5490721849829181, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0042054270913741984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.20379974606633744, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022399193895611676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.15812569287913272, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024138703089301825}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.606968571968092, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003889795075167132}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.23315369915847917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002653830698926144}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.9471554297303775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09562350894285683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fa07160070b21ee1fb9e02c7115c596f31c234e4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.17429298605765095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002453866849905799}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6874234755183823, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003958252758008644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.26156722580164604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029559487433299535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08498484032096375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014987715737771314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.36659589385018765, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004439215725950152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.1292376475089326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019765919490193793}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.130533611669738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001731325232954926}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5475992486934416, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004197239234834233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1982820935914217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002156722479988033}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.15152370925757377, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021398507103325367}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6088523048467857, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003930409593530016}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.22805092628919627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002601257261272881}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.8828869742378753, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09032299188645451}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7a24e2db4cfe0075f02cc0fa624e8dc98840da10 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.10655760753679919, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019077191437516404}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.12089604584843483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002329629845339565}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.06975157019283658, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009137011093224832}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.0003312213456760952, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 4.9493114517125344e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.005980525954602008, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005793862846553182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0005675109134810492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.471478866388383e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.1004586755525386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017955289781684024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.1184027498174915, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023585973835660435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.06626241359825397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008570407826010054}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.1014087241228341, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001861610563145598}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.10839827017927679, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002044855763678236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.06538396642694783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009096365360827909}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.010910686930411773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00030602545126491815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd80709d59dae886177affcc2ae9dc3b7e05a3e2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.08062063267183606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016464001390685112}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5804077018598791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004705014575654514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.13287078506554892, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017687380102785963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.03519804726011007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009691687220744844}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.284901492168855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004424090476216274}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.058019313464160664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010621558481173597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.07336501049936618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001340183479865301}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.545991165694801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004636447373365752}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.12184770112424802, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014608368603050287}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.06830124095667763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001429554164098288}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5022686537167637, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00445439816772978}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.11269533007372474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015437562768583104}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.5758893404824672, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05552280312712737}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3973cac4e8a208b35f370ca1df3156ec48599a35 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.3420015606083595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00759844434394234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6045685729269148, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004463808851257848}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.3340303326789718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.005770276552964151}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.19480755638535052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005308301829142384}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.33266720411338324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0045078698777430235}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.18536983565498788, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004299838463484588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.2821382753377313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0063280271310375115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.532955297379096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004605656886036717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.27872869979256815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004804872599934385}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.298538665654306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0067989076976235135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5279377913839256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004350056352869954}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.2902240542226799, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.005189448695060585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 3.229929819791474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10809788252140991}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f52b7c07409672e7cedb6653e1053f2ce9bb60bf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5037779287889935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.007188822322314131}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6013235088491181, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004702106644962896}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4624643695844432, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.005423092759830048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.2967411625801182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005575768515677308}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3427937666222623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0045155157665415}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.26767500932744387, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004580954162492676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.4061070236594833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006289328700511993}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.49351648432022127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00462047959595513}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.3727832655109453, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004858552003307264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.4402820106855099, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006558923231150342}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5309933184677589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004638055543942346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4039703218874318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00498186857977818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 5.459046430007518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21326177813295816}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ffc3abbe7240c9e58fde0d4f7bf560f46bdad6fa --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.45806774309488824, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.007318355183210045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6241468092977022, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004668663969159961}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4340669909812108, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.005456429814747237}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.26701579357766814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005503172843923408}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.35329352839698064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044599483677181355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.24713173271908043, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004395624838836395}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.36755677377705864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006350405645219005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5032053457704092, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045314258456629845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.3463395333726421, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004817987549566419}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.4016479351386044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006624569651668937}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5561992816740847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004637830171648178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.3800941783693374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0049021744640361225}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 4.768286137370153, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17352437696704526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fcfaf2b7d6429e44a6da5ca081ecfe7b8c61eeed --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.4012912849159947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.007332717541607647}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.64320056686215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004496400900386177}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4004976431750891, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. 
(e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.005533519853929168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.23173467180213086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005322008089541369}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.36045746541298906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004418126556266405}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2256483945335245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004326347089488998}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.3181899530902145, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006233483121725614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5124106544670798, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044121171030032285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.31629484401516644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004837305063133512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.34997948174033583, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006495798513744922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5748853093658587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004482155650671063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.3503718103768193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00488155342773186}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 4.0649225050991795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14182226731160913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1afe64c8321fe3750938bb078a7510e4ac830a76 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17814038490300868, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019339544316495465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.3141329562889899, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026819443880949302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.2117111727875298, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018438088393860322}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03845870479092204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007912707782195452}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07088130315681604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015849066154849387}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04599914076874335, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009026810349493257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12177527865393714, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011744038036189456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.22582257646250492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020828750946991857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14701777739136027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001160350487915751}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16477812405030756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017822475337332542}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.29175188388878937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025122798019773414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1960912017363633, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001706624400289841}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.0534810991559866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0666110922750702}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cbaeba3f3a51dc8789430db187a8f16ff69bc918 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.164850593284305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002029018039576436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.28405561359995446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002857993695751241}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.19313302430200963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019316051118957926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03503377865570394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008530621853281363}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0623540484617142, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001548287542059019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.040864899057913275, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0008899257763011835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.11907730193169616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013816606669132387}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2131003193203166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002234666822976161}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14097329346444698, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013041417812161638}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.15259882161084798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001885756918977423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.26389227182331143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026742133450759734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1789179111545324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001788238357802055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.8865327529226577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04249976221266161}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 
16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c8902ca0cea5f75d07e59e9fbc36a730ba858756 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17401161171600613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020881821450366707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.29609796476117983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029221116925695636}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.20240023881581198, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019530480366369383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.039624014311144054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008900870750819028}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07078627112644985, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016722377383979908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04643912783550571, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009627363006805538}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12735865745665292, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": 
"Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014154169873442247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.225450514063261, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023420645348967563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14993944604674414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013384024265687326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16080285795577998, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019296398265542425}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2753859473796806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027696876252129017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.18742123863337812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018142650005546468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.250874940153623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08256417332234947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 
100644 index 0000000000000000000000000000000000000000..a77038955408c08b0c042b2a8c719b33523d3dc9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.15417391268004804, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023622060928554494}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.25177591335426297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003390550226163827}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1732009883115761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022644018750802943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03643560093458038, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009248695850334793}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06270148510981541, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016134073336328298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0414745893981831, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009648670616110777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.11525851431820534, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017153378060412703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.1945255102820067, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027243009700905294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.13035603600788886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016318784961578442}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.1433709185752066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021997928215724285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2347702913915573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031942105153544165}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16116163645425682, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021129773225797956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.419872373280389, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0632053559231718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c7e80d36f213e28fc62ccb9c55702c619d852a1f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 
0.05190785653580835, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020577014263843353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.08336762757152712, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029759640480461146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.05560797193518763, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019328666401600453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.01264978957372063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006811352498881881}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.022501646103038838, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012662878267261353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.013892451555974773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006715642301954771}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.039660746703855165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015372422364523625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.06621442775842074, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023962464441846846}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.04291751591005431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014550040577974023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.04808436429768246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019131925346893503}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.07749318488885995, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002775519992790014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05153615578362743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001791767114164613}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.5407149916214953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04531884877289389}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..91d3572ffb42af4359b12f07c8969f06fce27096 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.008747426074934984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge1_precision_stderr": 0.0010334072389606014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.013166466939228978, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013165673788761627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.00858766610820728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008521488341725035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.001993543076039664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00032468030555408186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0037795884860438948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005979090260362486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.002181790528509318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00029155550977350524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.006817828377343832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008550758600300307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.010456992886461796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010697198590483594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.006564407691410186, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": 
"Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006360402903853286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.008318710239963927, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009932928437688979}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.0125244294943219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012566507407968427}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.00811170366067559, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000800158042567113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 3.875581456441756e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.138882234754457e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93a4b7fddd56ccf9ad3c07ae9e0812d3b6b63e78 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.08751592392277548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014885876589069494}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.14641105372315416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002277503755216182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.10170268353078239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001552854434242085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.012652600762562841, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004575918826167268}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.022474938777637253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009218950292896009}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.014876069868397498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005216997847218128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.0743006100872787, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011597328400413602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1279023695193692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001964736141764416}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.08720494916711495, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012395877618178168}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.0809683487363323, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013609972489967807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.13640036278318976, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021233241725256875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.09436348999877432, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014294898556339143}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.7056427126817756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.036208763595939567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2001df60dba67d29fcc1bf1744ed30ea26c0d0e3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.11704708597499369, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017208124581957302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.192663652857006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027090117742870658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.13464216389707748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018008035510529402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.01695970372606848, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006027560326870975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.030780517480147162, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012354005199119644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.02017226312757468, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000714000343756126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.08787724537079063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011884855038448482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.14920400372412476, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020664312372579213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1019122729108625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012525770397120181}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.10950085735058879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015902682243807002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.1807018882877703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002516295905690825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.12602710319462657, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016626827697359793}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.1951746948073685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05498403783354158}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1cae14b7a80cf9614bba9ebd49723478816a1193 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.15158367096384875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002160293878134524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.24779508708098152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029503505282398473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.17364781357215728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020669795912559206}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03517592630819259, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009362444811665309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05870812861597759, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015348108636689321}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.040268213401871214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009739966672053971}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.11724613742268182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015394113402297696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1987576493460671, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002371525557855466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.13599583814052624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001496120401269599}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.14002594651258354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002012494612958899}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.22991508782822548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027786907548584838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.16058302443864383, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019230855651408392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.3341861618808797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07849200046628084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef71cbfa19093da9149d8a3ef7e6df9a514a25ec --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.14222703095938236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002476470176258003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.21389332067876604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003224457836713378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15258466894138525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002231238932667558}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03453726675386056, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001111852850451432}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.053000124965507024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015287742053960468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03686577574547315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000986211178727786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.11196872227962629, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018972303986128753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.17347582553687865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026374790749814484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.12093827771832451, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016818007172860084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.13231796413662955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002328686040271986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.1992898897205879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003033843371726955}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14180007256699717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020827290155971973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.5938240082605772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1039770016299349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1621e2ad03d2d82c52b1f4d38423dd27271b19ad --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.04866014150454847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020483076366846933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.07060422651814766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002660644139013408}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.04871798574935136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017935599450487872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.012614536951186476, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009004920229567338}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.01860558990495083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010812577940661158}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012358132206090394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006693685268812792}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.039145972316534536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001664987060904554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.058137528796694114, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002223198690894717}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.039216854579977777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001419722439788474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.04530693688620845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001936598783228226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.06555591615366842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024912463291508627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.045169216003218556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016700917030067347}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.4670924044746616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04946249100806658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..74a7a8532df205b98a132b46502c557605329eab --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.008041599675695846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009393646375217224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.010590450930500163, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011298193688755148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.007375281725767917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007598214513616661}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.002035037570270077, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003375852776151478}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0025466017472514603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003590177860459669}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0017789109729195612, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00023905673552756363}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.006463314313217977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007367508919664071}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.008641735031241883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009172071264531992}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.005916436260589178, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005951477346238279}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.0075302854587578905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008806851003942225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.009830760205551979, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010421634809987866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.006855742708837078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007014519003240475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.7431796693295205e-08, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.3491931466841714e-08}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..75b54eb322b20b35676758cdc56e35830dd8716b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.12685534513451716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018887365773073797}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.19257637263679828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021883455691780597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.13902820692376117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015977409115201353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.016462988916567202, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006707808949734338}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.02514451000634953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009497451781410877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.01758898085911187, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005845142555911145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10541733933259849, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001488446061768874}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.16496936421708008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018405198920376908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.11668382606343859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012164143612498303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.11691349355480424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017447856277329906}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.1787124652688377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002037393870462275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1283175804360348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014569156963222744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.5934908400066984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04577158251574039}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b395efabcdf80ed52e01df355802a6dce7045e96 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.13945230313441068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018821753367623763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.22481790415194117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002565910341383482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1575281230169748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017363328564521583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.023062949689263944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008856830610119579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.03747209579391718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012382175456694833}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.025193685294665726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above 
in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007407727831149609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10342195238697749, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013835117875348828}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.17152534934129882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019955254451948295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.11740619203596399, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001212497657479112}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.1306018801081659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017653906193587606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.21110368454548847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00241423606851821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.14750316226109086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016133291787871627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.3183556866717046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.047701497933856064}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e67a884bd9dd0edb7fc5414860630acff7376bd4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.17257284656874033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023886955187102186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2624694392412262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027556283158437244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1865965522646182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019292479914873961}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0386635351793387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011380738975515533}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.058135448256113796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015225329835211754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.040241799290576724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009520200071847311}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.13532659869410305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017966336725582315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.2128994118965257, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00228475982913276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.14758577085564545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013996716496550154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.1602504598601161, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002234879306428014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.24407970791374364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025830465673148976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.17307145674730884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017845742440570694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.251008452787017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07741675426485511}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..84b087c82b4f7e5fde2120387688a6c692fb20e6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.15910068424231602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028942512572503855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.22204167759688737, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0031930213084605565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.16004715198678016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022371069572285113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.03901258316156372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001398659027827721}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.05258690577442277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015262833449626498}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03722078217429424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009797813487457106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.12723416561359413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023377769258987246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.18222910972085204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026727913824271736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1282770493927655, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017127973815793287}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.14797898460424116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027432966222511084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.20640960313414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030075792341701397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.14829642927866662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020736961426557118}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.699909605867917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10552509596678104}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bd6aa908c1a458cd71de5f010d08c81902ee3f2e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.04968775822887688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021399482740943475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.07040614521918066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026795450412985373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.048912234414672136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018303249700878453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.012311807543146269, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008824124738308823}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.017322028166765847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010491107468874295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.011579301131776956, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006430202421005574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.039942899566434244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001742840540946017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.057791070453037226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022301074454849456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.039162591548067456, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014375174353609152}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.04604352091174109, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020115234889020277}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.06491403804508811, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024844589212858962}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.045047671152385296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016882686974440538}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.3436540247746996, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02229209860808525}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f15e975bd3d7d4faca428fb85f8058005efafc9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.006943053713544341, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008438126140001231}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.008894699789752037, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009656196544936773}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006704558536864876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007268399140477246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0016201459969527312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002822313186020891}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.002057657522078538, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00031150670601567205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0015940878813423497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002489498244534957}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.005508592130950043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006491909281406889}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.0073750235680540436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008073064738323795}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005412786572883017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005765647323043862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.006472729964406455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007966089744646425}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.008284399822459577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009017405138743037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.006217775628514679, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000672203252135069}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 7.509001958231661e-10, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.9581849644344306e-09}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30c0064f7c321ab5a65eee4f3939099037c579e4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15222794945796614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001854391491208858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.259279440153063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027052349535794085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17809655126532312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018236388559653763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.030497998250497895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007413435249853986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05425261740808977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001399269212332445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03601951697280678, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008300101222283796}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11697977840134756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012794261124303204}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2062333696623697, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021597198817625285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1384741786574652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012894950527783321}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1389877568936686, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016763555392295747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23808538118300834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025032807703488534}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1628530750095549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016530415994811475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.6192080325529026, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04050520870343064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..79d33eabbabbce36446bfd5b67ea417ccc440b4e --- /dev/null +++ 
b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17909391212556103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021684904700656395}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2986428547454232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002901154619840759}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20484942755932814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001960405666708866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04098672752953724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009402949935154375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07250823843480829, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00171407343966866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04757609861819433, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009961877982725383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12586154047693615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001447455908294373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21783174447787615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002284390945146576}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 
0.14525233898326748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001303267897245817}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.16826083836269604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020324268022024173}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.28157870760475356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002752878166728396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19265194446101522, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018360173432108782}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.545643626821724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.048869510836124584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0d780fb741c84a20292ca86a81d6dcac921bf351 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2332474242904849, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031768172038711467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.29410268890498986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002958828414438297}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.21836409386191818, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001989518194370473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06566711930000235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019413943466877388}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07670871046547496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016787739182338268}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05650249608530642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011487340298214372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1752085713341631, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026584779502129303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21819240455646285, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002276168106614223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1603606067900483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014157142739287894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.22112816157170134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0030769042746772593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2776979400072693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002813335835445409}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20617794726883687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018867289872942854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.123719853905042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09435758546693512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9248abad81c8905741a35a4c97874dffefd9f48 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.22344987630747953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038946320752308397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.228687908011972, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033634865916312945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1819863863181494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024282963824572346}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06821362116644611, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023091245896002433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06243742554002245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001665626169617889}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05005886014366939, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012507484027525577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.17380086539073145, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003294779021368952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1728818390227638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026287282834976822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13680266609806765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001821525871477116}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.21200019749157653, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003735810714751958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21587781354640423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003186682013692883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17195845477450572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR 
in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023005495305984053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.5006949715383118, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08071997944617754}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..08550ff1c1efbef3cba0fdc093530a07bfa91e46 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.06907034391802865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002842925157819051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07641790711530233, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028071608967584296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05800966504497334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002055695120200674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.020323118606363738, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013765538659170954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.020689214217856734, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011098261782535187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rouge2_fmeasure": 0.015594437236270214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008033551932234715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.05326819078634553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002300594360643888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05823666899257084, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021806250615139192}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04363217232305618, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015475509311173715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.06512227133445396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002711627122094951}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07184593801381658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026558284841563095}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05441077080277394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019346325978603714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.4203638205606742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04062156232814086}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6569f88625bd438b14bc653311c6cdfcb0524d14 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.011141473368752484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012984382630292146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011356368661606248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001166446625113568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.008814606254748622, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008820141996097188}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0036979579695363266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00069734856115032}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0030116660387216835, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00039145176963269976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0024833328621297794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003381158590391299}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.008749076310088717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010822814361618575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.008702418570223684, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008937526565969341}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0067361341505456805, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006778741138953477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.010561687151292019, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012526496630397305}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010693942253000091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001098581608402249}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008282398875850052, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008304343325046589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.610620675193976e-08, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.336079313722214e-08}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf95e0f1928beb9ccb12a911c334582d095873d7 --- /dev/null +++ 
b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.10260040509975965, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016816885739169474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17029243506464356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024565476844548746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11796833391406458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001679540196270818}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.013771295330145835, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005405889426764763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.024597820085107556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010575366347068554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.015984403876231276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006030736806193037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08487885507917314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012383575679524605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.14493899736320978, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020037770730611786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09851081780240795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012485527286575612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09525971665769668, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015577333350387276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.15944167499687736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002320656358626716}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10980074384140354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015599379035532135}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.7950296589720617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02515184874378254}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b62c18959b0f33ef3601fba4e84d50c8197cf5bb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.13088003322560143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017327225800732847}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.21418661344518894, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002499792501862042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1501712303062816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017196466104618196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.017915022495063486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006375485225038393}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.03152784532661522, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001233712804278278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.020818331143036432, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007112541235966938}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0927062754049998, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011190335418292052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.15760113826039301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018824698988046046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10744769510485963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011180856260891987}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.1231860191540673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016113685375994105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.20193059778802494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023296824464227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.14140181753666214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015942864602239222}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.1020316820663745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05702217937857135}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c32ee6d44c0902fe20b7caec4b0be0b45842700d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.12906028226799743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019154820247282277}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.20575231428257332, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026866298708369675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1444547659918946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018306868593415573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.019627327385611428, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007723877693570129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0319483746867405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011670173723061863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0219486272286028, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000730523081944293}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09457002128612854, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013340023606244708}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.1551264273734134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020478473866823725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10633076283099206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012465424058332557}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.12068760136915536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017801778755423962}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.19339826374990432, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025250499570330727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.13526431674777373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016980499334396653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2530485760326848, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.034281822221775095}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..521b4e000c1d120cca834be1401fc2bf53659ab5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.10422127953279672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020439479764111446}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.15837206489083266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028160022730325317}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11206218209748141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019274598908821678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01585990327947335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00074827880117735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.025799942301719232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011077245741293905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.017248853072901214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006648333827724484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.07759975361518856, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001488357218316451}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.12105717330360508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002169577022250882}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08351024617373054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013594205425130031}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09810666829139612, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019260722064131435}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.14946639296314937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002649765078910915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10540334302928316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017989673027563617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2880401565259663, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02892700209020493}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..94cc766e34e421105118b0ddb3b7fd5c5ca0fbca --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.025833527508072014, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013553627152093723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.03917838397118692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018502036256180985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.02664836221163967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012443957611582865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.003887727371925059, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004542038297200875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.006181327739501699, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005609017845600248}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0040223276062826534, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003523486305081411}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.01966558062594936, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001016340019434063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.030934646851062786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014674198673725897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.020358325637102978, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009200768723482526}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.024201111892969286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012739866254506131}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.03658490742322415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017229024892846027}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.024854830906791463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011586779799544313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.05493255544599618, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.004843136897563013}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d9e58d984fc3083e7d3904a033df6b8148edc1ee --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.002265233515641718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003579734134523194}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.004150900437862034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000611002349583528}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0026710840623148636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000399411300114256}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.00026974488596675497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 6.71291598724476e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0005227590853525244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00013134134677067233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0003338217989499952, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.131145790626592e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.001728264703351125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00026158511522949576}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.003287044984691778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00049276938287603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.002058167115063423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00030026229552540865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0021438711710157696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00033684195204710315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.003922128301743887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005753087418442966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0025284149541073293, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00037733824330625027}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 6.481159719965521e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.6449993743474514e-15}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3740a71c77932aba9382a618cb5acce1e45df55 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01486539538592837}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.32, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574872}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..11e4cfcb5b9a5cbd50fb27e306bea5fbea8c3e26 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.355, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015139491543780532}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f37e9923e1f253b9e32f5563d054b2d0cff15544 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.36, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015186527932040117}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..934df2bdcaaf57605e0046ab882a4c01a12c0d7c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.363, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015213890444671287}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.353, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015120172605483696}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d95f1c56eea99370e1cac7df3d8d6fa21300f0dc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.349, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0150806639915631}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.353, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015120172605483694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..83a7f3ba8d5fe8ee7043b7b141796312f561bbc2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.364, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015222868840522024}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6b3ffbd6f93c3bb803255f7bcd96b35e5a1ad691 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.349, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0150806639915631}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..095681bc4e9534c7266a25883cf8f9f48fbad76a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..808bd3824af4db1534df28d471fff8dcc2a3af10 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.352, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015110404505648658}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.346, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce508176e2377bf6ff37b92f87aed9094ff272f3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.361, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015195720118175122}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.343, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e5eecacde24454e5e75cf5cf425c6dc045c52498 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.35, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015090650341444233}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.344, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f3d3a32ed10244e4553a2d637d799371ee04c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.351, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015100563798316403}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.364, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015222868840522022}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fa677f51ea3be786867bbdf1136caf363f07f2e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203941}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6453355bd0a3ceade7a5364f4f2d05ef7286b15b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..64207a1f8bc4b54f298f1bcd5d084688c66f54c9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.355, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015139491543780529}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c860442adc10450a9fdd8e528a7134b83e7c2408 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.36, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015186527932040119}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1abfe9b5ac802023d2361be9d7ad174afec88f38 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732956}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653598}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4becf274eebeb5257ffe4813124a32523f75d41 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224482}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae3860db7de7cf670208e1d7adbb602b3dff5c0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811494}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8373eeda35d402e1c0a7e871096b47ea0b80e32b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.343, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356953}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1662521d7c1d4e88b30c908fa15aae634ec7d28c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574878}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..118b54c1c3337fb07461aa2867013a8f24a79124 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541042}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203938}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..86c1b12791046686364ee43a81da8ce8c7d65085 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811476}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..34b7127db23b870dbfff1b44880c7c31c1678f3b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456732}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653609}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cd707521794ecce3cb37d95e784be2a9c09cc642 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.344, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015029633724408948}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dbe1249d99407e85f308128e36a531fe3c10eb1d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fb4e8b1ca102c4563f105a53334d9ceb3ab8ef73 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.351, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015100563798316405}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f496a9753e065507a20d4f588a7d7f68620c372e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015060472031706618}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01500870618212173}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62e6930338e2391e8ecea06b5d37e17732e71eee --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203933}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653593}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..43b888b744201239ca28c9c6d91de05e90ceac8e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456738}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..833a35080946c16e34be0680889180a29d155516 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0ffd5fb775b1f1a41021457a00d88eaf3d7bdce2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.315, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792515}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087973}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0382166eb47d64c53d21427c1a6465df1ca9331a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732956}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880213}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3339081368a53206c92d24f665f45453edb13dea --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541035}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880215}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..949c1270c7656039b580b1af9681a27404b59007 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363933}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.304, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01455320568795044}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..03f483a3ea838b1cc0e507bc3a5428a6ebbaa409 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01472167543888022}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.311, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014645596385722694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64e29a4a861c43e3bed33a40fc49a537dd977372 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.344, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d4cef839d0ce5eade4c1a916f49e341cc9c7d670 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..866679f90ac2c47b0723094258d8a08951d5be3e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.316, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057127}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.323, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6e1c522bb788e6a656d64c2040ff5d8f3e5f6d18 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.317, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880219}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.308, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014606483127342761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2e6e0aaae819b5b159e962ed8b4733b3f8b5e1ea --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.32, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574883}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.312, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509005}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..12b8b3b0690dc188cd984e8b3f138759a67a596a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.312, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014658474370509007}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.313, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01467127282297789}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d7365e59ec1d07ba9217c85a896eee72fbb133c1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928369}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456727}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eac45e793b7cc1fbe3a199143bb4a506133feea5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1fccb32d01281e99952e9c065220eeb391b3f47 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738868}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541026}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5deb7e18103c0214d7cf42a5d99837c575ee1684 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738863}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed0a0c0def72b5ffdb94ba94c4a84973b29c2bea --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087976}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574888}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f41906f5a08dad38c28f27ad5a12bf5ad3796d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014770821817934645}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087964}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fb5255811d7b4ec957bc5e95765df3ea74f99e7c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229859}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01492201952373297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5ffff9199dc2763da53be1f6c6246dcc7f4d4af --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.311, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014645596385722692}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.316, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01470919305605713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..66d82f4f8c6ed6e1ad4ee344bc2211223666ffab --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541035}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..942149d91012440ad499fba0ceb8bea24b1044ac --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.341, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402709}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a094790b101c29e6fa3dc5340909de8ead944767 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811482}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6371db91392e61591fd41eb5d05905895586a46 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.339, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620345}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.338, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3ede6043ec227bf5007011e57decd144399fc071 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203934}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf404e14caab20166106ab67290bfee3c28ad00 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087973}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e60fd8bc3c00db6f474a379baa2a103c28fd0fba --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574897}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b80e0a700e20d3ec16c3ab3078948369da71ac49 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01480686473373886}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..56c88c1636041a66e149927138d4bcb5f8715993 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880219}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.309, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f3b235b2cbb641c81ea5d47fd43282ea925c02be --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203931}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..45127872070844dd6ddaa3e768c84f4134ec782a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3275, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251947}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013728421539454872}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eccbb24ac1e5df81b47e2ddb0a74c4b7ed426b50 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821472}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3375, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe5fffbe528cfe60b31606a675e2142db2f08ceb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32166666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013490095282989521}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3225, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103244}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..128c93f7b789341eabf6004f8155f754205151bc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33166666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013596836729485163}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013490095282989521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3988fe6a408ae09752045929c703794cbc0e700 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32166666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013490095282989521}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f50345447b9cae762c1bd2ab412474dfb6654e7f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.315, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013415009084004864}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01362243481313678}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb169e96e13ba13acc8d5d472db4073a5d50923 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3375, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013655897185463664}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3308333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013588208070709002}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9277aeaebf4c5f6225284177a35ad3ab2c3a94fa --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821474}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01364760294240639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d8fd40254a009c0a567481917c23163658fd2ec2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013570806258433625}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12f5a8e8495f74856861a5d5138ba2491a444d18 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3425, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013704669762934723}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3416666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013696658778002515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55cd3537004c515d032197be10101d638dae3df7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821474}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3275, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013553211167251947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a71c7c7fbed96f4419b28a393986815385bf32e5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01345294899699629}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851553}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..75243944f7842cdbb5adb2c0f49d236837b26cc1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..059d6c39435950d454bc6b0331e3148e048747c9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..142e11b22b828326f3a03b48cde392057aaa2889 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31333333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013395739415639082}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.305, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013296358936471105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5965765515e69ee2a667410df3ae1afe1814def4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3433333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01371263383046586}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3dc41e3c4620806f15a21ace6be972f5ddd006de --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013499258621103245}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013596836729485164}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..87edd7eb7923d2839b993eccfed4acefd981ef0d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01337626879098211}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01359683672948516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..060cdf39c685883c875a53a1d6ee877c4ef46f22 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3075, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013326707242912041}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529017}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ea05da3c7e5c1bff94a8ee23a0b78fe2bc8bf72 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3375, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013655897185463653}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821472}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..814c38d0e448473ccb3fc46a18e759958cef7e97 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32083333333333336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013480882752851548}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3258333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013535422043417459}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1595de44cf1a41776663c103f4de39dd960e6340 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3275, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251956}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013579531277800922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0358f870f064e1f037b145374d31bbca59c41c3d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3408333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013688600793296936}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3408333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013688600793296934}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16ea99118554f60de9afd95d27eb803856657360 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3375, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013655897185463658}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01357080625843363}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a38b0766f0a6cd2e0c69f9172b52c14c489598ca --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3475, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013751753243291854}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013605417345710526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4492a86e4d511a045342b0aa44d0787e89e9f8d2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932889}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..61c9c8b49aa2d85d40886a589e117a426d0085f4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013471620929769145}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.30916666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013346684134591945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c0bbf308b3ccba15bbd49a428af12089ec9a069f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3525, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013797164918918366}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013570806258433623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..798a3dc09e4dc97d7c79ce2cee6ca4e60568417d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.31916666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01346230971200514}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821472}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b1827246f15730c2eb6fd4c22bed68a2ccb3c931 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013415009084004866}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..80a98049cc74eabffdc27327ec8f8b656b4de78d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301834}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301834}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e57fdf07ef22a59aecd92e3ae4c4a9c96aa3d6e8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948856}, 
{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..72da4b8679b145197bd2892c468c433d5920fb0a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012288926760890793}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012288926760890793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..63949b6c0f890332b8ad009786df1c198cdbcfb3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22866894197952217, "fixed_answer_choice_list": 
null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012272853582540806}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012272853582540806}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..616566ef76c281878d676ff8245de24945d9ad9b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01249146853239057}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01249146853239057}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7a4c9471910ae120894fca5fd65ed9e24b83afe9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004755}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012399451855004755}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d2de8e262d4c38e17602bdbacc0ed1079f9cf4a0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2738907849829352, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013032004972989505}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3097269624573379, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.01351205841523836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..35e1ffcbf4436062bdc47173f08fda56e195fd73 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012993807727545784}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29692832764505117, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013352025976725222}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..49b4ef2428d9fc2e5170a5111faaef691191ad0b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012915774781523226}, {"task_name": "arc_easy", 
"prompt_name": "i_am_hesitating", "acc_norm": 0.2977815699658703, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013363080107244489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9a1ce17768f27fabe41e9bd6f14131e486f6b358 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012780770562768422}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2909556313993174, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01327307786590757}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e2d078470e837d84544f1df82b45514dab70f0b7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 
0.26109215017064846, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01283552390947387}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29180887372013653, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013284525292403492}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b92984434bc86ffd489b1bd86282db5b6890fb83 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2645051194539249, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012889272949313368}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0efb7424c870c2c82c7c0d000438d075734a52 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004752}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012980954547659556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61cde9594a4478a60856e5f72c7d27c6dfb93add --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26109215017064846, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 
choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012835523909473857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f39f25a0d55cc1eeee9fded3741867af26443004 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2551194539249147, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0127390386952021}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012993807727545784}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9326c53f982bdf11ef0b1eaeffb3d7e2a41123f4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct 
among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012821930225112556}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2696245733788396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012968040686869154}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2fffa8bc3deda0b3e952fb64b1cf3f0bc0096ff6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313964}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012928933196496342}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_5.json 
b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b53283e591ec6d1f79604092ae73ac969387da01 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012780770562768405}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012902554762313962}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cd5a096e8a3ba0724b346e6308c0cfb16d933fc6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012288926760890793}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.012288926760890793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db11ccae556f4c056d42e5ac9bea98a1f62293f9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01247630412745395}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01247630412745395}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..af7a31f48143b8ab9a1c2ffa7db3df576c2d116e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012383873560768666}, {"task_name": 
"arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012383873560768666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9acbb230aa3ba7ad1547ab04d386bbec24c397 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012383873560768668}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012383873560768668}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b6c5e583ba532fb9dee8e8684453f9a0574337 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948856}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..883c6434f287861117d9cd98ca7f28cf4eaf859b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012174896631202614}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012174896631202614}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0536765644031524fae5b72153a7422f4f1929f1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012980954547659554}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.318259385665529, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013611993916971451}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..822aa605660aed33a214d108ccdac2e711c00f17 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2619453924914676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012849054826858117}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2977815699658703, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01336308010724449}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9394a239fb82b609ebe77b35d55a4f1e20c74d03 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01279455375428867}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28498293515358364, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01319134817983879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..57cf5f53a36e81b992134e50e49a8d0db1b3e9cb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2738907849829352, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013032004972989503}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2832764505119454, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013167478735134576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d256bd0fa730e6073f11937f9bd597c25ab1573e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012821930225112547}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.27303754266211605, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013019332762635725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..37fbb1b3ca6bf453a978cd96b34bc372c9ef355f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313969}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.27303754266211605, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013019332762635718}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a213e1d18f10096cc848033087c65e74a3fee42a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.255050505050505, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008944265906130714}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.255050505050505, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008944265906130714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6e717768bbf754db3a53c89b0cab51a7807a8c17 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008844984581934907}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": 
"d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008844984581934907}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7f46c360a06b8f53290bfb0ac0820fa5407fe3c3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2558922558922559, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008953950243013993}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2558922558922559, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008953950243013993}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56112c527700067244209c02fc14914d8ada917e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for 
letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008865199020660961}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008865199020660961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..792023ff9047516f62796ae3e7e39bcc0b048446 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24242424242424243, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008793651516485087}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24242424242424243, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008793651516485087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b0045faf8fd96e553682fd6d9665a311b36cfc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_heres_a_problem_5.json @@ 
-0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008844984581934895}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008844984581934895}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7ea88ebd8f7cd5d669d9274f667607c762183443 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.35185185185185186, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009799078929868707}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.32407407407407407, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00960372885009539}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..03124e54e2c9fed83cc670be7df1b841575ca38e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3451178451178451, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009755139387152023}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3122895622895623, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009509325983631462}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd393fa662e9c5da25a66b37a0a2f2ce3dc81cb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3333333333333333, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009673016668133394}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30303030303030304, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009430140669278948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7340a6dba117d73d15f7dee8897e1ede94e82c65 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.335016835016835, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009685160765932356}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2984006734006734, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009388855914040433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..24a1a28dfdae1d82cc9237eae81a070e53934e83 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.32407407407407407, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": 
true, "comment": "", "acc_stderr": 0.009603728850095384}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3026094276094276, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009426434542371227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4af5919809df04838fc39f646d39a34b9005fb60 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3202861952861953, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00957415266873942}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29797979797979796, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00938504606669487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c16aa1c451cd66c0cd590ae34bade1f189b28994 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "multiple_choice", "acc": 0.2878787878787879, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009290733161670164}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27230639730639733, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009134218447652666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6050e35abe454ddb8a472e62774b76cf646d6363 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.32154882154882153, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009584091575640627}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3148148148148148, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009530150430975607}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa9098fdcf5738800c7ce012cd5c028ca75068a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3282828282828283, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00963574950926216}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3291245791245791, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009642048058060978}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d2a07f3c018301773624a2c0c8ceca11979cc305 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.34385521885521886, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009746660584852445}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 
0.3400673400673401, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009720765494805264}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7e651fde0a4bb0c7f490808943af420cef29b6e8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3367003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009697166595752467}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3425925925925926, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009738105469984201}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e4afbf737dc8c613a81d743e6d1d8f9021cdb8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.33080808080808083, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009654540125986126}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3371212121212121, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009700146509130078}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6e7b01892aa2efb456bddcbb9d507cdc9042f6bd --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2563131313131313, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008958775997918365}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2563131313131313, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008958775997918365}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a7c259c89a834934fbd80764f05013e00752a71 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00879883644422203}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00879883644422203}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5a2c2e26aab04fa0198da49ebb24c38cab37c44 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2563131313131313, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008958775997918354}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2563131313131313, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008958775997918354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..654ddc483b9ab65b8618296089199bf6f5fd6220 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24621212121212122, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008839902656771866}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24621212121212122, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008839902656771866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1792bc3a8f98310980b96428f78d982d19eec967 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.008850055161459234}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008850055161459234}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bdf3a850949961a4b6945ed8a146d47fef02599f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00891494899149571}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00891494899149571}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..73e590520630d96d27f8ab08379159ff6bd17150 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.35395622895622897, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00981237064417443}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3148148148148148, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009530150430975593}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3081ad0647f764f2319ac35c41701f8936f2b35 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3291245791245791, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009642048058060978}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30345117845117847, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009433837434252272}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f4d3c7d7c38e57c30991688785a8bc525595db8f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.32154882154882153, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009584091575640627}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30765993265993263, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009470292575831183}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..92ed9c4fff8bf87c249cd22a222b697bfa8ceea8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3287037037037037, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009638903167022171}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3005050505050505, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009407763090599316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7f324adfc8b2caeca493646d9dc049f404c8c30f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3265993265993266, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00962304703826765}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30008417508417506, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009404000558513339}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e6beb18b622e70f01440280f75d43bc1f02779b4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3164983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009543851857323891}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2962962962962963, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009369711585684292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a48b5797439df5ddd57ca0f3ed978316b0b11da0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.589, "fixed_answer_choice_list": 
["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008984425782182318}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6273333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00882919733890307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8be2657de0f0f26f52ffa931332e827861a8a6fa --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6156666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008882569490543049}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.63, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008816229842524025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51a4d419a065fb91e0f83c43e2dc9e69ebda9c41 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6273333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": 
"super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008829197338903068}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6273333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008829197338903068}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a9414edc01ab2caf4a9a6493ad423cd0f90ba3b7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6313333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008809638003862736}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6336666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008797928274394058}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..201739a35194555866c5a21fa35390de1cd8dd6c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6323333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", 
"dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008804646702971675}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.631, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008811292732995706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3fea3adf61dbbb1100e5363e8ff04439d6897831 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6276666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008827592133099664}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6323333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00880464670297168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3cc341c75744194f39b0526a98a759294627ea6d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6206666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": 
"boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008860362324722527}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.4083333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008975481073137033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2dc5f31411299aaef8bae3f2be502d89be9e74ce --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e0eba8dd38286ab4920c95320d71de51f1da7e8e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5963333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008959169522662576}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5913333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008976614094836194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..35d66467d94e9ca6b3569db7cbebbd056361f150 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.613, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008894007408882734}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.605, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008926639623340282}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b73f600b76e3f894ee6ab3de77536c2f7c69f5a9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6173333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008875277637761267}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.604, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008930542249025189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9103e17fc6e7199387b15f53fd145634a65ba037 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.62, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008863380835773165}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6096666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008907909838637955}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e497b3842eff4abcf26387be93b3b1abb405ca2e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6226666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00885120015653439}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.46266666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009104744524973354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed071daac8dd1e68e3dc82d100773554dded4aeb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5423333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009097447488896774}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5413333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009098980657278165}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..578ad956a06118f0904413ef5d773dea06bc5770 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5473333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00908922749948324}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.531, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009112665923139413}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..605a08b9f2a4a88bdb39387a9180366057b1d5e1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.546, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009091509877386517}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.523, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0091205662238016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..79fc77d5394c1c20a5b45de9630ab15b7aa25c80 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5476666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009088646624339614}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5156666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009125748094153249}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b2966f1eaadd41a2823c5251d4995547f6b9367c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5383333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009103358843448796}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5193333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009123403215694962}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e39dfd795956d0f6e846d7df429241684ad8c2a2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.49766666666666665, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009130131705156546}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.38133333333333336, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008869364649389163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f68149b2a5b1d97169591865689f0f1145b5b44b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5426666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009096928229880421}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5423333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009097447488896775}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1258d85c1e5c5f5bd84597f1db81e65000b0b03c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5913333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00897661409483619}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.588, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008987709736566396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4f2bb9f99aef3f535d5079cadccd85eeb50e65f5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6136666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008891174310695494}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6033333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008933122315228996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9fcc1aeb52535d5a291306dcb067d606234a5e6e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6156666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008882569490543054}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.603, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00893440584870012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e817c1826cb50d4d51b89ab074ac2ee5d6d04458 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6183333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008870849530787626}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6013333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008940758594209433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..11224e3e4ebb380e450ff061a8fbc38b9c9b8862 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.38966666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008905164372580985}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6243333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008843442555522142}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..283befe17d0a0f61eab3b42cf0c9d3e94c398978 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..752f15a944d3747e8234bbc67e94d9bd88c5dc6f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.595, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00896391565823638}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5943333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008966262991425923}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..15bb5ba2f95c1f3cb73f065a906192702a4c1d34 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6096666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008907909838637944}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6126666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008895417372116205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ceac9b58494610bc3bddefabdb4dcb032b0650c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6206666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008860362324722528}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258924}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3800a1560deff0d431f39909de08f0c3abd31f94 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.616, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008881119942353993}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.624, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008845002997512752}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c75dde812620dc4ba37f4f1f8401ca1ffa49e5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.20571590265987547, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e66f1a294ed61f1fdad4199d466601160d5de392 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0cac4653818d9a09a61a5e9fe9495d3cf5318bee --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.31122702434177846, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1b89d1f2c06eac7b8650127b969d383c0fb24d16 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.3113026819923371, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a28c026b1fb905eaffe08144e39c9de4dffd5086 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.33963161021984556, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..492babc6ae90f1f192fe05408e9dc0f75a8a7ea9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.31340255400405775, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d4529c45e2f52fd8faa6dbd943f4bbfebcf96892 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.19047619047619047, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0a5dac4cfbc990d6c1912fcaf3477d25afb8e3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6034b8c328801198b4b73f663a34574be04a108c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3081967213114754, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..27b80ea2d25a3b637a8dd755fd175a64b1e326ae --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.23488400048082703, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2122171dcec57100be4934f583706af7d7d4c48d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.250952380952381, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..74920a234c2b32500318dc2ea43f5395b7d95537 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.375, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.26353276353276356, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8c78576ffec554300b864f47b4b394b753b6da60 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2407177363699103, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..85cad7a2a2a217bf335f19678e5e1280608684cf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..42fa18d2f2a5e622ee7e18abbb33cb190f2731f3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.32100667693888035, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cf50245d18ea9ad3bc847c430657c2103f890d7a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3040078201368524, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a035aeff09811d0ea95de1983c27548aa2d300 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.31340255400405775, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7d0fb04020847648be8f51ed2825ff984b5c759e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359542}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2886268730041759, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c9eea928ff95acbb83bc1726af1777cc7c825141 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.30357142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06199938655510753}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.24545791620318877, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..752e96eb7373ccef97ba6fe0d19c285e3bd06d8e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.26425954997383566, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd207e82c0a3c14716a404eecb36af0bcf9bf980 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.32806324110671936, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..51f3922af6642ab333c077b91013451f025be609 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.26785714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05971290310957636}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.23582766439909297, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d76ff4e2adce4587bfc2d84b5b407ea84844d448 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.21428571428571427, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05532833351724884}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.1915830546265329, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2265b4675c966c0ede677993d5d626b90da614e0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.21428571428571427, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.055328333517248834}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.18070818070818073, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5b6a7b2b2d78ddb704f83c7dc1dcf0e779ff07ae --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2558647026732133, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e896b4f9c4b145b75182b3cf001bd9ad81b201d3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d041f09d1b3e16d90b6e8ececb0024ee2e7217b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.317639673571877, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d6d33687e6517721d368f722d44a26c6ab3f35b6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2716672677004448, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7cd746aad7585d3cf6efeebb36262c2584c57e42 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2880952380952381, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0f8bb8a2e0118a8d8b8e412b29dda75be8032e1f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.31761006289308175, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..19c69e542578568faab128706f3c36b13b75dcbb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04923659639173309}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c8683cd6a89863b7823c12d3d90a3b0b9c4a4b04 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", 
"dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..370504b1b36397df5f348f3deb833f4ab0ccb9ba --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c29736e55344a01e7dd4e0b4d534364af9ecb3ee --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% 
else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..61019172156d3a178aab0ce8c5ec1c8f36c6d3cd --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4c1e50154fedeb075391666ff37dcf9e9a270825 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.5, 
"fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..413155d4b2bab738542b727b3366e21b249d6d0e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_1.json new file mode 
100644 index 0000000000000000000000000000000000000000..d806d76058bfed335045648fc5ace1ca14e99484 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ce5e51b7e65b48c7f89bb27681a0d55e508adce --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04975698519562428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..153adbbfcaa1d7d1f59c749be75dfdce2e14e892 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3fa4ae8acfcdb4d99484ca153d7597b8030c13c7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999998}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2da833c4d53247f47963ec41552e395c4fdf0907 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf2564d442b96869f9a6d930c13ecc1448891294 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050161355804659205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..364e7a603859d699fe9aabda6c0ad6c1565e3bf6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e28ca7a010e1224c418fccc523a27cda57e8bf94 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4c37d032abf3e26ca3865365d398c0bc4669d079 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0498887651569859}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..82027ed82f33059aa34ae46fdfa7cb0310e7414a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..01cd1d27a6a67e556b5e81b276dbed6ec796c04b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..553131313e0d7230b71e543fc13ce75dcfd8d8a5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6025db665501b2090e235125b2d455686be0119 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d9edeea53b6194907e91a2323d3c02f01640f25 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a8259067a6d4e21e9ebc595f4f8ef5da056fd6da --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05021167315686779}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f91b1f2f66a0eac4b88fa007e3142455d74d7c3c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82441d5a591bc935f98e999025393226dbb0a07b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d69a2838ca63c70978a95080b7d7bef2786e89c2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..830958f0abbb058eef171263180b2365ad96b5c1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b64bb2d1f42860e4dfe853b470500edab5ca3e98 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d930da7595dc65a150950ff16c4a80648cb24ea9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ae9040c8e1f409f406698bc931ae868d85f2f33 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..10ead51080d58f09c06dcf258d44af9ab4c2b9a7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f563b08bca22ad5687ec361f8666992f45fad135 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 3.3348308138228138, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0542442268004444}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.15695463233811197, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013172199856712327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.31753062729975684, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021570780173147865}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.2057045204344029, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015278039224567346}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.054730014529234865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007967615111152051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.11321119898453884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016182734674237542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.07227415925772734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010191143299311432}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.1355356394919694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001147779810670209}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.2762000750744859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020067480630458333}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.17810439394384858, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013638022161662619}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.13583123172424444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012122638205754406}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.27559480017616955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002087623902803619}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.17817139272595273, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014357182624015643}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..848b5fe2a6306ef639d22e0e1332100e0164d727 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.124007942744359, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07045741934241276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3538932352903374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022379532621590835}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5881592755809824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002734526700250523}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4315579334995969, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002196108406977491}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.15240494036724936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013595392500719097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2605145852434897, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00217480883356232}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1874037992560188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015276804768402203}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2466966884536201, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014576935302306722}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.41914793158989233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023677125845129993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.30315812846780715, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015402915151127962}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2906120521359478, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001962484953478978}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.484233118629712, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002583500073418052}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.354643328344842, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019921434871566575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..184a0eebf84bbd312355a84474ea99df8fb07b56 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.873416618205943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06451768534270613}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.34827807118661636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021235627642469048}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5915788110263536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026271815009548663}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4285046970746608, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020742383144155244}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.15293244786496132, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013754728634272706}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.26790058462792615, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002298680167945361}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18976901148520062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001566892088457795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.24630747239149695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001440271745251014}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.4268276282189634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00236641291461196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.30509672166724194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001517855741385593}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.287048775958497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019274716695619075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.48857994815747946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002609384753544022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3533423486463186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001973114674110923}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0b91890b1651912f655ee7e3ed2b9eae3a6b28a4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.24802349891997, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06635722949244867}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.34180817756753284, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information 
given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020285133445260843}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5920459286923339, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026236762956107166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4237555248539553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001992167963719159}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.15289604438013946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001375777527347721}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2737065876148546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023716671424749572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.19128265567370034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001581453769153973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.24394200099420235, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014272228885483314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.43011826387213586, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002376768370329661}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.30417906792765287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014988806302313118}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.28376532763365797, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018837986492293225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.49209992765341304, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026597190806332866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3518251181572721, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019444219873398726}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63f4f7de824eecdff05a297054d88ee9c51044c1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.259603611889512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09137927162900815}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.33191481035446385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019317351081010318}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5860564434153247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002586714587351925}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.41457403908903073, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019274435714256614}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1491204275015992, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013513647776642052}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.27220476893326745, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023984250517094926}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18797801160921448, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015801875991192818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.23674148673246934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013752390609366295}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.42520126985132733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002377551920288881}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.29732435133761553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001482323108912372}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2783455423171209, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017880786684828078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4922469101029398, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002602328441781233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3477933389836102, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018713097091873625}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c0edf204d1f2ec126eac6323d0052ac3b6f105fc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 7.083478384911307, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10406723270307067}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3214279899425999, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017937724757931723}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5794690131338354, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00257830350864617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.40449455948498675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018006323163830978}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14499572043902326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012779901757949512}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.27026431913729904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023571465046724562}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18412349501175196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001510781529338931}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.2271462950868112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012925654415635542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.41645498805365133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002379296311255394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28734129485544363, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001403833180540554}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2746492057015669, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016737991466862709}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.49571786150582986, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025524072970345467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.34568503871928113, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017485361441317436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..5c173c3d3598482e12679d7cf2463071c4d5677f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 0.5007977834063807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03795381558072733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.10661165424565706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009177614571421352}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.21742813713741768, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015639823409019026}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.14034402421598652, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010836222389849536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.013032347739898609, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00035990837418407645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.028194149116957987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008205075711069978}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.01749967959480122, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00048519612999649395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.09463196140112043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008085663634196383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.19429758597333047, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001444769495062812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.12481249710425146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000964737385169164}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.09025046396276212, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00076036607720926}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.18536439953811315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013433343943009591}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.11908206350989242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009080994588157214}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2237e5fbf6bbecfc628e0a1a5c85e20c2c1eac2f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 5.912367109877884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.061836973143818245}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3128102978451566, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001691715951507062}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5681459084996038, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024899844904499157}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.394742251409417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017032945676370627}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13058114965570497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011618657961389826}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.24495367272360657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021854464652663118}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.16624694535550544, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013782062542515164}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2101887975592876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012001358599358798}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.38833816310299923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002272167927909678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2666452952973803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013235129810010833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2597848984036747, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001569416089582477}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4719852089818159, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002427421024485974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3277807853351648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016501024982922983}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d1f6261c293a8541d5f985d50db9d9b39ea7725a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.6948355547841905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0511979514463551}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3163535969719062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016603427152068276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5834971336648034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002519250385447222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4017351734479823, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016902369451423883}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1381896709382772, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011500719976283826}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.26453724451723776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022668939643765884}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1773205809463223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013921382173492855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2163360334247349, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011827261695832954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.40582280287415906, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023312901734070086}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2761947809578966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013335952149149215}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2646418877094466, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015454852649096642}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.48824736439219957, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002461897429922055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.33603536027203956, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001646750924419848}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ffdfb2682f9180d27566d7b0fd9fe8b1876fe8d3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 7.079344515403131, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06095387919902322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.31434504165137933, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016756850149852798}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5838649601706869, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025270103149731774}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4003285706279974, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017150724139835165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1411501512049683, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012071435589295404}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2710239140164862, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023428408193831283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1814061261607152, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001458658344245726}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2167987032269629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012303868538522155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4087432941457496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023596629155231627}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2773948472348103, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013888021320340568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26527285008292856, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00157683567546418}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.49306302016696496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025313515796034575}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3378608633527859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001695654483915732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7f5ecc8e6ec2602c699dbbc65bd6f3d8656e1363 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 7.3036749892250326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07515670477596327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.31515384991950207, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016788769894757376}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5890522804534821, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025220913561448603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4023616713620362, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017219623211045963}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.14276401815964648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012323791161579336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2759462068171959, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002391724415993567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1839557774098788, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014913102819955194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21671602881600943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012338874458127408}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.41184759065700366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002409432664953705}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.278135454552631, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014055461648461573}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26652069267704986, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001586801161274008}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.49825625350122094, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00253173094162978}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3402770443245213, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017117460741257117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..839b4332d7ba52c401b13728e855779e45beab98 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 7.252202833742519, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09258081742959207}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.31332420313697207, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001643155770755078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5876744067602289, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002515657293017262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.40056764017959046, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001685883519704545}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.14257489924729352, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012041170840367955}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2769111512983329, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023686465359635234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1840109955212137, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001458917708562315}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21603612545582793, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012167765328144225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4116394939841814, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002384213224350567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.27755157420468274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013836819536529941}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26549263778003923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": 
null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015751904507686763}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.49799879824695253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025632163763559136}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.33937537443965576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017048672562046127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0e0ed1140a3aaf7c0ea3ee71fe52d04ebd431d90 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.675468702405354e-98, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.5576140191327723e-91}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.06825, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004585546668182722}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.003749985635338865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00026048740343422104}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.0070392305995167406, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004846274711079081}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.00046666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00034636309927530367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 3.063572910771552e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 1.7771220236267286e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 5.466015466015465e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 3.1579886935067115e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.06770833333333333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004559937242192579}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.003701609566962797, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0002566134309674225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.006953552388838532, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00047845164190825134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.06752777777777778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0045570416656537435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.0036533251906324437, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0002523074490538275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.006877766213052355, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004727965098252039}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a25d1aca48b47d4cb6b0170e57465fad60d5e7f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.09194884482583202, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018000796645724612}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.10846695430848712, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", 
"prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004476787852682004}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.07403964823256537, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003217977904697661}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.07620341314076311, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00314933874193351}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.04014965767231649, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001936657761303654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.03382193035815861, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016247551035281998}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.034204603336784774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015853396117442324}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.08344446002840113, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003725636670151101}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.05251848706263357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002326892413736413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.054364245776715184, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002273440248490721}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.09421039833488701, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} 
{% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004060934406491245}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.06112231322923796, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002680843364295191}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.06338498822890376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026438955448359253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae4df1e617414e6b24aa6f82debef1a3cebd841b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.59707844485191, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1425795749931428}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rouge1_precision": 0.19136701846801105, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005379543595527197}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.14581299068539502, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004242345755430707}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.1494166605053438, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004165824722062478}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.08162398770498766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026060437165329924}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.07003897501562158, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if 
value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022428114359523145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.07093954631020417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021919661337184986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.14327202198692984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0043408686018408405}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.10348587771415337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030734426965391064}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.10636913992897945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030096187848934145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.16186299066230891, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": 
"Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004753003923288998}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.11938311850161805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035231116397566264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.12280164375185657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003473100919506952}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..11f84c96afc7775ba54a60148b3534adc8853668 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 3.5532345468202933, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- 
endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1633344563680152}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.24121792609367423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005557178035436053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.1980561640610698, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004678758581243681}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.19980679600968698, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004522960796291835}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.10899469492781635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028561009527101845}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.09627564067307154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct 
text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002532227746801326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.09632029448061452, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024366646851858957}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.17666987032282006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00438764652241939}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.13985008744550848, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034011563714505364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.141384148982077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003282216691402862}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 
0.20064541312984716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004834286841759822}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.1615193895639558, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00390851563052825}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.16308829854180357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037724131070999297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..92e3f9757ec5ef3c1b1e389682bc44be8b64435b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 5.217629954237116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} 
{% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.23284930274397692}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.26662745576267594, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005575953278388338}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.22983869250345296, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004858036351505812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.22663036657248486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0046087693412430126}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.12167903294101802, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029190844237015766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.11277983027021372, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026762229693605183}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.11048253025142175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002540966652479063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.19277110900350442, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004362200883339936}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.1616071987301573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035572917110778075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.15912782439760098, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0033432276357100712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.22055060611521593, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004823378796586691}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.187891178897973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004094656427625295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.18489747608203236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003859601516136119}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bcf0f3b0c6c86e710ac447095b5d70c8b1662c6b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 6.376693171193141, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2748726388858482}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.26448067560699023, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0053005540267231106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.25142750063623637, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004979472392281411}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.2375382569438019, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0045304403027307755}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.12190247907289396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge2_precision_stderr": 0.0027751227368017595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.12160621956065581, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026839402002743567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.11427201259332177, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024437856983114473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.19188124181418145, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004147195755453536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.1779950318253713, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036462875523432373}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.1682205810105168, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set 
key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0033058315417317363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.22052854784819315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004603014131646932}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2079492427367012, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004237911852406221}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.19617441412979775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038379731820601636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67f42fc0597abe5d454ec60dd44a1ad5323e0e67 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", 
"prompt_name": "generate_text_restaurant", "bleu": 0.8175837539659996, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06903661804515053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.067410137720777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015267778764287776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.11905569111444682, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002272837057404917}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.07913875454413057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015242869094641899}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.00855571479752271, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000610070681102783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.01452701665726237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008619364610158222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.010022915068112901, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005999619167956302}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.06588530481318243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001456682483397516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.11730596366118605, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022143035280371703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.07774011765647057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014739968390031794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.053281568752607505, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013584259494467741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.0896663423829268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017758029746831937}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.06060524038626222, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012344605903209444}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..772d4a1cf3ee4b8bf7f7499c94bbd49d8756d5e8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.360023014480657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14523312599938573}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5368002507342822, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003197714150333534}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4204347436029918, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029932330301144583}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4457059970163705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023475038584115206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.25010730577128015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025416736419787912}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19320140085672308, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021059841419488645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.20489136085595536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019458972766888125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.390181985500488, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002842772643373759}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.30202346410754144, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002382966736707436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.321161092281206, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001995161634007105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4390382418075218, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003082038398447118}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.34172069570320107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026874775375555484}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3629740875330838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00225666081685001}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6d4dab283ef72050028deddc9006eabde10b49f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 13.284050272128148, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1770460818661209}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5714641277328295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032799480051059426}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4468451656698964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029010255062063995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.47571760194079554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002265550724177173}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.28407643182119585, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002743346128512875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21834852344926514, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021773726271045162}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2325284196471626, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020182406639818696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4201198181536731, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002974299270315645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.32583054256145155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002409815129643757}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3475057918065784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020325138450125527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4715445246123803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031935178217488633}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36737757013379746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002675859339354407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.391612619922272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002268675240750965}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dd70b73c09909fe7f646dc2d4bb1769d3314cc35 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.26351682307982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16518466746497154}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5781766953393467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003232493897043414}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.45846733751843627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002862839752183954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4869136869814224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002252377471622215}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2936437583754312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002766357156065721}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2296605325605603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002239048819483412}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24388713793667496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002081168246828599}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.42323041568862063, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00299893377825116}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3336509571396467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002463831973937884}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.35476450327695513, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002131136688683226}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.47879240286661645, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032211942725843233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3788603945042402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00272506620717064}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.40259898572566205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023428037926685654}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d009c926717f9afb383c2f41b865b9dea2845abf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.568456206535862, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.22714934644025087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5824109780846385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032128656177573896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46303711017335764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028550395782162385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.49248119559722, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002261462995557444}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2975497604665679, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002721358209608908}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2337393664335435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022535476965624083}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24852828649672148, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020894492308823178}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4258973373500384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029570811260566745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.33738654578439453, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002480938975354226}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.35908240651080353, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021535460141111546}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.48186572666282207, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031731113183941597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3829876385027774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027332507970680904}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4073362598730636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002350552518906019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a9c576adb9a58700726a4697dd203d9d363a9d5a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.41705650902542, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.25051935957566557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5768002584328835, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003210936196026848}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4663927464698937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002827189059265761}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4912358527300658, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002175083911938509}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2929623409468226, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026747147184981385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2340255710785021, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002212699322566578}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24634621400768708, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002025833005255322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.41839619423807317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028660522019280242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3383776885015685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024683861897523925}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.35597711356903294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002065825901549598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.47696170422006223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003098280726576402}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3864090320075692, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027224907209281047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4066143622258549, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002273545252564866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c83163fc581493d703f487462c3259f80d8a5c68 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 2.135318771277489, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05149736456182392}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.1300442556023875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012997860884800998}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.26964198994481503, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002413258698941157}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.17219385386945554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016039551103560214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.039727029516438334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007171629939039988}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.08739502207944705, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rouge2_recall_stderr": 0.0015891465126372196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.053555609070310047, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009542998217676876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.11658569686014478, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010550984665999475}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.24489523558752985, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002084234926217945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.15509057564768253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013250287969709912}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.1085790664749436, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011429046649663122}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.22674868093509162, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002209787876074805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.14407710887166456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014318610459664793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ba90fe621636e6939e670ea38a8b09de0d7f1e8c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.514185829033243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08644240018036378}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.37271218983917304, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021586668011346864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5972780988107822, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002538198650027097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.44848731359904886, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020321502995675426}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16529227064321578, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013977761177152697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27152870296975423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002179427881235387}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20027753375836946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rouge2_fmeasure_stderr": 0.0015441978236127502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2586035514720966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014969615123517218}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.422455743313377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023558444341864183}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3132180262669656, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015462995225589086}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3082635717959347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019666465629011544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.49456210205094986, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002502870066737165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.37097935121646175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019395659335613548}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b96a976398d93b55b840b4893574ccc3c69e7adf --- /dev/null +++ 
b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.3652204803444965, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.084081338904228}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.34970551268520844, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020964374351578907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5664575487593494, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002630711098096289}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.42275422267896534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002039352116930094}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15364365198250277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014033191424566054}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2562044764289271, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002268323282291491}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.1872354549740021, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015796108546654892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2474806927681275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014727021825347633}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4081431372540134, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023551278568623465}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.30097802629669945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015401844960097636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2908623165977362, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019170217364269356}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.47214399804412605, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00260021880713798}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3518120413336342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019523611899019283}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67cfea2dd2ad912ba1c0041a823440e46fda21d4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.4968698400953535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "bleu_stderr": 0.059669479772880306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.34117325753136746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020233228481327443}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5604137274546103, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026266097618435577}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4147491979933095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019885367884029}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.1512868929321076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014148299890455783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.25600210025217546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023581582821936968}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.18544033119790024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016196353447881983}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.24337336103065083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014497505785590113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.40654419517533735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023761049290174237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.29753225476479545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015402025665262522}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2845042264341882, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001885327286169968}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4679542291978592, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002638793268605149}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.34597699574768703, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019510235256301094}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..42b9bfdaab245c3b7d1309927da9fc9411a72597 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.6317178163787345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0876389482196388}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3400198376127372, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": 
"", "rouge1_precision_stderr": 0.001969604534522521}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5624978911685358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026055056008207375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.41464793929679983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001938651237761389}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.1514447932523371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014010545789579145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2581999705469884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023705600248679587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.18624803563290596, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016110635518348864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2418798336527905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014203318517984557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.40664147075304874, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023808852507882106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.2965625670926111, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015189535046102405}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2839226907614591, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018528730766754502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.47007701746039365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002613092569755996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.34627588323332553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019163771810995604}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..996eb44274e6ef54c66c8235c21f0be66cb8ef08 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.491534980084141, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10379177513611697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.33728382733850976, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019389929582716602}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5613995582020262, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rouge1_recall_stderr": 0.002595310912421587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4122007802452186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019059990682215938}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.14882470581081966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013513137828580892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2562146590122059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002346866630933058}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.18362949663211528, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015655061377427496}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.23902186517429475, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013782846724426727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.40466863634778255, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00236488455920865}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.29378677375300194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001476945763349084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2822952417456622, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018260149187799492}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.470397132239544, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026203121351988648}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.34505218091652107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018918651643402265}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..57bd1063cefdee6dfd186120d10008f48fb32b40 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10101705898942855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016024170779631586}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.25441720721083605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037811710248573083}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.14273720741484003, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00215614367639498}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.014796714574159396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": 
"{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006925399675854941}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03874020571430558, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00185089897813793}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.021087066079578522, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009789482177719121}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08142707182033677, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011693922318445696}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.20644516180144928, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028652502483865186}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11527495670864521, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001581234925824072}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08118614796808597, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001299485058038869}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.2067297549645945, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032511919359005962}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11505369846922088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017783784436516575}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.764704482900491, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07729220338781524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f44c1d35e302792db33788d88bacfb8b5eb86a5e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10639113371900555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015564566569882924}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2612998397001183, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035416281966467018}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1494286022448456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020905023117744753}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.011201230051773512, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006022749049849383}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0281936592385339, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001591047010891972}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.01583906203117334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008546628932044431}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07481912872892685, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010039125628398355}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.18571610402684233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002406722992733317}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.10535331505143768, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013577546916502855}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08543958378845312, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012424221154866547}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.21159077593083697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002933760670380391}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12024087462673269, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001679730935315816}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 
0.5933480280129028, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05429786159159416}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..39feb45ab917d317df80f6727449c20e5e87324e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11490099337353599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017855066978262773}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.28416098353802643, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00403393246491068}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16179129667309153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002404708780391458}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01814304612682325, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008381266313809256}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.045832024333924644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020855696544412475}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.025696677971347164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011719065575280197}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08448464202437457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012405791092172244}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.21062724107258454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002888253274044044}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11916137004159028, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016721548286313965}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09202870499262696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014247653846688566}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22909088132492106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003296324605941834}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12976468768250315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019232886149564547}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.956802800637642, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.053844707729285034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e649b9a51a5d330d3bccd0c1778cdaba016ecce5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11944990921004701, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002075349245199532}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.28605812303299377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004658807697952179}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16516309621402628, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026871657655052933}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.02287253200603821, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009561691761087596}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.057624882236011116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024533377849923813}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.03211728661220163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013308066074301423}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.09017935363635442, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001527608100576742}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.2174725953720994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00355507141242734}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12491419978896767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001992774375798432}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09579245704657499, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016893680911935236}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.23081299236685363, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003847882937168944}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1325926800413563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002179322108142391}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.2524115713328146, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07566638507964296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8972e30acfb2d6af69e491f0ca2f1a808874588a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.036672752688489164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002280550360708834}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.07104357839010789, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004348885899507683}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.044505675987276345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002633675660123755}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.00723200667268081, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006697782682481475}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.01625946634947881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014906194095147963}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.009527033234766386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008493752428441728}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.028537184733923405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018341458712957833}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.05427164298658868, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032950923739180006}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03404085416130508, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002000857011564909}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0300121458572667, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019272475921858912}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.057470387886084315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035480273356727466}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.03592767335941958, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002132797533713022}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.6706678456407895, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14282899844134608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json 
b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f6acf3d5b17d76a2c567cccdf5bae29482c71826 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017130559457731738}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.00016538869567667977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 8.34686515261994e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.00031526656966758077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00015897565882719525}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017130559457731738}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.00016538869567667977, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 8.34686515261994e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.00031526656966758077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00015897565882719525}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017130559457731738}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.00016538869567667977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 8.34686515261994e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.00031526656966758077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00015897565882719525}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c6975385ca9aa5ae933959b14512510593c81567 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.18114983702463375, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032933592986375145}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3436733855851609, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004317974276915552}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.22203482324927531, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002846869346449123}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.047446576026624596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020034775467036724}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08991408171222039, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028845441646191392}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05712816335799515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018963073568218371}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.14097151038726158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027300858693928703}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2690913907957878, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035417684042767152}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.17285149090538388, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002306787503569375}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.14261278916768066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027978086937486806}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2733257622994336, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003865650205750047}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.17513809375886846, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024578450883517545}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.3356469238799042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08798372854463395}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6de150b00447af21d517f1740ca91912b936438 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.15000019598004125, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019021792570688687}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.36983583808020726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004482161415274735}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.21086782423473222, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025534571718774162}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0368473261642897, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001172295467736883}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09474599631418981, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030950408198772202}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.052391077282527426, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016560324357599706}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11414094299528126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014353953204332421}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2837425807321581, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035949604065181845}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1607709135525665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019498240736311654}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11932299339443851, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015966278483692021}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2967588714120258, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00398015313663571}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16811226782770253, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002178990811580748}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.1464792246976607, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07488613082754325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c1e2d791400718842aaffa67265e93b03a2a92af --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.15639600620674704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018335197504392494}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.384927926107643, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004418750266865289}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.21985072281272192, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024623644197026796}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.040354659501515545, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011772215563549183}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.10426062090763166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031736366813986676}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05747020905233346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016740650674689041}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11784071641372797, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014063568059494577}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.29208769322286265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036018446554112815}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16596009416926394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001923269555875606}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.12311951167098538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015736137729286763}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.3054920255583516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004042312320168184}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1734564395771864, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021637914826953624}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.3300715197661055, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.076927398815874}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5aa15257de0b6e0006eb939f2507d240025a7855 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", 
"rouge1_precision": 0.1560692539475909, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021493143139258548}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.36907923753393884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004843054637600467}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.21450007180041575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027403785317734034}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03862402056888545, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012274248154787231}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09630094014738012, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00313494655980441}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05413486443607653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017007510337812338}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11715734774896432, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016770553946149032}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2788115939785124, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003945546036537552}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16116371553163497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021510860622327255}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", 
"rougeLsum_precision": 0.12407625422586686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018426611135377202}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2950692838761085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004308493244658286}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.17063343819296709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002369199596245067}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.219342171346122, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10669483486054532}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab51458713a8c8559e41e6c19c04e831c550dc08 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0465851524353561, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027356501105498567}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.08936680319887794, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005013916901074763}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.05633682692560753, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030812766489717136}, {"task_name": "gem_xsum", 
"prompt_name": "DOC_tldr", "rouge2_precision": 0.010250954815284269, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008867140781829498}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.02226634854406678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001849713945167963}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.013394347210809258, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011084639218436085}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.03558590757936969, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022148829116658956}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0672125091911251, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038250623727023383}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04233274207639188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002338602629818879}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.03778568077281841, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023309139307629827}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.07189209907306558, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041053780258082095}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.04511518475148196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002494953102030446}, 
{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.0148782549474955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.145410394761497}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73fa64ef47ac6a7e5da8dfe9ef5ce960b6a733c9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0031814398143476946, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008461438789843281}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0025059491300529427, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006912202190712276}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0027092066961271533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007219195635749328}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0003440741648288817, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00014037543336007386}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0002614759454382096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00010902516863349196}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0002940707111925786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.00012089379086475229}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0022868109507954127, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006237767036215238}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0017473662230373889, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004792256951751278}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0019169580279472128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005161311601160227}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.002390441596878889, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006439443967438869}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0019002132238758686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005456581356378058}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.002035329984614452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005499679200350618}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.122977737748436e-42, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.17065601470076e-37}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..123c242df287574e79c9d0578ab8aec86a8acbaa --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.17908873736938333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025998582918028913}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3401581574130041, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00466124187806079}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22142206258867178, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002657386115409225}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04001978076164408, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00143853172997629}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08351989855551167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002908378017045409}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0511787638415587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017012685833384458}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.13280826032727655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019720134198053826}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2547013277658435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036980006484927316}, {"task_name": "gem_xsum", 
"prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16457376114227693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002030624196291875}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.13820058290248866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020236107693674434}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.267820674383363, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00406330233863675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1721074292499855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002183282479578269}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.242352554457821, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06303290180001715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c52975eb27a619c09420c95668ab22812fe551e1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14242240386934715, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018474131654211898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.34972417416826374, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00425304183688617}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19996151191164502, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024642136191649943}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03180784324852304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010605401760687887}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08122973807701163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027530425274732025}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04515071736102295, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014958437813087835}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10785508103994479, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001342675196432051}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26778299938242467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033228224655886393}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15185456372717573, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018177177985044526}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11366701902108305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014987342312601044}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.28182954712893066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036825745017303583}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16002746501050483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002040753526176795}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.8494036251126653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06618274112679547}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1222d2d8b09e6b87186308ffeca71d79e8eba127 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14299555153351842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019414837646538163}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.35054680023331225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004405959066068025}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.20066139374359412, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002583892537384957}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03372540037348309, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011433968826317544}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08536452458478411, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002896468085770253}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.047730927310845786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016012182287873963}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1096711314096099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014240979490601532}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2713048964277041, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003435168695225616}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15423466735433622, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019141237080501433}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11432354351222646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016135722781519138}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.282399261929163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038458320187164767}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16073844532804232, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 
0.0021770573542344426}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.9553979785993556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08114119801334493}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd44033a7073d22c892a228e221e27a90cb28a8a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14095312565957024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021252189798371952}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3355112558235932, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00482038827448099}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19424816162568462, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002758326610864678}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03309503575773881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011583105646001772}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08361606756802129, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030274265177690917}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04656621187743751, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001620760377451161}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10750740692234483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001617470504570499}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.25822344513710505, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038342113677037708}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14844786782549954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002106590246901777}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11272654966890937, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017739663038892394}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2704314687293562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041899251242752955}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15564298400864224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002325750760252049}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.9899134724216982, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08189694158167818}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0c4e7d8ca322abccd50731786a95c80545cfe904 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04182867272383347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002516912133041032}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08068317129523372, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004630110172525636}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05061277872693584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028337392524855837}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00821242826897234, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000780085704496626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.018229654639556718, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016649615737807282}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.010817994039374855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009898135477985153}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03237406484505732, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020316037823294927}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06169464366711305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035477374875163793}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.038452182498697945, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002124389719631779}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03434532775935572, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021605467450954603}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.06559190633881043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038270052788935026}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.040956310132902136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023102690323821627}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.851414956048288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10142038717157466}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d03f0e7c3ec1ac25d52fd7d76c6e627c17e601 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0021529907822185413, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_precision_stderr": 0.0006324831746531657}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.001764574432238271, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005172389485732619}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0019051009413407058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005593343846105789}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0001457415441877151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 8.410282821934284e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00011802662746058974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 6.837858900511585e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0001299594149643802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 7.503370260000825e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0017522974182971845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000503946684193751}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0014399120468157657, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004056116387189239}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0015504765957310557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004402385576724586}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0018058994766162238, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005178578773672206}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.001474217364139951, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004127036649374616}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0015923123485654278, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00044995087651658854}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.3013943780107486e-40, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.513754776072693e-35}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3414508403d3048758df33dd267ef791443f1223 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14742845406065047, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022525058672854016}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3423037012370881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004534073547427449}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20038284398674655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_fmeasure_stderr": 0.0026502877333823838}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.035566259003931525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012143665358549194}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0856761804632457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028762266574800066}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04876836268401361, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015994140460390957}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1119655682266218, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016851034973666573}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26208954500477866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035709505116761774}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15236796425987564, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019755777625985065}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11645877000552814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001831460190174745}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2732569800295104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003986118877385603}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15870323247009147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022161037551435207}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9361958605276772, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1105276321242612}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a39d180a200129491396ae826c84119df352d9bb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14311406793133874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018301237072922785}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3539643364314241, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0041947517615325515}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20137857386382021, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024449071554284354}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03128359088712783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010905195100737691}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08031060486303311, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002840064526422232}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", 
"rouge2_fmeasure": 0.044490758416455556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015403655337067337}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10608525021287649, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013726454652157288}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26468135425429246, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033595462893916675}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1495663368544618, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018563556646033327}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11421808592242337, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001519079728610017}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.28497861112523026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037025598565927275}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16108455797657517, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020640892705846995}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.8413841516657772, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10867724653105522}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ce30ac8c265bb3b3df2f62ca7bcc2cfd5ca7d76b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14180365518935828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019007951978877876}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3486026747587321, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004261696866882266}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19920399454150123, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025260772599993864}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.031146297678497057, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010625332283172775}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07920383582526831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00274094299078919}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.044164071082009024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014977618687604835}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10604824573625245, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001386014229937149}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26286006784713417, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033079692185273243}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14926255759990167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001862261876173521}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11354760855679208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015422259389720333}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2814350135656724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003681070167391697}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15986703304551134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020835266037317474}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.7406142615504723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07696728998860818}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6e901eadd80b57fff3a88fc451ded06d2403f38d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14423414216147484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021959372951004916}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.34261604167692117, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004689675276520268}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.1984246140855233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027300551089189254}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03256635443812794, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011365296591071454}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0804693811178453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027940237622737594}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.045340440062370646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001542686638928427}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10661260527903711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016541928452697811}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.25522999799700063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00362478715746941}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14687329421715054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020304285556757055}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.1152311222340281, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: 
{{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018256115213476074}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.27538991990706024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004032468272508086}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15866027545322184, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002268964508976531}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9355476748458424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08372337758759396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1f8a910de2ac620e94ef8fb72c0f020ae50fb987 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.044405906375544715, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025610959506775525}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.08540051490880955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004853700270964368}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.053882370203528755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029733530298338136}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.009550693224487655, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008726966214714558}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.02007700952508465, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017411005950916892}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.012207544410113281, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00104718267565319}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.03383880268801385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020073366629025043}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.06436025797802344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036713369177924643}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.040443470438169406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022120809378912516}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.036232835041777094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002133037107479806}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.06982777383177885, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004049740359330814}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.04367643240984686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_fmeasure_stderr": 0.0024209022639727494}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.9480714926440517, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11225116755335138}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a4d0acdbd7f7a76f18dfb3f83ab27fc6e518a1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0029613486934300065, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008017954910988925}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.002604820968979451, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007205582057724681}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.002683283542434088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007198699976945498}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0006160191869252509, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00030225046555707244}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0004629935054463356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002245599913382063}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0005146148795680421, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002458736058430443}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.002211450919027296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006050217509925478}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.001984170387640971, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005699295324941954}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.002013235956184291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000545919825941679}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.002437420380568345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006809965618635652}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.002143562481916302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006129726717806505}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.002196929788975592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006019250148189245}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 6.583932428818449e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.862037022971363e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f1ad5bd1a2fbe3b458dc595fcf88ac6e92ab53bc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.16879708539303456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002317898014536499}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3723291290939063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004545165883948288}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.22431631903155347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026093322298271567}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.042926431575461076, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013931045314890859}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.1010845101974381, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031251208246711737}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.058199865858872574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017490056690093988}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.12782464697639043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001786854774861224}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_recall": 0.28577217256570797, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037793340502424826}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.17056577152255134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002052264214122958}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.13266264249564455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018939748248399677}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2973642224201739, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041420819681628265}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.17737257514409943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022595548538614863}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.4757752294894693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10759359573160089}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c069fc4dacb1c71a28494aace32587470e64271 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1361829095273917, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018976552774942676}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.33393036619688554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004242354030815657}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.19107539113194497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002518376366873991}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.028698540492772314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001082902961563565}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07241022779028632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00276886558899143}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04062921012242493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015228641462241134}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10299681457974053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001372759455933598}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.25531583728222634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032867969214619296}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14491822198289625, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018471762207363737}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10874915835982801, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015461930795802182}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2691233096679109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036642185726151262}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1529660415125503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002085977228826524}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.7005679889378829, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09964986448064858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d66d24b1cfbe63423d71683d47dd37dc8b6b41da --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.13249564318284043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018709338649646882}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3246622264280915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004249757242473774}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18596744227922282, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002501821848962091}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.028665504454117075, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010557844669082574}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07274913333784847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027319168569439082}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04062641993018198, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001489738006922335}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10277571882616827, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014001676277520247}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.25399401920963616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033229190889309318}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14454037675210504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018835020887859987}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10648169866167753, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015420217510554847}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.26335591514014295, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036975443420346143}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1498182518022352, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002092342016369401}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.6591704296842291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.058952940117381177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..98c8f587f8b41b16d2011144b8a3249a5a1ff54d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1321970313885188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021585317256625847}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.31326781741517834, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004809417225791706}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18225506964350696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": 
"Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028014365519188978}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.028598963986615815, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010992959325129637}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07104911617756425, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002803269832027392}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.040076565671093634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015348459428170054}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10064574048197487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016306171384224302}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.23989193498285913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037628092205002047}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.13893464976445405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002136322118023521}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.1044393755169441, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017305664557089341}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.24983012145534195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: 
{{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004050169780460673}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14439166349049962, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002291465287441587}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.7550272676426535, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10693852053384469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..815a6116f1787ddc5ca4be5dc8e74a38f0f741c8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.036052583256726584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00222465814162892}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.06806165902172936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004214887088569051}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.043595060973618746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026168296236483567}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.006971532253971467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007060259100817015}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0151236735555749, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015108099841256314}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.00916357714654539, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009047152239114784}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.02826611508296682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017443681779659868}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.05303756340984109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032948227775611407}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.03393069801696403, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002029922326331422}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.02934032328626938, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018132080168363025}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.05552522032023831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034822420283327185}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.03536876258776992, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_fmeasure_stderr": 0.002130719555450794}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.761689921419185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1328182623834428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cb68c1db0a0df81a1a78b6a2e82b59de3ccd9de2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013223146686700555}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0003997094985643063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00015232935691739325}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0007072965816124758, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0002672293328922938}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.003144654088050314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012266788586519496}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.00034926050249932806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00012487917660613927}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0006215332883020128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00022231592837332874}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.003144654088050314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012266788586519496}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.00034926050249932806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00012487917660613927}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0006215332883020128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00022231592837332874}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd456c72357a39bfc4b08b3976f83de88f408431 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 9.584748891646283, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.4634311680433123}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.250581062033785, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0058239075018178725}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7062158738344604, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006429129435489434}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.31552138662440865, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006271351560477954}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.19130230064405784, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.005388995993524528}, {"task_name": "piqa", "prompt_name": "Correct 
the solution", "rouge2_recall": 0.5459773371219434, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0080205859504265}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.24351260224849494, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0060038232867892245}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.24167318855021985, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005722704187225168}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6860409065042077, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006645065801360722}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.30520915879086846, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006214719219843871}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.24469422917506942, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0057794266304149345}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6893060160244424, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to 
give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00664328067734422}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.3082991401991823, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006258332068067462}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..557973e6caf02c13ee7e802911632c8c5ecee0d8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 9.533178886935461, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.280143031253315}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.36248265473655833, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.00786742415876683}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6236177733252095, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007294999942203647}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.36387361635607934, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it 
to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0072313190667876864}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.262274764045787, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007192741692019697}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.47184937877299354, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008337489044044633}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.2728457121204912, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00696791537970479}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.3439193525878385, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007678026405379061}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6036216040299907, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007526769831516323}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.34999255403663865, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007254981296861731}, {"task_name": "piqa", "prompt_name": "Correct 
the solution", "rougeLsum_precision": 0.3478926854036015, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.007735989195619495}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6064434019351521, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007514234152920643}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.3523209043106269, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007250830483715573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4752456a64fe42f21adcfdca2e37bfd42deed41f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 47.02235168348344, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.7717853003889126}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6572202360495694, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.00684754645297467}, {"task_name": "piqa", 
"prompt_name": "Correct the solution", "rouge1_recall": 0.6314346295705424, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007231765242823908}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6186288635304461, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007000082166722139}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5015222551527236, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008055814149315598}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.4939358543250565, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008237670269354857}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.4813163134151628, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.008040016805301602}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6352718835824829, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007015561715569228}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6161858038815646, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, 
rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007440102458339062}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6025255165585022, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0072208910046901355}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.640484622899746, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0069959465065261486}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6188983018867796, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0073956327995358855}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6055217771045465, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007171276947403789}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..20469e294f5cd8ee1c67b79ad1741aebc0be868f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 52.9579720847619, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a 
goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.3100172607657463}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6766298604013554, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0067056488771926465}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6598734365934378, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007039834715151128}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6471778364388423, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006873790000568802}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5291084288377738, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008003327478399361}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5238663129877955, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008115248816933896}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5130291514276495, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007995711764359356}, {"task_name": "piqa", 
"prompt_name": "Correct the solution", "rougeL_precision": 0.6572059275393904, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006902013951713947}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6451122696509598, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007258135995140546}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6318467907423595, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007102064438778346}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6610004390006043, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006869316294428751}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6472344224673601, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007217696942084704}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6342013853383822, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0070565440039982155}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f3350f65947ac015be06abf9ebfc6c902b84a7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 52.34178177222339, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.133611224110179}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6802288907430989, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006708946985682678}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6684627731217727, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006968332579326068}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6536521914658334, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006849076962719914}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5329584588559811, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008028711159823622}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rouge2_recall": 0.5312726398193803, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008141583217459525}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5191957202312659, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00803537497169239}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6616061710878812, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006895248485381961}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6544242595472203, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007187512365355003}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6391386680743565, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007074861598280453}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6659368424074638, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00686752783122648}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6566926468449152, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a 
correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00714104853293446}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6416329037978986, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007024709271452905}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..72882e8c061e72dd3ed694585461bd209d780930 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 54.45931011034328, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.3566590232163973}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6910659299797782, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006587178887176958}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6819893555935073, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006829836530109502}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6680605340297567, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to 
give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006698004882443747}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5479142345751371, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007979754345711795}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5461321733007606, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008056987802607279}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5349097989485111, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007953481702063668}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6748350327218241, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006785040185272315}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.668942921143986, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.00704009869784028}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6547986538369945, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006916766018240786}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rougeLsum_precision": 0.6778670501360622, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006756650353609527}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6709060733280635, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00700381652397326}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6568291500377871, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006878708628824482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5ca14923419f4939303d309681faa5906160c4f9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- 
{{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb4c66efd855820ad7753f20b7c1f22c68bc4f33 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..24cd74b57e72d608cb80c8796d1bdd302f912cbf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.4929270946681175, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664656918145945}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.4929270946681175, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664656918145945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24b40196470ece72bb49ba73bf170268376022e3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5065288356909684, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166482959521097}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5065288356909684, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166482959521097}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b32e29ecd3f86b62a3a3a78dae9219bff856f8c2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5059847660500544, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664988455853328}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5059847660500544, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664988455853328}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9b01c12042fe3f8ed61c7c82aee1d91a8fb21c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5032644178454843, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166557553076037}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5032644178454843, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166557553076037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c4a2c665456f9bb8d510762cc00a632dbc2c8461 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.18526507984327809, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.014352687013672758}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.0207974489505957, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005419055674694086}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.2318870955901451, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004096738969172235}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.036171445152019484, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008434569441985617}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0031982235045390026, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00016202149012648599}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.041020991401829215, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0021462417342201236}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005621669068984276, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002680503297844867}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018948626201834524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0004633920176847882}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.21626027275928003, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038232690449784353}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.0330631932561623, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000733927602740061}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016844577935812375, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00044192663287105345}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.19686515382910386, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00370184205433221}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.029360548512863772, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006872127389303035}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e1669ddadc1c3acdf408ca72399821a4ce87a89a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1702000893250766, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.022602277898476263}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019738604164709916, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005943670794712055}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.21655770189995802, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0040458352158600835}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.033921577431504014, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.000851851348681226}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003127993584163729, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00017993685583142814}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03773492345933376, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020242951172695168}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005440119305280117, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00028436946832994327}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018344912777774225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005160040189181007}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.20486712491657955, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003806037253425624}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03161334313286401, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007473887108116619}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016140312647768006, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004983217012086031}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1850395128939851, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036539476740178115}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.027751716986245508, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006946521749936248}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_2.json 
b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..03ab24aa311e2e3fb45295d2af9678a99eb7b003 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1503842124457421, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.013366858191649686}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01879881841993818, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006724234666717016}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.20311375385070707, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003999202804540118}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03135729952678734, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007915594925290304}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0028249484242329798, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00018807827347922434}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.0352797748367778, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019648415800267776}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004831334688117916, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00025723946213493105}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01766775823250998, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0006294794523191221}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19315703981184032, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037946160886462503}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029545104979527895, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007329506313779751}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015392857681802947, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0006117781547052657}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17259808203436278, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035891800128922888}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025479267218651926, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006476891534007975}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a68d1ffb9cd8981d86673a2ecd42904cf7ce5666 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.13272684019612407, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010461864794627634}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01850944530673799, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006860099150056603}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19748557905607972, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0040233681628607295}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03068502702468395, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008064587848542262}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0026003584508943025, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00017066530138983938}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03498197751607171, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020301674433790403}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004497352970943405, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00024384382231740025}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.017220549482260407, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.000613096462513708}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.1861971663320917, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037489583224773423}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.02864531316521238, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007274072742712978}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015199797557400578, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005931310311220563}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16911250346695345, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003583275698147175}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025128864550950536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006505763970428052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ae9c09db493dc3f1f88d49f0e2c1f2fa48e1d66 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1116223813797144, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.009741200478014737}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01686851468849681, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.000499194047802729}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.18775401200901778, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0038484179354708087}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.028877688849418524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.000724243387986705}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002223786720488836, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00012230483001260983}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.030293584456631554, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.001817796896011939}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.00396783046089762, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00021343834712482054}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.015625504769488287, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00044811071614759033}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.17632683978230812, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003581217055366121}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.026778467249332305, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000647417434268784}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.013796234932842528, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004109358448293982}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16103835770466796, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003451182928178216}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.023672570832917728, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005880376849600522}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3bdb20910fb62759ec849c5594ca4e545e703eda --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.12258212222097367, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010470680109155984}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018668170976776546, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.000825403324535186}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19105972479056632, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], 
sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0039771031374790326}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.030015026802667398, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008029789068806333}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0027317811603090264, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00028543390592784026}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.032512896092599414, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020795575853301057}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.0042412326403525056, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002561460325558005}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01696418124382772, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0006649299596873597}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.17869693365245193, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.00369968586124689}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.027601201325242745, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006943269476354326}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015314009901949352, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + 
sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.000662958634304753}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16508907780803328, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003604116467893131}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.024736080266790614, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006498006943115617}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dd40f71be62cda86ac02f478ebd7b29a939ac816 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c544e27816832dac9dd5cd464069b4efac6a00c7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.500544069640914, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899177}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.500544069640914, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..da51e01749b1b7b3f2fd09ba7fe06f773afb90d4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.48748639825897716, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662170084916892}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.48748639825897716, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662170084916892}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5647907290ed2554a74c420b43d8fea92292419f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4776931447225245, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011654208652596473}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4776931447225245, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011654208652596473}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..06008cc1f9a2da32f9583e201fd565f705e69de4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738868}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5021762785636561, 
"fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738868}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a87a2b477fc22277992b2621edb90e5377fba8b8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4967355821545158, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665575530760367}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4967355821545158, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665575530760367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46ec8e5bf9b708a29b36f572e77f1136f1552ebe --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5669205658324266, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560864423151377}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5745375408052231, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011535468840824526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..97eb705828d0869eea1a7c520e6967d5207ca726 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5495103373231773, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011608491028638191}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5549510337323177, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011595157509775765}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d71dc083477b7f39caed4b271e5efa04716e3678 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.529923830250272, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011644913435420153}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.528835690968444, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011646407809944715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..09aed1c9e8a6607d7795b932505ac72f2932f95d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.529379760609358, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011645667565050859}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5397170837867247, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011628961491718635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..31a68158f406216953279b4d9a7c6d6217ad38e8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5277475516866159, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011647846656062251}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5353645266594124, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011636607860111557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..edee490dbe726d58a0b1ca75cacc5721f941f315 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5348204570184983, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011637500993815848}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5359085963003264, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011635700809215629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..217e8261a1e5c4e11e6478927bfb3e72e439d682 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 
0.617, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01538010232565271}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.549, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015743152379585533}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..15d55eddacc18038f7dd4253523a13c6f678f12b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.675, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095527}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.65, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf73780e7b38d50b4746818e08f0fa2ab69a677 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.689, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": 
true, "comment": "", "acc_stderr": 0.014645596385722694}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.684, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4f222602d6219ed3d447618e3b5ed228c7f9d292 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.696, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014553205687950436}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.696, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014553205687950438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6d26aebc62f5306fdb05916036c32b383640c7c4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.709, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014370995982377942}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.7, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, 
"prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..897af2e0ce924bbd738f77c5109d861a25fada13 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.714, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014297146862517908}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.702, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014470846741134713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a1922cae46a1bfd2b861f75f4cde4bdb863bb781 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.866, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.010777762298369683}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.791, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", 
"acc_norm_stderr": 0.012864077288499346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1978a7748bbce8ab3e5cbad72fa611eeec3edc8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.9, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00949157995752504}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.87, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.010640169792499356}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..572001d440d3118d89cfae958aa6776245986ff0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.901, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00944924802766275}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.881, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.010244215145336666}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..28bb19a1218b460f7f7805b0ed30b8ea699a774e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.911, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009008893392651533}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.896, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009658016218524294}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57a8efef5526038088b24e2e6ab8ee33c412f692 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.904, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009320454434783226}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.896, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009658016218524294}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..964e249ae7369ef030e2480c7c565043933e3b23 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.906, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00923305200078773}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.894, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009739551265785127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8e7275fc851af9367b99a472d164d9652d168cf3 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015768596914394382}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 
0.424, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015635487471405186}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57627ea9dde88bc94625448a827bc4d8c769b13f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.457, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015760691590136378}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.455, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015755101498347093}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..75c202ad59ea2085ed23b3e2d9b1f30e4f232189 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.543, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015760691590136388}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.523, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0158025542467261}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..813642802ddeed142f504f29ea02f6f59931b9ff --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.json 
@@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015663503610155283}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.553, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015730176046009077}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..405d941c9ca30d5df0230a97b3403af36f001c26 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.565, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] 
}}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.0156850572527172}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.563, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015693223928730377}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5da6d54ce196a304b7d17ce4e57803089b743794 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.579, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01562059547530132}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.56, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] 
}}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01570498795436179}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..985f487d0effb7e81744eef2420e91d62642291a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.534, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015782683329937625}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.463, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015775927227262423}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ 
No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d715889afcd390e70cc8d393135fe3de5bda22b4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.387, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015410011955493933}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.375, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015316971293620996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..02c93a1037f6484b34d5a2dfde9c534184666899 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015615500115072957}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.401, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015506109745498329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6417fe7a52b7475f05dd3f0ea7c3643034728129 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ 
answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015615500115072957}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015615500115072957}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4c239f72ee7ae5506fe3d0c93cd1c0279d873f61 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.445, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015723301886760938}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.415, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 
0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01558903518560463}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e40a0803df993dec91846b1c5bfb4dc4f3ba6d50 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.462, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015773547629015113}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.442, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01571250721186421}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9a2a0c18a6da101fb46e9c92f029105422cc10e7 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.583, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015599819048769618}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.499, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015819268290576814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d147c923ab80de6ca54000795bd0a94401bae8b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1 
@@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.507, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015817749561843567}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.487, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015813952101896626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..827f4a46b32cff22d37e63ca3c41955ce7820fd8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.548, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- 
{{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015746235865880677}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.521, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015805341148131296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..517d13428d073e3c15c15e2d4cb64493a5a0037c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.575, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015640320317040105}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.564, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 
2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015689173023144064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dce90fb1364eac53b2c6c8e9f19e01bcc72b6d8a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.584, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015594460144140598}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.572, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015654426245029288}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..874c5cff9741c24e2949281a97a03e2b9e4dd31b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.581, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015610338967577799}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.583, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015599819048769618}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b79f01e1a43538424221b666d387b6558c84b32 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.49706039551042225, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, 
"prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562232421541944}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5221806520577231, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551049647290309}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9068631c6e50edb9ae68887d71ffa5f5de1022d9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4853019775521112, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557435464292914}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.48957776590058794, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559920087347783}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..266955e96ffea1708b66ba05e0428b66c46369ce --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.47888829502939606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011552120807053819}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4863709246392304, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558135970599896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c8ec4cc2acf3711d8408e6c53295477a7c40a677 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011546234813777406}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.484233030464992, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011556682042196382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..062710fc208dfd3d28db7f5b567a711c63828932 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011528611805439891}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551591851683333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..110c7327976874577e3810ce8a22530115f8c712 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011528611805439893}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47888829502939606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011552120807053819}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..932d59e38a1176997cc1483ffb314f8480185945 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.48957776590058794, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559920087347776}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5334045964724746, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011536599118298177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b2f6a2aeda16f6c27e49a88d7b775f3273f51e8 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4906467129877071, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story 
:\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560409019420362}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5125601282736505, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558783570737967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0593826dbbf767f396cecada7bbf15b214447c74 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47728487439871725, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011550494192008945}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.49438802779262425, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561703928784335}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e9d99ae04d5a20647c235aacd57b12da36b0a02f --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4820951362907536, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555016408505476}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4853019775521112, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01155743546429292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d4838c6c9e83f60609787ec76809d210d58e08 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697235}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4906467129877071, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": 
null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560409019420367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8600637566d89971ba7134e376a4e20c2527aa59 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4826296098343132, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555452669106632}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.484233030464992, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011556682042196382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3278c6c1e76280530f6fae92a1cbd3a48944859e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b06b634f4dcf7d4525926298faea8a6e7f5f520a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6599a1aa29e06d28a6bc25715ab47f83a36dd380 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e4101a2f9ead8bc1f86aebcf0329462d24335f4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..53e9e0e19ffb2f8d0b8450a0335b323bb101947b --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f573d913c6146946e78479c7210bb1b797fe9fcd --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5363338c74465fb2de16a931fcc9518c10764a6 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4879743452699091, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559087533800687}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5125601282736505, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558783570737969}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0c4bed72f3a7ab16117a46e52cc4de3e40f74bd9 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.48102618920363444, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01155410417401969}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.49331908070550506, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561400034509398}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc9665aa2cf533611f799dc4706afd2756328b4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4751469802244789, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154813982307477}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4927846071619455, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156122826464673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..928c87d48a84f1b9eb5cecdc89fe8e4ecc3806cc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4740780331373597, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011546883081384901}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4879743452699091, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559087533800689}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2dce5c6d9b8247f98d6a38aa9be0b6437e83aed1 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011539022035111228}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4826296098343132, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011555452669106634}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca837e511494c85f4ce0579081c0bfa68ae5007 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4719401389631213, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544210396951669}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.47995724211651525, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011553138977961008}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..398339cb85bf8fa21c0dbd9c590bdc7c0c88d416 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.49438802779262425, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561703928784332}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5312667022982362, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539803085637724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57cd8b223013d875de4245ae4cc2e913e9ab5529 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4917156600748263, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560845076525718}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5056119722073757, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156170392878433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..af852bf937ddf267e3b7b6cd8617a4e1ba1c7f88 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47995724211651525, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011553138977961008}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.48743987172634956, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558783570737972}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..72934bd1a825cd22c0e7d106111585ff2eb3367d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4901122394441475, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560171163157397}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4949225013361839, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156183605423878}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..908ec3cbe879496b0eb1eaee92107333d08b78d2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4879743452699091, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559087533800687}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5018706574024586, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562351329083271}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0eb8e5c69101798edf569acceb322c120c06e492 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.49438802779262425, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156170392878433}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4938535542490647, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156155858904076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bc1697e1a6a8f2872e92ed22b727e374c51c9887 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4620938628158845, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0e84fa8fe0e9dc802f5a140feeb15072faf1b865 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e1136b8bc256c3b379ca3196d4274afc94e0b5c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.516245487364621, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0468d8f8ff26a96fbec393a561392e7feb714179 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5451263537906137, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029973636495415252}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d07a9424a6ff36ae051b3f30333a1e9e216c3d0e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e3ff027116a5cbcbacaa3adbdb82914532f28108 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b04c76807442e0a9bd49dddf057c38a1416de58 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317194}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a61c97306f76d9edf2fe982c27e4a351a143cec --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eb770942d7eded68582c9288a7635d1b1378f8a0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..43105c22ed27e2a398cf8ea15d100f7d684aac3c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..054c46cf6310efd579b3eab4861f1ea8b6bef08e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029992535385373314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2f773c1f059605887084d3bac93d3464cc3245bd --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b73720d8d1416fc272859f3db4ace75fd78eed0 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.44404332129963897, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029907396333795997}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c383fc6973850fb4c8e43df5ea67b55f0502139 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee0ce9649dc9dfef4c46b497bf25860588b0783 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb4b26ec25f4a76b89b47abc2d803a9f5c77ad6a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529117}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..21bc7964ba0c3c78459673fb4e75598442189160 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..37c6ed94dbd40dbd1de1fdd17814aee740fc8323 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..963d60e5c3362796c694fc95319c995d480beb69 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029992535385373314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5b5ebe99553010c0501d038942e3b7bbf0684cf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0ce64763506af0cc0d1f45ca6604eb02f84ec017 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d870943d9f9b1e68793812a81b5ead4a91c846fa --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..75a752b31cb73c46292e416f5c530038d2b1361c --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f37a159b4c425dd6e9849a23b4385c135e0ace35 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b0bcdeffceae22b9fc750978fd9f5b4e0f3b5a2a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a86f8a6b860c6048721db96be19a65ff43e20639 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..01c972ab7ff9246d5c5332e9440ded00add0fae4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..34fa8025a533bf1b517cea7c52b33615b96e097e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..354aef71d32530ff952bc625b3a1559bc80991bf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a8771f42dcded3e8f7ccbd70e279aba781e3ee84 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c5fd34c16dcd747358755f048497e138dec5fa6a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367597}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d2fb64aab7c97117578bfc2fe87bba58d8f91db --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367589}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050170094497697}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e21c59aef714613484b185efb167737abc7c7564 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014043619596174959}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..28e322afbaad45e7bc677eae76e6456247a69ab5 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5240726124704025, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else 
%} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014036189665395129}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5185477505919495, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014042813708888378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c72c4d24acdd1759039a07b5124c2d47845914af --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014043619596174962}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050170094497704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..10934889af92274d122ac0c4afb42510f7101319 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5185477505919495, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": 
"winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014042813708888378}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140451261309786}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad60fd0d4ecd434592e6f4542f3a81b13168075 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0140519560640769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b0558c08265c9f649e0194d4782da90bbc8407cc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.494869771112865, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049749833367585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e035cf7a86a85e4296b327c092484ca3a7229741 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076908}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014050555322824189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..922387617e6da1eddafad291b78067ceedba28dc --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4988161010260458, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052446290529022}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..13eb7d10d015a29c48f9cb258bcc34c2cf4d5ca4 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5027624309392266, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014048278820405621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..494f1c667fba3b5749bdaf12eb361230437adfae --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4940805051302289, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052376259225632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5aa3ea7cdceeb947b520cf5969df35f9d4081019 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db36486fe40dec9f4c54d1fb61c4fb6292cd65ba --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790513}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b39907f991bef4db6ea6ad3c5b354097b44f3229 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5303867403314917, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014026510839428743}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f95bd783d717d4ebe54cf06dc3efec3bd67837a2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5272296764009471, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014031631629827701}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fc4a5f1e4cc608f75c96e1ce827adb131544e4eb --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5288082083662194, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014029141615909612}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..78ee7a5470184c1dc631e33cfbfe559df3c2ba4a --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.526440410418311, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014032823874407229}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e26233282c9a8f3a099238fee2cafe0285d0a833 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049516}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..adaa946ec29a6375a07c49675fe5df8071914124 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5209155485398579, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", 
"subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014040185494212945}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..42208f60a5799725ee49da7f7085fcea4dc99285 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5240726124704025, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014036189665395132}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..73f26f037bc8f3c163e95a761ccc94f853497247 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", 
"acc": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049294536290391}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..129c6332c839d8bd29e3379d652f1d397ce88cdf --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330349}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82044dfcbbe65b01dd73927b46e4499eb28af5d0 --- /dev/null +++ 
b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4846093133385951, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404582678978367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_0.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7e71b7ee1292c82a1f0b61a5525d2197a935d8b2 --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440419}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_1.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c477310e58cec702e7f46e807aa63824e267c22e --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405621}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_2.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..18a4bbba87eb7580a9cd304e87dc280faa3c6e5d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5146014206787688, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014046492383275842}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_3.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..073fc4774cb95aba5f2f4124658955e9a0f7be3d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5248618784530387, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01403510288362775}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404880419985932}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_4.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fce77d7ce5c0d68bfa685caa4ee1c645b68b3b0d --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5232833464877664, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01403724130957364}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405174596179051}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_5.json b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6fdf9a812308eee0aba60d8d615cabfaecc062de --- /dev/null +++ b/4b284b28bc4/eval/agg.4b284b28bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5224940805051302, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014038257824059886}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5169692186266772, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014044390401612976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..35afbaa27d513589933c80a07cb6d6eabc155c09 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff325d51c8019b4b05700a6079d816a1155b700d15231d43d9f88783b0f9b27 +size 4120896 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..371a7d4e2596ec98a9434388febbd8fcc9a1ba03 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6a997f23e4e7dbc50849e8fc97aad7dd042720991a0717722d6511ede138078 +size 5090672 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ae8af46c1f0cf2f04dc18d50653a0216ba3fa1d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3beb66400f58287362f716012a1a123205bfef98b291daea7d5a120e7e6eef64 +size 6031122 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07ecce4d94b57011258f2778281a5ff5fb7d5b05 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7e421e415cf83c18935c88d146c986d4fc0e4927dbca0d7b0654fd8f64ee61 +size 6961546 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b8e8b051d88e616fc480449bc3b23ef48bad78c --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3911f505240829438f85aede74d31fca8059e7d53aa953d1e66d76fb9dbf33a1 +size 7880760 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4236c09703a47250496b7a6f7245a61d3d7b9b15 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed344a6ab4d02c2990d3ad46d553b7ea55600cb7e392ab191a243b1ee84427a9 +size 8787047 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7e79a412760135d74036301648a3b56d6506f87 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c71db376e53d148aa27d6f97e0b544fe54d370f3b0a8c492875ab07fe266814 +size 3945474 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fdb6a7cb5edd4f4ccb7930bf1bf8ebfdd9678246 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb28d260c225d074fe9257614c9eac84b2fd4c826a99ecddd42a94d85419064b +size 3866829 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33895e621f2f59122e3690b90ca25590d80b4ac5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c63e6be73927613e6af0060ef1435ea57161d530d55ceafe61046ec9f46a64b +size 4180631 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9262ab89af41a0bae20eab9e23e1169a0f19ddb5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:255cca8c17613b280f7e2b52180c34b04aa4118177bfdf2ca185e36429fe7aec +size 4876486 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2938bd0039d4bcbf13884a45cff6f668b927de7e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e38ef89156556cbb29ab755aa80b671fb65a1de29051b385f9e430645008027f +size 
5600948 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4652530682ac7202b74fc39f9fcc084ca6b1698 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b16fff17f053df354f32ff0aeb28879bdb654fe242e827a254875df1f2214454 +size 6308901 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6eae47d1b182d45c7def1d119c56a0b09e467d3c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:467c573e64b4e76f90b388e1be8444ce90448b1e339d15384baa2dd9470f163d +size 4806796 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ca6abafa5d353280a557aaaf7db41f5d0ed1e5b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ace5332a37f2ce9777007bf9881180475e4213a0f591122cd49a86a489bfb81 +size 4865026 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d92238ad69a87d7a67e1afd8cb63ccbd5ae1430d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8ea882cdceb05be3e8536802791d7bb4b2fb3f765cc36479d6965dce0334fde +size 5266963 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b98bb668c4ad8b82f448bf742074125c2f576bf --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d568bb35bb2b0327f4768d6c5399bfef730b2b2000126022f63b5848692b7159 +size 5783170 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46453123d46ba02ad698bb51e5905687f8603386 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:593a6a4b475975f7ccf038cc7bce509832932bc06ef01e34fae441836e5d3600 +size 6353611 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ee6002c80739d8f2e3c367a308852f7149f0edc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262fded47399ab88fdf262d9fe69e4a288b6adf70e0d58129a97cc4436c360b0 +size 6982950 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02baa186c6f3ea4ac3f9ce0874c6ce691a19906f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8848dcbe1ae2415a821351e54fd1b636f415cf744ca8d84e7563cc429cb5b6d1 +size 4598111 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82453e7d7965d4e3c6242b78317cd387ae3689f2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b093dd393da0c734529e08cb0bb5bd90e65e6ee0ce8c90d5d2cdf725ded69958 +size 4828429 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29005ae0b553cb33a28a76a4691a7fdfc46cdb72 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15002e903ff0042398935b0ca2debbc150f218702fd338c2f4a818e4473d26bb +size 5604581 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..758f9f24d319ffa60029ded2523bf243cad49de2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e74eb0701339e70be1a072586f1c8419fd681a3e9ecc19c1648b923e1fcf4e +size 6431559 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3e1367c7a76c4ca2fca3436c80e3359ecadf991 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0468108a5ecc7043d80c95de2ef6b180d1e481ebadb1616fee2608f349c5670f +size 7310185 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..897f2f13e8f09b4a60267a50b9a5e99c6719673d --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac3fbbe18c7ef96acb45557c2e6e0126f62286987901a76cc1c4db2cfff0be4e +size 8193103 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..837d119d44d6f487084410e4beb99f4c32298f1c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f96dd00be851abebe0a99bbc19e68750b29364ebe388ca3772c0b5859cea214 +size 4457266 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd53ecfe8d652a137fab33f8f1df514b86599853 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ae411e224c13bb1bc00b6cfc38c5e97da334a4af57e519474e4dafd738db257 +size 6912560 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9cda707686c218ad383541cb58ad5cabca3b432 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:203ac6d65863ee8564b7862136fa570d6cfabce19f38521d26087c65defc83a8 +size 7446969 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5415fe1da53c54df9ea201c056e5b53f29f86eb4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448da8fd6686c36987c1a0c72c792d92dd8e50a88790006718502879e1da2f6b +size 8223982 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae34fd1dfadb6ef47974c847923a269658ee7d3f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17e8d2280c39e6e0befdd60079421cf25a02e21a8ca502a9b4241ff41cd2807 +size 9640792 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4748d9b68b7b6814a89259a9d7c7e8a23c67080 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0245689eaf504534a7caecb556ec975129be0d596b1054f1b9aa36e577c9d137 +size 11104517 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90a6e4463676541f6aa7904ba17733409d66b38e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d1f66178f5b2badb345785b6a38a7818521c9dd541e2851e64ff72a60e98a4 +size 7935613 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30128ba8712681e1e4163ec06885ae4271ac2737 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba8412445e0ae8c4fb62f60de108e1aa332964ee5efd175aab2b2451b6b51e20 +size 13562794 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98e69f7ffd14332c6d1565b19529f19f779ebee5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c84765f38f42969cf3fd5acb5986d9c71b57ba088a245ddb343480015f02504 +size 19234125 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..989053f749c3637b675f982b0741aecdf34b8256 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af64e33855a8f43022f0869f02d6e513b679a6bb97db2301a90d2b34dc2731a +size 24714995 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec1587f4d0055ce362bef51f92658ecb6665fbcb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6856ee2c467084e4e77504d23403b45d40eacaa036bd040bb6c9bfba1b3789f +size 29904785 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e21dbb605204dfec3ba2b6647c794a461bc8f855 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2fae6298730b53eeea6c922603f4eeb599661790d3e1c4d0255119288e06217 +size 35294458 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..47e51ea878918c070bbc08da51e5a12650a1de24 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1444353ee52f9946edda425a8694d9ce3c83aa4326b2b94480833088cefe2d1 +size 7855261 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5fffee27a8d106d3670329d4b957f396f5eab6bb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:646ede29b49d0658ab769269d278839a763dac44cf99e479eb3ab5274b2c763e +size 13578876 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ea958777a67e253e13cdd810d873d1397c799a4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:374b7f9c0f5be3924468033b8780fb88da0b405bc69c3caefcfa5c94f66dfe9a +size 19270965 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba40adfb542a11ca8f10de1cacdb9b26ec622997 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f1f189e7cb7bc4a747d8152e91571b9b9969721d49c3c8498b17abac821c70b +size 24787729 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d893c821dc4beda470082813592ed5772edb98 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24fe4deac54f65ba4134fbe07e2cb84774d325d1d53ed9a27c0b98bdd2298430 +size 30012514 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..765ab68dcd3c4004133f7a9f6bb3d4d22b56d1b3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861349f949bb1f2f5cd0a95d334ac6a076bdc5516b9fdabc9277557338e7e4f8 +size 35437226 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..869489d24ee578758473914ff692474dc02afab9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b52ecf1e61550b1f446710ae0aae97a761d40aa1094212017397c3b9b81080e +size 7941533 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58e5164b1abdf26f8a276b0303b3359b308aa32b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22f989093ff08be48daca7c97efe20d1ca0e0ef97fba90f91e2171a6e7840c9 +size 13686461 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..968cc20ac27c6c333785b3b7a4ea4b51f6365e55 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2f6630a339ed3a5f5224c1e3f7cf7cbde523744f80d4dce7c8fa7b69ad1f037 +size 19377083 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d636538ab0b042ab2972638aaaaa0fe6d8c1900b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3309e9d6f3dfe336beb65025397db0f3d180ccbb9be009cf342f13d044632f1 +size 24870069 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e70e2e83791e8d8ca0a7c47ab71bd19e662e2c6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ced0969bd2bb62e7a34a596f4c59b6184868622afe13bf5045d56cc298b24ef +size 30123363 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ff4465b3aac03906ad56f8ba4629a16d54b5e58 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b77c449095332286f3be75d1513a35007b5ef7379bf79f054bf90f6142d162a +size 35580011 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60f749d74f066dabcbac6653f2e9d6115bcd344f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd4fec9611304f4390b3fa724ea393b8462586b04173d7cd0dfee61c3ff8ff5 +size 7697575 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad6b3245805c3d1672ee43a67b4ef68a411819df --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d7ddfa6eb43dd414717e54867e85e09de38ce9b78e3be7b77f6719b910506e +size 13314107 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99f79f5f3356e51fc0f241278cb31c54a5ae1b7a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e1ff0c8f9df815c44ac87ba66e4a02f6f7140318f03ec9c5a82e77eebaf7585 +size 18815990 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cabd0d17ed7d3dcdbe771aaa9b60e9aff5d6902 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1dcd292ae3f4304fcc54af0d18c74b2ae88e449c1fc3eac334d1f8347b482ff +size 24191576 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a56de348718a4211fab6b13c04bdab248379cf2c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:052348383a039978a87460d63a38645e131e83ecc21a88b2c076e157785c8b3d +size 29429823 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a4a881e561b5168af218c0bb465dc9687ea0f80 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f2b11add3ceafd016cce983f96077afa8dbc47d93705ec3f5fd3a3b96b61fa +size 34792526 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0286b48ea6cd74332a56f02c9f9d803ea74a1749 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:258009a5d768aa20d3e26d348c30dbe33d1f28ac5015f33a8c35b303d5158bef +size 8230301 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4d9f9fe9bcc0dd511c5889c389f6435fab5e6fe --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3a0feeb6761d5d7629bb8342469dfb83df28b81b1dc1fcde2d1875931b747a9 +size 14068415 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a826ee31fec57a78c819d89dfec0780b168c4817 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e6964d58eb139f062ae051237ba8f3d2cf764570acac72f78d5538fdec68f6 +size 19832248 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..912abe33217ca439bfc209d43548ecc065bd9b59 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f10caca9dd1913930caf6a2ff93d6129aea531b576f94d7ddba330d53c091b38 +size 25444828 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a7fc19ba14bf7d9ebe070c31d0078826e4798f0c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe507697f58c638bd70a983ea86162a8900dd3ef8d877731df5362f38543163 +size 30853758 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a48901270094e5eacb1fe2e84e8392377d5d4a77 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcd0bb94e5412b7abc2f78690aa856132369ebd91f2d1f6755c6644c1296c46 +size 36471852 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac2849c9a4ae45e6fd9da79fa65ba52103f76ef6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac95109eaee68e96c2a42b4d2f5640d822b0a188eab10c17ecc358b1f74b276 +size 993159 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0481c1b923ed8d5494538c44837da28b567e8cf5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6e48e4034b299019cd908cc9f840829f013531f0ce148abd855131b26acc2a8 +size 1452508 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..682ff9cf83847d1cc2e9643ef874d899ce136968 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8068af525bdec3d16de1ef2eb712c0632df721348ce296c8d2d99a52e9ebeb 
+size 1910819 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a7ec19c120bd0aae49b82a6878f2aa29541edef --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f4e2c17f87ad6bb89c72c1e9a3f14066038d3ff5e5a8ff17a8c6f97d51df946 +size 2368017 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27d3d0b44772e4f73ee56ba95e1e3d39790b73ce --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e056d833e9e5c4072de2413be80ab6186fbbe4a919f2312754a27f87dde6f6b +size 2822357 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a9cd161ed37af0969376df421361e13c71d9283 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cff036711ada6dbd7454e2500b348a39df97aef3f60a8aaec740547c9eb05aaf +size 3278647 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d547a4de266bd3cf593ae5e569aaa2c566c9ecda --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690fbff917364e834bd75674605aa260eadd58013b949c4593a3d5d2ab3599d8 +size 1203113 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e650004a683070c076bbd36891cd49a4732cdfae --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd1fd3ea579a24d1b2976d4552bf3a4c73097df2a988c9317c963724f1c3182e +size 2304197 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51a7ec297948e7e52d6103ca4929a652bf8c564b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4db68e638aab8cabb1491cbe4dfa7d88760e18595a52ea0f9899687b81d19e +size 2852905 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94849db82533825a35fc09085b3483c90d3dbefd --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb44ec55fb281b251fdbdfb526e71b9c338f81508d88dbf0ed847f3a71f43204 +size 3398553 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40557be60d5398f81b5bdc9015ced4efc8bcecbf --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500a6182ed906064fd8a88e1f31fdc881fe0c0a06bb87144f98209c4767035a2 +size 3946045 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d22283dc1077f38e7ecab4a048af33692e30971 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85771f6431e9109ab2ae6e508f71a8a3de7d527b6dbdca5c53ae3e075a01e1ed +size 1008054 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..696d8c0f1eceb32a0be5928393612d52e1ca4575 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46582a52edc9e78f06876530ce5c9e898284686533c6f5ed962a0ca1e0642781 +size 1949774 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3c408537eb7a9947a2a827ea948d6cce21cd5c6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d536660b2cfa4ae18cf3ae012bf31d5e75514f7880966739929805f564c5fbe +size 2418938 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b038002b04f1c66811c0bc0173d8a1d3c5f70d4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f28fc20a5cd5159e2e84f53b755cc66761dcbfadd8010c823cd9160545c70f2 +size 2885473 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b440aaa95033378a7df0e95dc339ab7822251daa --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25d3f1369914b248f477d70fcadc5b00c2a9e0b3856811dbfb82f7d8c3b7b723 +size 3353867 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21cf3770825a158992677a21540bda88bb25b07a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12a9482e877667f4c37501a19bc37e774611df9e24d8f4edd3ca5129b48d055a +size 1160854 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f7e74aa8bbcd88e3dfc421dc57cc3c7a9ae5024 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71995e309c72c2ba61eeb0b784e7ab3045210c20086f73210eee32e564338cef +size 1668559 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0cead8128d7013de94bcb3b84b69156595bf05f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a749a0684fa8bb4842e1d1d63ff6b2a3cc7cd2ef65a2b5733eeb96391d1cd0f +size 2176245 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..531f47f15c23376a13d147a682e48607c22d58cb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21e714991e9da58f5e535c2b460fcd87083528b7ad9bff8e50997cf2d7dd129 +size 2682983 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bccddc89c507e6eac722331f8b588fb01797433e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a485689dee6eec1ef10d4401f619deac56c92c560e44d85f644832c3bde1b987 +size 3186936 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..088670771735ace3bda87bd101fde5797c4e97f5 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99ae91a60f213781382ed7f0187cda3ee18c54c0c40cfd64cbea6421118a0146 +size 3692933 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d35d904d092e63c5ca64fc91aa34ddf18a56665 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ab512e5af08d467bde8ef49fcfc26ee69617a02a2084c72ed674f272428f54 +size 1027181 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78bb5a64da0003ac0a64eff7c3bf60f23b7872d4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c1fba83bf0c5cac8ec081ac798a09b9de964d59ee812858f645bd466bf95a08 +size 1503638 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c8ba8a703f9f6ae0d18e93a1200024c7a838c4d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc23502d8713302030dc2990e84b46c8ed8eac15657decee90da759ae8328c91 +size 1980744 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed6445fb9291dbd7f4ba54dd8b60f158db21283b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a556d07a51cfc4ceb15904771783aac2d61f6fe56f1997d0ac668f7a759a2ba5 +size 2455923 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d7602b055931f344b637eea72fb2a5598608999 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b63c79b4644b9d980268358b75f4394e4919f21989d97d069155be5ce634466 +size 2928396 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4fef8f4ffdaf8c737d79c70ed9fe22ac51f58d00 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5303df553c765a02b4f6980e8c106a5965b5e9df1b890c72513a44b0ba88b6ad +size 3402828 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..476eff79ad27b52c9afdb97090720c9646b4f3fc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe3703a63338855925c05e87e30eedb6b9960c82f4614632631c4a3e6b12d7d +size 993570 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d5e33b2c0b1f07ce63f5453dec0279cc71c2e86 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d55ff452c7719880c7eaafafcd40351759f8094f02d70c140bf1f3002120b67 +size 1447662 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e17d658b0f545bf5f4b544d140c23f79cde3bbfd --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c8920857b26b54cba993860798dc6e223e7a7d38dbb2668b22f8e2622acec2 +size 1900843 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ae80c36b4a08642c61f666ea7c54bd6affdf34b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa13bbda37da5766764cd7a17403e293239896a5571e56a075d31d3c653bc06e +size 2350608 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a78dadb7fb0b5108155b3e042a7897802fc01e58 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad95cfd55c0d58b9c69e4f8f0787bea30408d71830925510ebf5c3f6184264ae +size 2800277 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5339cfcf2475d7568320e5a16af4c26e6f3a021e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5e6aed633de24f14bb16d70fd715a39fcfb803892c6d12238d4d299937be717 +size 3250949 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c7ac7506511269df2ea761794d196e45fc1ac55 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea23220f3c3e538ec3f203b6341de022e2584102ab54f95f6cb1ededae235d70 +size 1203517 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..449b4af5cf2f6f5186068dac409377c20b66550a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e192398d23530ae268f179c2b6791c1531784e2e87af8e805fbd657eccbc3f74 +size 2294138 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1f2f5e90afe5f5997e7dfe32b87ff3cf471e4a2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d56684ef2653f4d365e1823a90cb09aab7bd7fc2aa248edf3f73d72a68ce4f9 +size 2835540 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5386af817d447a0c1f803254f7196928bc685d7b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0905788fe7431564ae9f46fd2b44e066e121bbd25b28fc04f86eee770e1cb09c +size 3376568 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d1a55172492002895b6461c478d5f9cb4f738b83 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9931d5e8f60a5cb6b83fbaf838519e9587299f6763fb47890aedf12950d35ea2 +size 3918484 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9ec57c0d4dd53e2bbfb44333fe951300c24da46 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a28e2f971a0e5717c15e2faa688990441347409c1a99ca498ea85625494286f +size 1008444 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..f6062eb58f18e3d54867a3b66f6e429c09f148a5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c6ff960993610189b31308f7de66f282ac5b386bbff84c742285fc4d9fdbfd +size 1939938 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..594b984d8b9ed3d09b72cb84987992988343da56 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a7e98ab5b0652f5ebe0646daa3f26f097900aa1ca05f010cd6a51c5a50706c +size 2401720 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9bd2443a8a132d3f4dcb152dea0c32f8b1fca1f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd9ee5d82bbcbd24036acd10923d7e45dd7ff215f482554305dfd4e66e4e3f1 +size 2863546 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..041656bcd585038cd3f807b45cc14a6e927de78e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a88c29ccd3532dfe74282dd6f96c427fda87a538fca91367b9e2e3596092a27 +size 3326297 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a6cc92bfb11f41d02d20500d89d5e6e7d4b3f490 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbdd1635bd05d118c0e1f09f1c81adbb0aa030efd3901b78d9b39c1f7a27867 +size 1161336 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9999d8403b9f19da7abef5611a0dac44f9e50e8a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f097a009bac5d033ac5653e09e25f6985eae0f5c5695552d9c177c9d2cbd34ae +size 1663729 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..653ef1b687c29cec80696c3247048dc14a95d416 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e56d30e25f1506c78989d04ed748641b5f392d7b97c9a1ded9ced520b74b90 +size 2165834 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49d080cd2bccf8fd7cdd3ee2abbfd5bfee94857d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6fc26dba2e7d501657ea643135089a6a14b9f3761680d13ef0c85ed149f976 +size 2664957 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcb557ff51cfb3e303023391a7ea5fa68720ffb5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd208bc1aec90478963b138b98ac328a78b64a9669a3160cf9986c4e3470529b +size 3164043 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7255ca6b9e2d0faa3eca1a4cacbda3a026f6df65 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f856d76531c99152844b69bf1b32f41f9ee9680e3b13fc7132e92efc909bf53 +size 3664278 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..505623d6042e8ea31e8913f5dd228f801fec8d48 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd9f8111995c357184f4d3f7a89f3f6650ff021d025dbfbd4bf995c64a3e6a4 +size 1027553 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4050f0db7bc813f6358267ad8afedc953593231 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a90656853e7290c9e2e95a4efd56ea866ccb109ccb75396445a4d75a206d7537 +size 1970902 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36f979f433b1b7cb0561aef90d60f985e398aebb --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8638245e626a1aaff576a8e02cc6301d48e20f590b220b0395f4bd2e39ea890 +size 2438703 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8c9af8dc8b35513475edd478cae76593c310695 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f07ba022a71e8961edd118e05b29d2aa772b68bc111766a66b13aa8648a7cdea +size 2906486 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd30a6143571d664b09db6c4729b45eb43991e99 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f8b1d619b9d54f2f150337269a4da2a483f3ca3888e206c68c7e94058e7e8f0 +size 3375251 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d98a4b2c07910c71824d8e7092e090b791aeb4d6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e6f0904078a9b415d52658d2b538229865b891d15133792059c634caf915ce +size 1169298 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fdb45ba7675e573ef19d77840b4dcc0fbdf68cc5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538b84c029ba8f8bcf7dcfdc46e80202812f5b430497ba6d3d2c804f919682ef +size 1699138 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc097b70ef35fcf8305754dac6997505e4e32352 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a52b3e101953bdfcca6707908b4444e76363c68a6f227a8d07fb7ee5d4729813 +size 2217771 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ddbaf06910e72579a4ea37ad61fb50c50b8d12a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3602eac109de1de705c66f0057ec56bdc24c8804064ca8da537ecb9ee07edc42 +size 2731052 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c9cba9b725b8761d7fab6b69803ec1494ba5877 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d81b4c1f47540f254c9b4b3ab5e7089bfba75bfe686573f0e6494a97d59e76b +size 3248561 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16a6fd216d72ef958148f67095bf5e2213cd813f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1143f360d8a46e06d8f49690e3e0dee5f6f269678732c51303aecc61a45e19c6 +size 3777066 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..faebd31c6dbb3607f86724c7c586b5a5befeed50 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4754a1bc4f51e2fa20e3e0f55adc30bbb7df99f0a32677992dae38330f464eb +size 1421223 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d330553797e9f6b7842cdee26cec1045db1f54a0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddae3e6bfffcac79eacab1e7e5c91cdcc0c0675b9e778e99d4addf57ada36614 +size 2061835 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..930a5190939f59ae8f6d184e90a6558452c9d240 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6a6748d013980c6a0099755623aca32f4d2293dff2685714f819cb41ec6feb4 +size 2689170 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2358d7a876a3ebd50a7b3ca784fda23eea403a9b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9413b1854b584989fe1b29f400764f1e7435ec17f695b677844736150d730b3 +size 3312052 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b532461041f21c62fbc652be12cb4a519910d33d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212e32b8aaefffc220121f8a3e508a04029120f47b0976925e7eff819094994b +size 3939055 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43b39b46389b51141a869bbe2974cc463777dc7d --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62820f786f47d037487a27c79e87770f770b10fab10c8386f0d2532bf2813b31 +size 4577019 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..285d18c47c6e817d9addf7b33bde694829e40601 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02fda6230a7acf41c3545319c9b3ce4bffa96e17fc44198cab8937bb8f4ed8c9 +size 1186978 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d051f54b4064fcb76586214c1e7af7400213d49f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb68bb66553bc3bfab597c448c04a1af34aa2ada3be50a73dbb637db2ed8281 +size 2264742 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dfae462b30ef990110440cc5c09a9ef799c2615c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1a9cd69dcd1adf20b6973b4a4cfe2b8351488805e69c23ff73c45ed4c036ba9 +size 2792177 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b80ff1f59c954f8129b4b2fe022ebb9b7d8b8ae2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7fef4ca54300c9a37bf1e1d5ea2bfcf973d4e6919c27105105a05fe9d9f8245 +size 3324249 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a30f1d3c9f0ae6b1e54309f2c3767169296ba68 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a876605739daeefd7f09cf35e77ff5dd6820ba78ff2b785972b67700643856 +size 3867241 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c85697d8498db9d3d1dbab52741a1e883ea5844 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3676a2e1649333a5201215ee4956d132564eb9b042c6ff49c2af3deb51c28242 +size 1370787 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fdb354019f7241e5475f99a08a775d4dda390aa4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5c4160b15b6edd3078a8b62d05474bb16e932e00ba3f357db1e6704ed6e7f3 +size 1959077 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13d2a837e3a76a53a27081e9f1966b12bcef17e6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f901bdcc691a44d91cbeee7bb6816897d31e310c925c4a7fdabc2609d9107f +size 2536952 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10607830062990a806cc97b718cd8a7d72b3a4da --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ae42596bad1a4f231b72b1a786128fac509e9145e258fab097c9fb77ddb1bd +size 3109745 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..958649a6cf6ab6de396f146a24fca0f6cb056bcc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d85685be674514c7cdf11f45efc04af92d7d1bf7046bb8a518836d64db920d1 +size 3686880 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3afe2113d8caf2d851526045e616f4e21dd0ad7a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54d3f0c2c5c168f31d75533382bff00a1285187367687b06fbbab58a3e21b4e +size 4275035 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45311831a6cd2f2ba6b906ddfab0150852aed4f2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb143e70708d72081a292937d0138ff8f67882b90d511f3132dc2452b5cfa9ff +size 1209979 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..daf91692fceca191d90cb7a099adb0b86942b14d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e365b561e60504b3c803d712775475e2f92401c22ea28a00698baabdb94218e7 +size 1760735 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bddae314f8e340d2fa659d128c7900e733f82526 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e146e73f95f6849b0cb336c8e6ae68d1ce5267c5a35425d67913db6878b9f49e +size 2301890 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6847fa6889e540ea64f8fdac8e63dc5717200413 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0038d8ce0241c82788b17d60ee019b2179f162ee0ee98430c993645db9c646e5 +size 2836574 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7870bef4b603d5e0ee47d17ae28c9bc7e12dd6ea --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94596349c24ec56368759b0a39a17d88ac0dfd11b9b1fb33438fd7e99454aa9 +size 3375810 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2de8d6f962a923f1c4252db6c1dec1b0f03f1bc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b86ad463f4b33b84c6969d302a14ee0df95f7263575dfbc93f24209c9aba5e +size 3925982 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df0136ea11a7c457fd7cf94f6f03b4b1d65f1622 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ca09b994637dcb0b2b0fbc81f1b3e0344de0714142cd0ab2d73b0900e08746b +size 1216777 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd10b56dcd52971f5ecdf15c3dfd38230ee86da3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:75cef8d82103f4037093589f9581494003884adf15a26d6f22e709185d7461f5 +size 1670632 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2452bbc84db2625d8547e3f0eb9f6e830b743063 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8437fe1da647bfe0d3b826e260e50538fea04a4db71387b27f4931abad44dbca +size 2120084 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f93e29648812c6cd974f774d0bd4c5225c468f3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2732be3d86f91873b73afc130a9c92c9c58ca809e72d3a0ad088243e131ce27 +size 2577615 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4689323aadf77a80bab9a3c64d1791f1c03dc315 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7643ed906197b0e812eca88189ffb885df39ca96cad1a4617ae552d59dc7c0e3 +size 3027517 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97b28686e153ce1ac00a8b3e03a35cb3b9d0953d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76601d44b08e18128495ef4290f99f287aedcfb9518921989f2db8637bab3bc +size 3479076 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9391e0a2126215a55f44542d0ef4e4258e22b4d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7369eca7424527e8fadaeb8a70bc62fd26f4a366234d0c6f8e4634c6149bf3e7 +size 1458247 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10b65f04567c6fbc773e7f359e35904bf5aa6971 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8732e079bb222bc95561d757f53184831898e08ad0b0173299031ee6259bc90c +size 1960981 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69e94e248e784a5f4b545d6d4060278f9b6a9d48 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9518d55b29adb0b8c0effc618bd1922385f482c2fa9e22970116f3e863ca4660 +size 2457817 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2bc3a8d8c304cd8344df3a8d6d466ecf16349b8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e129f439b64896393b9e3c35a3d39d6485d879de8c9925a90d811d56ab139908 +size 2963600 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..839d7bc6dfc2da69a303295a5128d9203d556054 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08819095fed0d9a3c92280b53fc60a133cbcbeb12565995b1caf7949c1b74603 +size 3461714 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b309681e0944f54fe5644e41b186d549f4da118 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd977f7e9a9bd2e286c24c7d303797893c9be6f5efc92ded5c203dfdc67a18bd +size 3960975 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c352918388d52fb58878670c0b278669767a37f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad01c2f6a79017bb9b9812b2dd44a25d8043dcb84aaa29b1c0a38007db8123c +size 1505973 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb8e4c5704a89a78d046d65aa123b231588e3a9f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f5c710700ce941b6c302823e00a71a8f6bbff23c341745709ed33c363d4338e +size 2033252 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14c583aeb76b4aa1937a66d323fe5a1dbb250fb1 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbd93b721672cc351d41f1b04a112c2fb879bde4ce8e25d190e633778b744f12 +size 2554299 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..58697a939b3cdc2792729bdeb1377e3e223c260a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6f939b605de27ab66b94561376fb1a12ee3da19183747e10fd8f53a53d5daf +size 3084502 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40eae01201fe8ac40d7f9d4913d1889af97dc40b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f534d03d9e1de063d7d2c065bf45105cb48f36ae0f7ae68a179e7afdb889a707 +size 3606634 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89fb7a700c21f6a5c1eeae73da8708f9d324e305 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7792b2ea2d14db68e8d9ac5744952135c4a41c7c44bfa37d8f7b3690ba0b7ce +size 4130537 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2c93a6e11363c0cebb19fb3de88751d231367ca --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83e7b10c43ae0e31420853eb2572d1bf2be8de0a2cdd9d1af5671b38b47ebaaa +size 1202714 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1061f72f16398b8d42c3c46221dc446c799bdf63 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4ff20ad6e8c9064488d51e9428c28fa26222cd3a48bb1df78d12cd0f6783d6 +size 1638992 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21a747f2ea40572718d5e431d18e40d3995b22d3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bbf437d05d56c53dc787697d9883a0c9f88c67f6ee8008d5679b311661464e +size 2070864 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eab6bd7ee09602ef6f9b86adb22d7c24a1fd39d5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:e386ef3a48695df876b61b3458027b7ffeb2564fb80bf7e4befd4fa6faf4b33f +size 2510815 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a7115d8ab1048cc6ffa9b321f1671ace39bc4f3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053424a0f0a08b8d0cd53c7e1991fcebcf6a0f9cde973d5de8fa588ed51b5c75 +size 2943137 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbf1915252f696588b25c3abd695f25b4ebcef07 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de9251b9d70c1bbf7686a0cfff17f50be25d4f4459b492c7069cac29cb806d1 +size 3377116 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4f1db648d7f580d3195abb8d9466aee695502af --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49319a2413db86b4377b1ba53a577fe0f992cd4a66679dc70a2c3d10349f505a +size 1187511 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dfb911f732223e143a64dca3edb91659558cd4b5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b229a15b4008debb90c2024a91643f6418ccc4447e1d6f69ea04915931a4d4b6 +size 1557862 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f443d8485baee0379f7b03fd2408211745f99ca --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f143ba6e9f60fe07bfd906b642a995930c23fa15c23932c556da1c1adb22bb +size 1922308 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68b1670a287393177e8b85534ee1d751cb3e502b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78d953a7979f44a07c6427dc871862672b1d4cbf63c9de06aae702b8e61e5da +size 2295621 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..029e3a1ab7159142779a8d65742e918665137d96 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d23debbe174c9890ee0795554f9a3579a44b82e1d4463afcf45bfa897d33719 +size 2661189 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8e25e31fe3d76adffa734374125b597c7c10363 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc6816b23ecdbc8cc5c6c9a3c07532ae3446ff8e7e2757756d45864c483835e1 +size 3027855 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe9c77f6c27e826259f0302179ae81d2dde17656 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f649b90e3223a4706dc2b62b376dc8db7cc5d68a04a663eeed2d1502f45ddfcb +size 2351241 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6588a7dd2a3af831da5b1caa47c5e8bd7f7a9fe1 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d3fbb8e680dd54b91e31704bf157daa73ce88fd51ede6861279506bf018e4c1 +size 3174164 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b878928eeb9d17b18ac16bd04495b09872986e8f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276ff124ddcf02d23eac4e46b113ccf899941aa4df8acb57a5c4f987b7543fff +size 4008946 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05bc28312a47511be635a45ce5e7410ea9b4e84e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83afc6941d4cd6fcf7a6bb5552c4f8b54504b8065fa7d55c2ea6861a862e3462 +size 4831612 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..04253463d96445353b7743b3476584a2f129056b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8766690bfc68b14ed87941b9ef9feb76592c7f20b4a22ff110073a466ab4b426 +size 5662356 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..298a93092cfbb77789070b338c53c102d5d7da4a --- 
/dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dcccf06f4680f4b730d93e333fadf6201e5062ee7c57d93718eb9d56e1b111 +size 6494729 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48e33b388f6739461cb05e1eee1353db81a96aef --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cad97f508c972bd80763740fa4bfe0df6db80d488bff4c2973fcdf083962891 +size 2745693 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4081e84fc78f18c388b45a61fa9cfbb16f8c11ac --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ff8993cb42c2d916bb9a73507e9220c4444277ac91375ccbad193e853107ba8 +size 3648935 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ee217ef409dee677e8fbbdeb317b91fd1b0763b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a17823ac810deb78a700e3f78666692f9e1638b8101eb13b37fc071a2403522 +size 4566360 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97849afc5361ab2a0549bf0ec2cb7233f34015c0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761b66f026d0a575889eb2f076e91debb18f7908f6a5db7705c3079ff6bec8c8 +size 5469985 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f2e28f7673ee9c3a070beedc3920d514ff2262a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f89640861b8b13813b49905c16c242ec98e137855f3d77f84527d6a648469b7 +size 6382479 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dee11f2a8594f2882abb963c67b443ce8b97dcab --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fecf671536959b7417b25144ae9b09208dd540d9324472b889c5be03c3cef8c +size 7294926 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b472186688207cc4697c8c67c5267022d1c8cb0f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0dbfe665cb832df324da3f22fbf6486ef66a272fbf2861116121a4dd3c7239 +size 2843553 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0044a5091d9e8cfac1a08481ad43758813183557 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fe510e46b6db08171181038b4fbc23985fd08472f93577dc1814867536653f7 +size 3795808 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4673138e35960a7335fa7a5633e2a099962c2e13 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf096c58f2596db49426d0048e626ba5f19be3ac304c625a85b32eebd73122b9 +size 4763483 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa7d67a75dbaa54a44e5e2d1c2e012d677dcf4a2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e86758032b2d1ff857025d68873bb90bb490cf3c54e6cc93413abed8a2175b +size 5716273 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1251898cf45629a69229b56125566f228f71914 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:818de3f2bc47df84e2c0da6555ae687b5f6de3c582bde4d2a696ded7e48680f8 +size 6678666 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..168ede1c4b8d52707598745a6c73fe407fd297fb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaa7e23f5106dc4884265af550263be0c7f5db0f7a2bb81a65166b26252c91ea +size 7640620 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c753520ad30b789e6e1c9505dc6f97401ac17bd --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd046712399c543cd61d597e38970ee38223863730401642237f87665ad6d7cc +size 2322732 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94383ee49138d1cda2e90029bc6f0216d5228784 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd062ab61348283570fb52742ffe0a957362c5c6dd3b36421475e4316224af0 +size 3110021 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71bbdd6427bbe639c444815e01ebeed384732540 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c036b9d66b21e98a1cd2642f8484af009429908c643d19f123cea20303549d66 +size 3909160 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e408bcec6b7203ff020bb86016f1de74ed787d5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b79a73ceaaf64415d2d3e98d64a39456265ab995d912881d71cf0211635c15fa +size 4696184 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..353826f749a9a21c41ca6c37ac3acf964abf0fc1 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e34931cbe7275e356bd7e9714e4b5289f675da414e06621299fefb8b63267b5d +size 5491290 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75b9e53ef7b4b873d24e1b69e859e5d34f4877ea --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6fc42152ecbd25bb55972845a9b36c6f3333bd30cc7a43deaf3f665d914bd4 +size 6288023 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52d154f7552e17e9e6e924979e3cee6d5881e72a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e7d2c2f3e050e05be4cf175d756c41173dae3c41db5a8c3fcb8324317c2260 +size 2196865 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4296c8aa6475d747043c3f9da58fb4778ca114fb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:14a1d6e9db2cfb48c329ee710a0485347a8e13c96178c201337cf6ceb33f6ab3 +size 2831570 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ab6d7a47240f665327248e64fec80a8128cb449 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea40e638408da69f6ea9e14a803eea4885e0d237337da83156217004066f8ae +size 3480446 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f95258a5eb5516ce11d256cd0a32a8669bf95ed --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e33e12195205c18c989d15e641a254d45158e402210e2a8cfdfef8f28ef09b +size 4115466 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0175da82ebac34e41512cb8d752fe36398ef9e33 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:899e0c38cf496616282c1c21766c38c8922121e50b631d934adde3551c3483d3 +size 4759370 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29bc84ede4832ec0f6a06e66162a6b48ca9f9d57 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd4da52d243def8daa3c89bc453651254717537a8078e5a6bcfbd68ed8702eb +size 5403447 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecc6d61c47c523f440fc5d0e89711b2cb433e75e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e2bd09b108598ebd0eade31feb7dd0feb58ef12885df626984bcc491720022 +size 3641812 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..161ee41b92f7d2f1f649b8774c9caa9240b7f6a5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed2d54b5e6127fd53fd242b28c398f445a94e211d0a331e04e22f646fdcd4f5 +size 5657319 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bffae9c8d24bc87bcc51e8829ae87586266750ac --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f1bcccbf9eea09d9688dcdf6eecb6319936ab310cae4b4b223af2908f5deacc +size 7694470 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee1183050e8c0a53d4e940f9ec8265305c759281 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de08bdda9c42bf88a7815f27e36f01b913cd5f0d4714dd5a822d2b8774855a55 +size 9728608 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cae83192e605d00ec5a7810c6e9ddc45e21aaeea --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5507719a8be2155c696fee7c04a89b538a4971167e8f2a755a3aa40f4968eb42 +size 11769272 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a71efdd7bc2dd5ab447138a36cf92bfcfe9174c7 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3532338350888b92970f041fcb9b8fc06cb8d88b3aa6162dbf72e0802656687e +size 13790833 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3896494af01ca61c8243631a1929a7662b72b5f3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd59a024be48bb77c7e3853de2c606ee907970890bcd9ade8286170079828f64 +size 3984683 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6606fa856271ae580da189bce74f85413c50036 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:388974a6ffd707e696660e0f604eefbbd4ad3019b16281815ad04f5525f108ae +size 6168136 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bff393ba19be9997690d0782030759b17975a92 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:182e362b7274e9e780d7a0ddce2a885487bd9567823b94f5e81abe82014027ea +size 8371655 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ee029dd022f02b9ee91001ef4d2edda5562dba8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a128fe4071b75715497a653e4d268ba869cecd1890f54fe2cba4964707ce430 +size 10572775 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_4.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..054fe6d41af574e85af2b45b8edfc79d4d73d37e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95cf75cc24ad971f9224165894592be8ba2ead3e8bc0dbe9838dc82bf539b1d6 +size 12780585 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecaf151b3a44feb3f38c2e4f57d21277d2047448 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793b6919eac2b2f92b77ba5ef0ee27974a18dda0f07b0ff9c714b6cf11054ef0 +size 14969380 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13a32cdad3fa3940c15b5072cba820b3233ad416 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a6384f366f8086fcb9c06646a07951c5eca49e2cd7aaac14689b84e6779b516 +size 4041663 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efcfed578d0ec29d58497ba923735c0bcff61425 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a94750496aae0644f8ea9a0479e9af633718fed91c626d706671e1cab31f086 +size 6261131 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..394a655b3fe7d2df43b68e5360ba82250264c136 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d53e071683222818a44d348239e127cac258180ef9b2392eeefc68b533cb71d +size 8501446 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40dfe7d77459a38e86a167c104c98ca3c2e0a22c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9286c6d11f9f03a7c0d45cd8c60381aff64f3e1560a67343245ef2a62bd53797 +size 10738766 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9f25797a5f9e839bb414243549aa98752dd92e2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfa0f8ef760511ddeca13ec36d99bd38a1a5d768701a6647973ff0afb9e2c57a +size 12982684 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..201c2dfbe73164af6baac7c4d395ab069079690b 
--- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86530918856c3632fd3230fafb5323226fa37062a48baedac4af6811cd910092 +size 15207519 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e80343007491940444e31e2a6b7835269a76f49 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e9a2636cec1d8dc13d6b174b6b18f45fc9af726d1cd98b29256940f1e2456ba +size 3665152 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b16e11d77fd58ba7b08d27c22a0379a7ae462bb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6bcd1dd1c73cceda21c765490fb1c0bfdff0a446aac38d958274d67b14ea0a8 +size 5688106 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6678fff486006c34d13ba4fb973df2e001d999d9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3f0dca757e7cf045fef91fe01229f3eaee25c820913c4b3a053e602151d238 +size 7732700 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0664ad2c80abeb3ef54865e8734fa8f5d3f70eb3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a25badd305775c00671d6e53ea2d259a9cd9b84a6bb6ca70f81635c18660683 +size 9774843 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e14dd01f4a78f7bf06f90ff326f63fcaa9bebfc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b4111622c3cf55befbcc8f14560d50b598a35f67057d96df9e80692eac3cbc7 +size 11823648 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1cc2f7d55d79042985e2e7146e5b64e2c48f8ac5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b66c5f05f72a40a5830e0e08aa5c656c1ed563e2b0e359e786ec23f0024b0d +size 13853443 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7733a2fd6f0ea75503ac7b19f09894217bd36fe --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1376a4322aa2d71c1faf8cb1ced7013b39dca49500b983c629e250dd83e9876c +size 3859526 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a7a0124e6239ef5770abb886d55631b57bddc1e0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f05321cdb37809ecf8c98f53a66dc9ab94901a058f7d86605b14d8bf42f990 +size 8142175 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ccead397d29247fca24b515e1c6f49b3aedcc67 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc3e6e1de1252e348e946fa2122da17da9e911e039259fa564a21abd772473f +size 10290457 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3128fc943a3a18284eca5d0f3568a732d00c61aa --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd506a5926decceb81affa0e27170a2fb61033c80be00c2c2166dc47eee5aad +size 12445167 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f12d6d520f23e83e5205a20b5d84b28880d1eeb3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301b8206d739733f71fce9e0d50def1e973895030fa3c4382455af0321e79665 +size 14580744 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdefc45f9ac45f9db5d60d3cc1a4675b1be06f84 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437a044e31d59b98da6c5c77ff929e13cb0e83a275e8b5b1872a5b5de1fd58ca +size 55151 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1728e31ffc2c029f2141cec865b7fba9dd854972 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3672e56b6e91da7cf5a0eb77f9d0676c2833e9da6f1d3833b767f00e80078f9f +size 77973 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a3d4331488692429a65a728e45ae06c2f33f97a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ee6a0a936bcdfdb6066866c76790a99da8024b75ba5aef6c2871a14ed8117c5 +size 99577 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9010690948d7994821d2447db7cd59fee0d15753 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f508fbf4c4657b96d7ce5d101f8f294642589d6bdee6d8f3a447ff6e8bef40ed +size 120735 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f9e039ed4f169bcaaad509b1572711ba062c542 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:427eed000838a9fd20bd0f09827b815d604fe797fec51b85c5510e45835b179c +size 142861 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a49e548ee3a59a24b25c701c9f033c5dabac4de4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e2d9f758ea2a75a0c9e207d481c4b2e8662520cb2309efd7d2f32e67d9737d +size 163692 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..922284558096bdd07cd59df4c51b6b5b2e3a6308 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b40e43921266c5e817f505a51d4dc94fca37c119dbde8dd3a0acfab85d9bfd6 +size 66228 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a3e77da4177e5bb8a3043f45620073031f0e3e9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb23d9834079a4e9cbb5e06da538ca208bdd592f4e6e5552d1889c07349c248d +size 120807 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_3.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7348dec48bf2f383e993bd151652269ac9d471c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c5891c90cd86f66f6d71815ee8254937202036974d26550d3193dae3003b3fb +size 147045 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d27680e404b8e21b5885122361e2c999fd2ff54d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c8245d8c6de55cd6197b62dfc4bb23e81a02060e68d3c4fd60ff7e206b05aa +size 174250 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f2f60aae5eb122ba8413cf9b813b7d3f5488d58 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04b8f1117eb3b272b65764ef24a694e986e8be4a3ffa18d14095dfb53ba48ea4 +size 200166 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a4ee4d24bfee15eaa1a624d438d769973bfbb50 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d70bc918691cede81356d0b65714df7d487baa453a20781cf6435be7c1e1800 +size 56318 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..235dd2d757e9d5e566fb92d12f4e251f99327adc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb73ec208dfbbb0e3c7d3dcc9119ade35974807077e4492cdd206f03e7cb9fb +size 102072 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9da340c936db0b1869cab10120b9388bdc73e84 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa311ba8936e6ad9be0c5b7d5b37a30e9dccfdd2cc5895877026c3c525fbb803 +size 123886 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..58e2dda149e707e2e28c384d489162e696d818b6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6acbfb0ba0c45eb8b704c5ca183c815aa302b1c5b62e8334b622ddfa477a122 +size 146669 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22137d2bd5db5c450e3d22d34f36dfbe74f27704 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f243961ad807c4c55bb0d5fc03ed0507c27fdb6ebf47e4204253c69c1da8b8b1 +size 168159 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0226a14228a39e3731452eb787c328ec39ef4279 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590df1d74bd61244bc789cbb0a0de1f667aa1311b3d96ae30b24f961a833dc54 +size 63947 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca77e5d15009ed37e1bc763f9c64e0c57eb5a53d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae1290ac500757964c05fae33d4b605243b85605f5b814f50610f25fd6a8b82 +size 89650 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..159bc1f670107564526b705c4cbc6981f14a1751 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfa050cdd5ca58da0004c0f654c4aae2463c9594646ee13618dfdbaff8d06e72 +size 114109 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd0d730a05fa77479eda684369d514f454c60ed2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f29c39a627c1e3cc5dd0b6fb9a5675129e2e451ee057c933edec0ff11efbc67 +size 138134 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d547be2b0e38df3120047585e91a07fa581f73f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8e52fd1b3286b86833cba69dafb0a67b1edeb3d542355af164bd6ff3ffaf0d +size 163114 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a30e3480e4c0c01995caf6c49dcd3d15ebbb3b92 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3cef0140641563559cd3fb7c903b8ab410b2ae45f565381f9f55c4b47c00c5 +size 186794 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef4e71e71d44f72478e1953d30a61c3f9f67ba5c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef14a28c9ff7821b2f706c96c32e074f2c2b45b6eb503f32935d2ee764541569 +size 57324 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7492a28a98633262f712635089a3ce08c02d97c6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a300a759f813a20527fa83fe34a06b855b3fa907b429c699af8b979352898fe +size 103749 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5010e7e7a160d6a8fac61e558fd8caaf1b7405a6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3854f4dfacb3c29c30a7d006314c9d8af90c2132bb1903c61bbcc7aab4554d98 +size 125899 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3db7f55d59c4d70d3eccbaafb3f0a4cb6b7649ed --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a69f80d3097f2844bac1ba29cfb930a1be15faf6a132dd5e3340395b55e2906 +size 149020 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21518085f65c75dc851765151971ffc77edf4adc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ef9de7646924051e61a25ade7c26549dcd217196b3f7bbc7bd6374a67ecfb72 +size 170841 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60b9b470794f3611d573678aa058c13910c52114 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc77f35e681e09aa91c4947b7e8d17d46658c264550c4a91c2d6d58f65c8ce7 +size 92208 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2fb30ff5f21df0a23c72d01b204293658738752d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b29786e083296bc8c41743772fcc70880487a059ee77fd252dbb2eb59f9c493 +size 111562 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cea0cae7792e035b2e27c7d5b615f99968fd0725 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233aa96c22a226dee575604734a124f5e635c34308da56e9a11a3a3b07075358 +size 132094 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5bdaa819b47c1d986beaa50f0db740aa7766dbf --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:982fb640250e638e0b82ace087f4a8d171456ade92ad9fa5955172b367176cea +size 152241 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8b9a66f5af0e716bb1e66c9d5a750f8964b32aa --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b16bc0c15f48bace8c5eead3ff244d87c90ca97056de7c5a1ce536deb8f1d46 +size 172045 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aaec3a3cf3c7513f13e5cf57c8d6b506c8f44cda --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed68812a8879b67b3b36effb4bcd881ae2849e366938b755ba546427a558a728 +size 191999 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42613be0ceead170cd3b6e591f1dd7a7bd0a3086 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a16485df7240b24d10e8b6c0dd98494aac4c4de7d3c6fe4dcd99f561994361 +size 87862 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..0dfa990220d05b32187006da38d43da8d4aa02f9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24cb34224a47ed53e14dc75387a1af1c0e08bfc792e83548763dc9fbbedc0765 +size 105079 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c9512ac29886ba10850825ff63d4fefe9c41a6b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05e241f89c8840d21b8683ebdfb3542cd27c4f0032992529d3229d722251aac +size 123431 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4366f08f31f1c99b8d83511826b56aa3c82621e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7846c39d3670cd928915ddbb2a5a9c95c1ba47177cb0f5c25db895699304be68 +size 141413 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b77e5d4ef0003a29f428584bd9179cb32fb54fe --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6047e1c57737235732d16e13cf35b39f2c06cb39344c259d8f08e7a3370c6a9 +size 159093 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dfcc524da0afbcc54912c5069aaf7881e32c399b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527c1773d2103bab9c219619bfc0275018490a1a10a33a9278a331ebf1c85f90 +size 176896 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..858f6c89459134db7b0890c7e4195c416846f80f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b75cb145cc983941e88da1ee46f39e7e7903a1600a3e4c03968085cedeed5680 +size 85181 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37236471ae2a23fc9f1aa0232d4fbda0e9ec230b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04cb2314d786b9b23cd8cd81c035613742575a1870bd9fa845dcf50e5a53b2fb +size 101261 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..615022988ddf9cb08e18876cd61e5a25ad3ad671 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d4308b11120eaa957e2339aaac1a424f56e0ce637651e2529e6528bec695a96e +size 118470 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4904e7d1efa4e81f48150cc6e38f55327d6d9211 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc8bedb15a2783e602f966c19397e64f6085dae4038c8716d085c2cc13cac3aa +size 135384 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7166e57472dc4ad267375cde7f7eab21a5bdbe5c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7745162a98c7927a1a70e811d050bb7f96b31bb8591dcee12ba27fbd2dd79b0e +size 151938 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7236585f67e8c5a8391c847982f4efa4430a3a1b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cfe76932559af9f3c51f9c58da0621b323c806c213dae0d8529dadf4b9e9bb2 +size 168732 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c0bc646a6fe12ee3a62d1203c258e3cac7ee399 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8468412532606d091a4a97856adfd39e14a9853179b13f3508c4eeb6882d1d71 +size 96777 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a69de38e1683f13683cabb222018addecf019b03 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408c03bf918d6fe112001fb6a25ec79e82d63df94e7929021eb1c870b7736cb9 +size 118295 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..422de1ee56743829512877d5e9c956f4d68983ce --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9989142213932f459a51c8bafb5c19415ae371e08908e563c841a552187f2691 +size 140929 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f25289bccf8e910a4a550b3efbfc55d1df8c9532 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7336c803617032252ce31f70d5164b9a9b6eb365b760c1eb0ae27b649c31a3c +size 163204 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed1f4337920dd770ef712e86ced1a1cf7343e982 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd806aef8768b8d88414c1f54ce09153930c2a88c0998a4bdd39042bf6196f4 +size 185187 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b5a2158bbe7d34140975980ddb3a588daf75c93 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1fb12607d05cc8d7aaec6251ac2f5214ed8cc58ca2a10ccc147d05afe12457e +size 207299 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62bf18dc2654f55486467cffb88b3ac1c1d22b65 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda1edcb07ea712b5982b7cf3dda678500eb1f842287b7a3d303c096776255e1 +size 95771 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f8a745a604c0e03acd7f80b74a20cd8c84f3d9f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ee5d715b6fae3541639b6327b3da38b9d6fa5468ff2c61b6e827b26d24a23d +size 115634 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..848f2dcb000ca2ce93a6be99cb3d4c2c3fde3395 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92176e9fff05469c8831252e6e2fc81a64dd61f0836a04a83400ffc80256e623 +size 136683 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..453817c892f7627828c648bee1c8cdb749441bc0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d6e88351f5fc8f15c8c336a210438b7ae2c2e80aa0884a0ded7e33b662b54d +size 157386 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f28fff0110d57b78f460a53816bc0a7c2ecd55d1 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c641eda1e8ba7d89c9155fcaca1fb638a77c6bfa5bbef88081940c2a881a6674 
+size 177733 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27e0413288f566814aaa915ac75e8dfb2c932587 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae201de85c7964e16a4483e9d67a3a5d9fc163951b0cf086e92d5b0afdf6e42f +size 198328 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c4d1927f4830811a8bd5e6c8275ec52e31a96e6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0556a4ce0a78eac45a595a7a0578ed0c8c3fafeaadba5795447af9f14f7c1d12 +size 3536909 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1bc950bfd88439fe23db79943040a8a9e8f335aa --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2ea75de31b51f45da13faddca234549e79c1029897306309f65018c98eecd2 +size 4382403 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63ae8036afc508bca8c592a626457027514eb67c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41218209f41bcdd562620a2d3d204be215f7ae7d700d2d9b2bbbc2f4f18f525d +size 5326446 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..231649c20164495af9e99631558d6837a3bfabaa --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ef656659f88adb858156e98cee9c888cad57d62e990880296f9b825c688368 +size 6274001 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..510eee37a38d5d29c7a9f82799ed0dcf8aa5da3e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50099263b36b36952d2758d5778dd2712e34fefed32a191d7b6643bb8f430271 +size 7237269 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61b87964b8611f36436b35bd13d46e10a985240a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2ede586bd5adc98a590c2d29425d6b8ac0ade7040a35d3582b7751dcad00083d +size 8219083 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4d829280a81f285ad6e0f17c73a16cf2aba120c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6272583d310c0599eb8e546cd8de503e56ac371ec252fd2844424080df0eb49 +size 3415274 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed044cbe36d0083f981011731fc35ddd52cae79c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cd2c8e0267ac66bd7f9a1fe77d7cd61c5fc5f13288f6565ba986d46ac0113ca +size 4358519 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e045e4e7adb6008ced13cfa43adb94937c8c5bc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efdc1cb4bdf7c393fc4989a60906467637558318ef5e66f3f36bb38614cb2f72 +size 5272798 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3133095e26073db9cf5fd5a2b0cb0a73bd3ff992 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0827eef2b750e626748d74dd250efcca275f28191f911b697346a732b00651d +size 6184474 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d762c2d3c2e15afe10fe4b0565bffe9908b65250 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a114545b0eb309b5c576006b1c62076cbaf4a68df9f47020260ba26d6ba5fc3 +size 7095570 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d140cff25bf245c0af15b8440b84fca0214e10ba --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a124f3d94c01a32b42d4c6852febbc09dce6b4f37cb96ee53f50c34ec0cc977e +size 8011915 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a637133287ef0174df595ab1ce0deaac8c267ebc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fc184c7a34af5809d19997b47afa0eb2a6c48742584d27d24e92ab05903ce1e +size 3597982 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a31e05de328cd79c3de1f767f47b3d6506a34d3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3bb1a8f5e8c356c25ae75e5f6b21e1b92b494fc72e682429bf7506ac8485744 +size 4698367 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c97b321f206f62574afe535ba9eadfd3807652cf --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731daf880aa301ac024bea04d7a481758270b12bccf6f09cd1f7e8feffc02ec1 +size 5803058 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e79566b9b61214602f51aa76285d80d8f7271e68 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4957a60caad4cb4558f2d6d34bf64c08ab0669fffffea658ba2ff2d25eb9e38a +size 6891719 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf77978b021de967841a48c1559a8ee0b1edab6a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2514b0c73996e772204070abd3993bae0c8b2c037cf342a72620ecc8f355b4e2 +size 7969775 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f390a3eadcdb7b8f36447c13870a8e5b1be4e8e9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a42912fa579dfc0c8812492402638d02fdc584e2b3b2c5e4606537c245f52d +size 9055844 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b98ba56be9de23172bc84d6b1bed9b3ef9bef537 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b275749c5f2fc40fce4012bc512fb6b944010a8e558c0f285f14ce45cb746aee +size 4489195 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9630c91ef4c6a51a3cd2130f05ef9d6d8f22215c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:315e461be3239575cb81ec0a21ad5e95cdd9542fc288a2700831486aaf5a3f27 +size 5027837 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab079c162f7f3fa1a87e60072b6b7264d03b30e6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4445f6c3fdacf9b6bd7f191ed2db656d1dd78e8f851abfb52f79d57d4de95ec +size 6112511 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13d769ec9c11d9a8bbae191cac4edf3023b36b24 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93c1549fd22dc445c5e428da42f0cbc0cada4e8ed1f962ba25dd242c8f8e2fb +size 7198488 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2d10f2ea1b7e1069be44552c0dc26f8167b5ba1 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc73d28be4540ce02a5af98f6f4d6b078e41b407b5cd89b7ff10587c6324598 +size 8277611 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..890749bfa6141dfab7a34516d3676b22ef9ecda4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d40118565e1cbfae84eaf368a364dbde7c76813f487baf651d3db3b267941e3 +size 9372982 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e4ec63a2cd5976bc73c24c2fb6b8c004073fae8 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3fda395749068b18f8b6921c13c716d9d5cc6fb696fc4d85ad3dc789a2c9cc +size 3292341 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31bdc088ac2e92c7b892303fe38e1a02daf68ec4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ead6d59968e682815c73d56ce1da14ab6d25a44a8fd476446490819e8f65f8 +size 3909624 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81126db420d016fb9ea0afc65480ef822947bcab --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47adf7a5ca75c26abebe92b39c87f015782cf5b3135ac6544d8ec237a81c529 +size 4703690 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..058971ea582843cc72aa919418f53395635d5e10 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc9d56bee1d1d4af53d19bd6fea28cbdb08733131aa4587cc8f553894296da0 +size 5497580 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f0e23e9b8e0a74ff13b0febd13698374a269d32 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7edb8fda4ee4ede48a80559f9cd65691be22b7da1df059e42d785f00a5abf0 +size 6288434 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6fdd59d458c4eb4a28bc0ef9da6be3f67bd8a1ef --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228cc944d0f7d270180d6a6a198e48fcddf16436f21754c3f028bb574c84d2b6 +size 7085145 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6255d690ce15f5ec55e087e6fba100d859084df9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbeb3d96f669217f24eb89a0fd9051b7573bb221fd415915c54f3a8dcbea6525 +size 2891038 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7c87c034bb925f79bd343849d5b2cf79f89d73e8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d131b1cce7016d4eff5694012b7ea7b0cde088bb9f9d6efb4691eb7c05d78dfa +size 5204944 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ecefdc8f8f4c87eaa8181b2d3107a82a65e4cf4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c7323243e05c35be9e514867225f41006bfa52456d27a22baa1ef25ed8578b +size 7499254 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f69f07fc4855155c888b1c9f1c151332ae8275c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d65ddd441edcc2da1d786c4ea95d5a47f78d5ad936690a27f0c97bd71dc23dde +size 9781563 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..beb47a3436e5b91e858b233d7c7bd027633b986d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:484746c48b3e268db1164dd5e4334081a3fd849093a330505959a08a151504e4 +size 11823174 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79051018ec45945aa9f2d3262ed11a235b80a8f5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135823d841bf2a0edb36340b867e9e78d3d8d41e55362c4669f8c61c7ca8d0e2 +size 14076809 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a379c7529f51fe1d7952172ced429861f64a034 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7637835188dc00c83d172f05a12b8cb5ce2f020f1802c63c2e4bcaf55ab33bc +size 2755541 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd6d601e62289778f05e9792ab46aa62207f9f05 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f51d3f20dca7ccfc9c0f4965904018027bf159dac8180351f4992c3c1400bb45 +size 5053329 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1091df94bdc9aceb9281e06bf399f5db7ffeeebb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f5d646379db4b6ddce8b666d364643275ec25c49b6d238ff2ae0f107758f87 +size 7314290 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90021dd271cf08fc57e6f06a4094727942ee394d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226adf769559baf9f33ec96d485de23b88f5f006e3c6fbb24e0d1a38a05a4bfd +size 9569691 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe751ee56f0ae1ac2c7dc6bcf82645867cb103db --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51bf6498684ece76b16f63d25458ede452c1fec0c225ea487154c63c110e96a5 +size 11582386 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e65c4bbdbb889a6b0936bb489a1ba2bf0dca1298 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7522f96d54e111688bb506735d03c78e075f9b1f49a8159ff1b9bd1e0a5563b8 +size 13794750 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c14d8e3f400b22800f3d989f5d4ea51446691d76 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23bb7c4d85bf6779efe75cbb6b12088aed6d4d61cabc5b9cd7a134b153726fe4 +size 2780443 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d8d7a539e9cb4d6e6c792e2481fd058113581a6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea779d44add0bb7c6cecccbc0d2887b08a6a1882e4b20c6392fbe0b48e552a16 +size 5104717 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db2b03f29bc62e517a0d7b44f4d7ac2ce3bc1d02 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:aa22fca8f6e064f23741affc2ad13baf55c5819db6ea749b172dfc1be574ba34 +size 7378125 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c30542f084912ef90fb816030a4b3699d241513 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6c78a42622f6ffd0f39cafd481f5eacf00d403b9d7c39ede999f9339e0ee028 +size 9646595 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d8bb1cf3e0e36e17dd479da5df527d082ee41df --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1310c710a2e439859274b6b6706bc09581aa516187b9de97a03ff7d00fdd33e1 +size 11671604 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3facb3a0a8c5769d48c51fac4463369eeeca0318 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e8e283ec04e30c7987b677f1f9a0b486de23ad833d0911d74c6a13a50dc9eb4 +size 13897505 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f6ba88a1d11d987e67192e1aa5fb166430ebea05 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dca92e88a781c842bab9dd4e80ce85cabc73af2f58adb59e9369fd6265c5b97 +size 2816774 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae39e2dd352dd795877a58ef4cc3d92dbd1522d0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e307f55cab2624f233242d3fc88626c4f0d9d2ccc6ef8fcb98372214c5b2c44 +size 5093542 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3c86ede937c49bcfe3f5f7adb942d3e993893c7 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170287cce95f77fddbc7f5199a99d1dba8d6c66629bd34c6843ef7d997356ff1 +size 7365535 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcf43027f3bfb0c967bebfc917c20249467131ed --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:f9348f09eeb38bb378dfe8c3f040b53788df34aabb82520d99d157c27886377f +size 9632422 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c435f3664f4ec945ef145afacfcea654f59fd85f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298955c4288ea72baf9f3cc53ed9e231cf9eb69059f18bac6e66acf30f74b747 +size 11658075 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4a8af0aed408cd0e9167bd9727350e6872dccdd --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d4c950ff18c65859aae6568f864bddd088d1d818851210aaeb73a3528f0b1f2 +size 13882385 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8ccd42633d1643b5b4c77128f95e7a2302649be --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1988fd47fa6ce15fd7b798b57026ba2abec3d7d8968edbcae6f53408f4e56ffa +size 2855574 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81b4d803dd0d715c7946d4a26c0176d08c0d5a0a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a14520cd6766eb3942bc6c71732e6152f4a96e3efd726d662b24610fdacaa72 +size 5159537 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..795d1c0b998926ea4b237a13bdfc6e5d8666cee6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9616d75a0c0b5f31c7f7bf02825d902524188380149769984f5e5cec2314c22e +size 7452135 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..804705249c0fdefbe56fe19306d2e5b9bc030614 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc88025f4a882f03b1e393f24d1f0c6559af0a3977f010501e96fb42a3e5f43c +size 9730789 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..9565c3d17ab2dfae3dbd7578695aa50c5ac4555f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e482d76181860e6c0cdcda9855acdf69a8cb376d762c28f7be132ec38c5d8e8 +size 11770149 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3115785808814c9131d14bf7a88bf51e2893ea61 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86dc7c4a55fe91892c9d5542b87ff7c9e57e6f8cbf69d732e1a4f378c87b3ba2 +size 14018625 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7855632c06ec7a03daafa5d93b4e5fe0eb927a2c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a305af3ae0631c345b0c8dffee3f52033266cab9ee92e520b27e363c9a18621c +size 2832248 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc5bae996a436e98b2b40fbf9b675c7b5fa4ca5d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e46d6dc3213696ddcb28fe173973db3789af29196ce971cc72598033b6027f +size 3305247 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe33d9d147a05e518676a1310bd0d428dc6ded49 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e0f7fdbd49fba794cf092157712106365d1726bfab5f8c96c643145d2534ef +size 3302156 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..760f547b92ca78628a7e669cc105ba4cefa1686d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a5039efdb093862595bd367ed870eb2b4b9b1ed3ead6970f3009bf32098a2a +size 3963282 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a920d885745f93ee5e0c239a6a72c2a5acceab3 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8323998a9c6ad61308c06567fbb53b95d79a4736d199b3b0552157219923b82 +size 4659410 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_5.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7019f20c9a5c63d2928bb202fbe73bf454c78dea --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d60fec289df6a7a2ee6d09990f7767f5743266165a0c5880f986e0085d2758e6 +size 5340030 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c482c3df9decd346f5fc00abccee2b98a02e3ee --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97bf18b2f511bd1d461a4f115da409b0aa8d1d71c9f6a47fc844359cf871a4c1 +size 2900909 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..80fb64dbddab7d9e2585b063af77cadbace004af --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9308073581f3173871f49cd5a82f8d2015b6019e9127df6121ce20fef21e5c66 +size 3683911 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d82574e127606ff0ffb2e65f1b8f2970bf67cbb5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:318a21d34e8699ce7aece4b9cdab14788ffcb2c59c4f63b2122614244b8b4e7a +size 4458372 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ccb1be0facae9d32fb9d60d6ebcf36bde1b5313 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb7812c1621cfc84af8a399d42331f12cb24ee482be4d04b53e00c9084e63ab +size 5254516 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..35b6d8401782b87aa246c9a0b0895906515d8f4b --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:194d19d77372e7b32bfef529a6959714325cb272829654ed96026d62a215835b +size 6040677 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05745d86ad9f62fe567f9a9e948c406dc85d5d08 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fabd662dc4a5aacb01cd7ce0b16541f445e8a8d1c2a0cccc175074f3ebbd3940 +size 3055519 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d148e1a056585c6e6c7aa4c45e093ec06ed144e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a04c045d5140f68a8f921576058b7be1fd5b311dd3f9e7aca8fdb95d7c725a28 +size 3292757 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..632485ab32f388b8df7ff11a12a3b405b3ee1703 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa9eb2fd99e816c0f89305e7079a52de790a30cf19cad574821884e64ed42a9 +size 3525329 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae474d2ede77efcaa927dab727336389996f3556 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c228089b2cbd72c0aaace23fae5bca8dd3e44cf971fbf8fbf25857d9159d4a +size 3789411 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee143013df61fb9ecb54cf473a9b71ca8c1ccd47 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e201e2ccbfe4b2e3abf4e8af990822fa14ca6411624e907e52a6b93796bebdb +size 4064245 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8aa7df61cb09b03ee6de336c95bda91f21992a07 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164658a656e4611b40f7fdedca97dc657d3931be34a721c48d6824bb47b947ae +size 4348562 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db38eeae18cb274507f443e92f90086d85bf6131 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fe6d1ec7b37bae5b3662ab78623e555447052a034a47a8948c3965986ff33ce +size 1864129 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad6d499d340f9a3235c31790bacf2ba222aa0bdc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b327b1f072c749dd4df0b6402f4d7d6d7048974070789d65fbb951575ee9c8 +size 2557203 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1425144622d2f9550d373cbc359ca324e1669acc --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73796be9b8f9c6f9d99d9daeefda8a2aa603abde0529e2198faa7cf6f2c7439d +size 3248305 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fc05075159e06430b66fa4777c1faa4f5cce256 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139c22a829063e50453e8b6d54615a770af366fc1abc2db8edec6fe3759b4882 +size 3930866 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d6c707b3031a85a0251cb96a33b748a2e5ef6904 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc4237166d2a4ad8a70d238d2e88f9fa3ff6651fc36a56ec272db7cc59d6c891 +size 4635110 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bcafc1cc6a2f427c9c91bccf87743e5395f13c60 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d697a5bbefe522b5cf568d34bb3f27a52fdce2a001d07ede8644b0eaada5862 +size 5329371 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..681cd21ee52435775982b1a93c159fa277d301f9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:193ea2b94ac5bcac29dc69830666d812a0dd50ad8529688e20330becef2bf449 +size 2257351 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_1.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e213e64750b58706d472ccdd878b8a20e946e7a7 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a39fecca98ad205dacb71766840e83f3c31d0bf70527c229f5a675031c59578 +size 3016866 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1cc2e6787c3c4367ce4eaafa132e9b92909e4bf8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18fcb229eade251fac9b90daff66dd78a3598779d2bc0ea26d58e23d50e190e4 +size 3773512 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..691e88bf5a32680d575398a62fdfcc0575bd990b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433146c464ef38adf5917a76cfca8eba2c07c079f59e988d943b724b54e1611d +size 4518731 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..837313ee75fc3739b3bb7279fd4cc8e7a6dc5c46 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59da01a16d82533791bde3d9cbacdf8ecc27fbc6176a18521006b3cc19cb839d +size 5293278 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08e3c111a39180fbb994acac3ddd8cd03bed6b77 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c31d80d131d1e8e269b8235241b5898f9f30f6f3b8fe161910e2e3bef484b227 +size 6054918 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d359e92d4ebc5ae6b62b4221b79791c11e0ac76 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43a591a72764422b4c2f718c31e1fbc5adfbeaafdd267de61d94e9a2b695bf47 +size 639955 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a285d68d376d3bbe9f5f0beb3661eefe70a78614 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1a1a87e98ebd9ffcc71a2279dcacdbca463996703d1b976d9920e19f78412d14 +size 755099 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..453c631615e662aa90d2be65f68d58c485fc4484 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4adb70ed336bfcf6ffa475e0ce471a62a6f4470df66cb2284744ed6239e7188 +size 871346 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c4ded3f22f8f48841f54c68683fb841d6d15c7a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d551145a0f2b247412002ce9efc2bd058d2a9572df137bab2ec867f150f30cb8 +size 985755 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a6d43bbf13d42a1111cf66e7342f36b88ec6adf --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e3a247a2e3502f9b8fbff52597e19d6c54310c0c47f3163160601468e254ab +size 1098496 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98181f03e3aa50ec9bc4fe86e418d611f74ef55b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3fc9fb6146f6373124212523bd3c191423002b04a12b0eaabe90d7c2f07c73 +size 1213673 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d1d9858ba4572c032fec7a7e8222d13b221b602d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5890ffc325e890a98e12f8a8a6c6a6925194460e6dd7c640bda527f18e0b104d +size 1182551 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ab4ce9d21f07665ea25f4b31c0892ce013088be --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded2a67727ddd6b7d98ea7b3306650f21f86a6c7b078da3e66d24534ae82d53d +size 1779374 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..771251054fde859742752ace4f08fce664b7d566 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2e4b6fb0bba139a25b3f541b65681a3d0447343b2db9b61056a6539c8c7dee +size 2388553 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc71288558fea2e335cefd8db487b673f16b46ea --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff6ac7b0cfcc30b6ce02ab2434fcc974b42038e5bf9d22c7903f07279751e88 +size 2973503 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f4fabadee20bbbd363c2987e531da8cc14e2198 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dde366ad22eecaeddb1ca15bb197fbf4a7635ae4fd02296ec2e9bbb9ded7c5b +size 3555828 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..323edbcfa125c86eabba7a66a248e39c1303383a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce71c7d83acffa0c39ad25151cf34ecce38e5dfc504125035d7481e5cdb09d1 +size 4144692 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cf9a78ffec3b0ef2d19b53d539603991ea8d546 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f71d19839d501b6ae9e97c1eadd94ee9eecad43cd20d8af861a63c2d23aaa6e +size 1328813 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e15cfb9f463be10ccccc966575a4ff104cffead6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939b2ccc7757694ed15c19e19b47d2d9c2e380242339f6702540499cc82ea9a1 +size 1524345 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68891d6a846ca2daff4f8d8dc00d170beb409d8e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac3a47c9f06c03ef681ce2ed41a63acb489e1c9864154acfe3bc54e62168cec +size 1720553 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..bf33c6917b02e1a7ee7afdf5276f2ec4611df378 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0142b7c7bc0837a1d92c1c69194e3414fa9696faa0c4a793e1d09622092638f6 +size 1915676 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..607c83ca9c270f0dc08eea846094e854d638fb41 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ec3dbadf159b4a4662ec770ca6743bfec84f70374d306aefac960a17b0895b +size 2107380 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4db8b992b04991106501128569b147a3416082e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be3a380281e1b9e28c0c5d4771215aa582157717a27633d2962533bbd93e4590 +size 2300921 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29883a1895517e4f135ed5d854247b187eec8352 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64ba1c019579b7e33c8643e5788b3befef1f5fa829985232b1b324269c757966 +size 1935127 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c72da3dc8f2e29d615cf9374bc5cf5c206e9323 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f93a04a94a34ace5ee48606584eec4c45dd22d38ed2f3f0cf7009d5a56872c86 +size 2636241 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d6243b0fcca853e80389a6cccb86a82e12e3b32b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5058e311ecd457f38ca68c46c5123c3a4a2a0823b1008cab2a4322b221a1ef63 +size 3349441 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a31a9a0add373f8d7563efc39ac2cf025a09cb29 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fc97b9fe4a9e1e4b5e26244c0146b87a23943bef036cc63bc2af5c45a9716cdf +size 4039092 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dde8b9b0bc4a94a417ff439110b5c5b03aa9dc46 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d827271e9511dcf36e0954bd56057a52db04d55e1b77654b77acd025ee117002 +size 4724589 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63a8850a1c91105b72ce5c27101d1e460067b3c5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e849bcd288b74146041b535d5e8b1abbbfebc58b6ffdd9436e57d4d77f6e631e +size 5415652 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26666e1b0a83da89ad630c129a77b94af3ff8f1d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ec4da3ca49f80226381fc5e1ab9209678188aa71a16eb3e2281c5acaeefca9d +size 1870118 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1109d3933657c362a9a976e89abb2849f08b1ff5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d554e2c16acc2608b80f93b475b89c2c16ad1aeecd6df228c592f8ccaf20ad +size 2545352 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41f1278c0731ac870adb1e5f1d83d33ca0260bd6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58f193f06b119072284149ab5056f09aacdcddbc6c1d7a118cd8ed53c31eeec9 +size 3232474 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0d3c0b6f29092f8d089affc371d46ad0fef941e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c663ede786183eac0bc8f4d562c33755d4170f67c88792cdcfac2f163e5bbe6f +size 3896165 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d49510d71b883fec5a5f079810bfc3b2f20a1447 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4b1c2f0243e8e7dd8eb1a4d4f54e687bca042ebef62e6e23636c8f793970e2d3 +size 4555577 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aabfba4282effb209e5cc272fd7034af6ed5bdf1 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12c6807404c176fcc1cb66e9d62157cb873c1b0ead8b0ca81660455c712ade5a +size 5220688 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c37b8e4b7dd3f47217bf6276a0be95108eb3bf32 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe718ba937293b2f0be75d9c61672fa0db08d5b4a520260322b60bc709be062 +size 2213817 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c10ff43b56d46ae715f0396ec0c2390c5247ae8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d564449191f49bb07264014b77e797f905d44addbf7e9335fbd36828bbb241d +size 2974263 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..647311e15ad5dd683d32520ba3feaca18a5e5b37 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1e63d87770b859bacf7039f3faa5ec1127bd725455a074d9d8963fe9bd0c00 +size 3726446 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1593977abd935c55af678a76a3629ffb62faec5e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4260abe8daebfa2ada0a087c6c5a5cef26d12361d54ca823b0d2792179175826 +size 4477808 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..141622e0756fd24308d53e747c06e16a19e92426 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7e57a5965c7c274b5d5bf5b40a57bdae3ba43bf8994389c13a631b3621a6fb +size 5229417 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..3e9008158616d09390205458aa8e993f2c53c1da --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9601a6a20a681db0122236956b5b7d8195aec1be35a378f46a9f7b2f61741ff2 +size 5980025 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27a705bcdc59c6f6256b818d645d9d35f0e6a6e5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64ce43bf64259b0f21fdca27b2e9899ba595c630ac61eccd14d3923a8bf12271 +size 2360974 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..063c6fb52be4a19b2ad0a6c1d7e932f15a162b1e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bc51238543c33aa2a87bc345c9284f877c826de375e72757c977a554aeeca39 +size 3193105 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d831c7c9d0793f6ee277db02ad3e01ca22e3163 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08eb92500b8e38a556ff81be36be40c62c14a17f69f31ba064b45e5a5eba5424 +size 4016578 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb8c6d9304589f01a76677c1ef712328a27bd965 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3394cb4c92caa155bc59aa5347fe26c4820c725556d2f409a3c37f72edd6fb2 +size 4838737 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecb89bb0bf02c332e0bbdbf16594adf9b28666a4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56a5e1ebcb42d52d8ac0d66761fdfe9d08f663210b1bb4cfcba2f1514499eae2 +size 5661559 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..324a65597bf296c4c9b2b89f5c7a461ef55b7c9c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8f4c3b7f0804a36b6963ff393182cfb806054a3e515d62debdd4bf1d9d3e6af7 +size 6483464 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab3da4eb6590231fa2424c6e9b7f20775139f888 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3c0561b4fcdd921c92d07b199cf2f971a3f2aea69fdc7f6f954c7df1e0a382a +size 1877766 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e4fcb5449a1475d767224452b6a3e1369900367 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:998b658db9e0f7e609a73235d5d413a3725714305eb0a49d5a7fe9e0b25c5414 +size 2434491 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c53e7f9d2c0778cea5fb905e5e3274e017400b0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64fadad47c2fe1f392d6d91a2798a88db417e95c816a66a9d9a407451f5034ae +size 2985777 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ed1c4cc96cbb200b5529a23cb483bf8325d9559 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1b95f340caaae52d9e2d63a8e58bff80a9ce8cd9e34fc2cabb0426bb6509d2 +size 3534603 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eed855c0d425bde5251f8dd753dbc44756b66557 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c50875142e1d51e68e5c2002cc496e4dbb0a65b26358c30ed241f97720934e +size 4082785 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e3a8d61c193f498d52d1fc90d0c8da55ee3f7de --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4acfb916ff183dd89d6b92a51e0f83128a994dc1a3d01da7ff594d2179e3304d +size 4629908 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..a2785ff9b2ee3b8e9ccccf740009ff1a1bc4f8c9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bed09fc93f847022562440d8046f3d37d7b436a9710e4edff2749d7575095a4 +size 2371065 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e3f4d9e6bb977f44afeda4086b2a2c3d8119ad9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3778c768f8c9e974b3192705626c8254807116be39eac4306bb4d7188e794a75 +size 3210437 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ee4d2f91396614faa3bc7fcdd4ee3778c89d24b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:049f3f6ce8121b1dd32cb6c925c042fd4cae12f09922c6f0d6f945120ed09513 +size 4041042 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01784d60335ccc6bf75655fa686da12e116b3809 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db61bd0d119456b43b454a970544ea0628641dacc2d55b58b15ca061a49bf96b +size 4871155 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b4bc42a57b99f200f427a48cb79c19dd962d3b5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf189448da6537a287d603757366588082a5cb07055f6f68eedb7b38fda6512 +size 5701103 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c1a55af9fe3a5c34541d7eacb5159bcbcea2a20 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d80f4b6b522220ed1f84aeed93eeac7b3cd9bddfb7155044e92df5ed6298b14 +size 6530377 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..306d20b0eb0e78612e69ce535af8ee75270f1ba4 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cddb49bb80c8ebd8266a0345b91ddf7c3ca1f6d1652e4b7ec9d4661e8e513010 +size 2343631 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2cb40ecbad1a217e4aff38f81098921f942d67f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:526f56bd65b2f84ab4802acbe9a318ae530eb434209da0fffabba8223babc686 +size 3157810 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..200f9611f8126ab30903a7489c6f90793bb6083d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9357b132117276ffb5ffd0734fe0524cfff503581dc7d3f1c3baa4730608c1 +size 3962265 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..578c3872e73bfdc88615ab610eabf825091a591b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca477a875adb79f54266105edab158569dddb1224238120177ee52a372e89218 +size 4765611 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e921f670e997599847ec65c3190dbcca52b31c80 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11de6ed359eafc9953c8a6a705194dd30212692b93fb336b8e818f090be53a66 +size 5570179 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebc80a2052da969f01ac0e7a4257cc48d7d18d5e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb1aa822010ce02ddeb9a8a4c0489885b7c5d93fe7869e42b58cde5fd072088 +size 6373179 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d46a3f23d44119890146b92944d8162192a6df06 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a5318b256db5447bd49d17f5e7608f9d228507ac688745067d3ee84566e512a +size 250578 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95e70b4e19acc25d4fa0153712684976f85bf92a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:345476b9a374b32f3e4bc3a740eb6b57c3d799a431e017968d0fa448ddb5e2e8 +size 351355 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3b53019a946acf725d1779270fa82bbd6aa344d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2598746f9fa3e1a2f64e7b21686d877d6e296aedb06ed51836934fdb63b62fad +size 449623 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e06e427b5d6465f724b7b9d21e2237a9d11c378a --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7438c8e9e65bc19e3adc15d76d6b12988a7a5cb1ce25beb0f0f7097d66d3ab73 +size 552972 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4f8a1b8aad6b4e8b43ce8b3c3d06494b091f520 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ebce07d498fcb9cba1581791fa144fdb227af62b69c5121f598e0c95ea72b2f +size 651585 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7236d426fb880c2dfa680b19cbaeaedf987479d0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033a974333bc1f387a6cf7e95a12d7209c3daf42cc4c597693c266accb38dbeb +size 747484 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8f621713d364899e118a60a07e73a3c6ab95e7c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a99a120db128a23637613567ad7b8cc101641da8c594f13e195f24a42e1775 +size 292891 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf9b68c13ca295abf3b61f9947d1fef1e3e2f15f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07e5cc538a118e5a64642bb47968f44eb601b8001161d26a10e7b08ba3cb0440 +size 535369 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..213181e6756bfa943989a2c2b67a7d5493cc482b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1a22f680deb35679ccd5cb71a31964fed2697bec1f50cc72f1159366e24c6f3 +size 660381 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ccd9dc3d956a93bc1039059e7124552b62484f99 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686cf3dafe7d82b3e832c76969742b1c170f65a27b73a6eda525cd7a7d57b60c +size 780611 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d6575e4ebb2cd628e6f70bdedbc6bb15d05f128 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a84e84efffa729e3eb9e0fd6142f0842b40b17911b6b31487ce9e69eba78715a +size 898118 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8dff3076597619e2f0503e9ab5f8e97bb2a7cdb2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b4984d724f5614b75ad267b58c55e29e650ea78f324c5e626c5ee9bc2997687 +size 258296 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f57ba5b8f26dbdd80b217dec1e99b82a3a958d57 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88a5143038a055e4d43d3563d9c99a05f0b32de93ed41e53f129a482bc65426d +size 363049 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bca84beedb32a50129de59cdaa7d73970f98380e --- 
/dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf5e769b794b8eb9009582190fa48967ece95dba43cdc9aa7ef999301c575b6 +size 465291 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..227005dbb0e6e002aa551f7c6be9c0d8a7fba783 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e3f9459b1fb42b1cd65af628d398adb75328068115cf86d0631e8eccde0cd2 +size 572567 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..552006385bbdc13bf99e1367bc7ec2c2b0532427 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dec28b20f2e748a96b840e9994b227e9ec58c0a8bb57cafe0a29ff6ea73069d +size 675065 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0613a2b960dcac79ab4a911edcab210e93bc1925 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bae5f79f452c63e547bc93610dd9bc373b34d1602427d1121f31e624b150b37 +size 774847 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2712d22a051401fa5eea1f99cd09132ea4408d28 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5328d5fcdb36a56e9501473f38b127573389c474c8c889bd160c4b82925a19c +size 261133 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d08c2e0cb7c3dd215cfc613e32e2d56d410651c6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d2571a704da97a6df27ff121b35ba9efb99523d4ad82c32e4ef1c12dc65809 +size 471937 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_3.jsonl 
b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8bb9661e1477970782569eedb1dd0a1e2ba4e5a7 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b6130984c970f54504b313a8929fe8aec82650ad0a69b254dfe8bcb2ec5956c +size 581159 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1117b864c94a0c22c17fa84737701b4db2e8295 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a036802ebdecd0d163ffacca68022515e0a48af913da3e198cc2735c41c89719 +size 685595 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7245873e18088cd194df2bc8a8db05f1292f697 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5aec8a42ea52823f2db6892503a50abc1448c5a7b703f1de78d8fa0dfcd2efa +size 787325 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1667055f670a01cb51ed20a604d8072dfb03333 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b2dab39f05f43a8a838b2929c25e7f40eca0726913deca6da060060e6be8a06 +size 262323 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..752397159b3d71b1b48484fe06ff39be4927dcd9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e7ae2885fc5af864be466627f1fe4080200ae304338816c869ed943bbc8bb29 +size 369691 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42cf99d46e55d9fc152bb7f82bacde29cde1d56b --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd8e2ffd29fac95941fd777980876cbe9b0de04b811870c0297641bea87b7316 +size 474717 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63801ad16525b45c94a0b3aac2a48a6ff9b39c6d --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab2ef881968f096f39be0b006e93acd42f882264b60805da9cab4523dfde89e5 +size 584771 diff 
--git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb97bf84e6b65a821917d9a176dfffa52d363abb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd33c1b905405b2fb5013f312b0576eda61803e265776a47e900a3e677f8c92 +size 690044 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..895143d4bff0208e532ea1a0aaa870cc9d1bbbba --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2f9d4e14f6b690fac2786e2f9d9b813f7b084ba840a145442e150b533c5a72 +size 792597 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56223036d596eec0fa372b4b22636b0ac68fe634 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d53cb4bd68b9916372d9b0e762ce79fd91024e923a1e4e9df3f67fb623db76 +size 1039276 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..101426f1184a8a6650f46d32676bf6c5df361617 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06261bf3edd4b004fa3de0910b58d01e14eb52b881bb422f2e39ed0e3e7fce70 +size 1300223 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86baaa3bac62123dfc8b9f9ff13fde7c638d9d2c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee974532c84d2ed332e0a8efc2819f60330d559551250899e353e619caa277cd +size 1561278 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e9c09747eacb5c7ef7f54c50c51209cef9e0aed6 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a30a49c79046d5b528db48293e9f8c82728ee4fffdff7e7ffbafb57a7259cee +size 1822783 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a55f9067e62e612f057bf8e11805d6debaedfc5 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b890d807f03b1fc5715ed058d5ca7ee57e636175c16e8517bcfee58f7b0490fd +size 2083050 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69fb5870ea36f0708c7c5b767b31329c59c6e4c0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24873005e2312feb69c034e2c982bbfe3f4f2e9d10051607eec9c38d6337d9af +size 2343729 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7fbe78b2be785e1a0c49a293b3f10f0bd00e8da --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce76b039949af18917bace98d168a151719d05294e197340564ac88019435a2 +size 948113 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..012b278642f0fbcce4abf2b91bbd97a6807f3ffb --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd29f790fae0f3c3af4bb24e5c9ebf6a00e5b46170954bba2d51bbac90e07ad3 +size 1180479 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2a24fb5e7a412ecf5da90f2a9711ec85e4ab326 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a30076eaf8dfa354ed1a0fb3995685a95805dbca07cead441777dbce805c84f +size 1413006 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0197287aa1e316b0b5040b4141ef82908386325 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a593145b0f819eef9215e7c5868e70ce90b58c6e84c1ac172f54b4976849350e +size 1645858 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0897a787e1f6b65a8972d8bc1871343aa4419513 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc45623c3ea082f63cf872d4c177971ec7e1ae45bbd7e90db706aa5a9f692fd1 +size 1877769 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89224acf30238691e04ddfa0eda678661faca852 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5077f132fe1d3cbceea8708b0df753ff41d36c80d13915a1f7c879da6b07430a +size 2109877 diff --git 
a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..823176d6305595b046920cf2b20e5059a3a908a9 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b51224f7161f6d5653f1c9b877b77e8ff3abfc4edd4b0dd129dea8123343e41 +size 1009991 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c5cd37df611f80e0db2f5cd6daa8dbef1174d0c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e63e537ccd4960cae8eea55c599808a49ed17bbf735af030fc8df2c685035b58 +size 1243165 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15bedbf2ad9e20596b3dc93d6cd913a7b755ee9c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f133b8400563c319182f3134aa41644251cff2da75d100d6e3c6031daef5d9f5 +size 1476323 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfada4bf1d5362c8fc7c4046a9bb967a07c0ffa8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fffcea2486ae5f620dbbb96f73271c73c43e2a6e8239c60e522e1c91dc27e9e4 +size 1709905 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..288ed1c849bce81a640debda3712f94a42f5545e --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55340f21c3eeb91ab699fc7b7449a914759d564eca42421b5fe87542929606e5 +size 1942332 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e39801bbe1397092de8df4c2b3cec7c3b9cb461 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81190f1e785df0cd1bf6101cc28ea98331921c16e31aef45e4a254a201951281 +size 2175191 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0af9abe92898f120b8327e96288a3b8e6dc74c6 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b69d4e6771f0e7d82f658e9862075c1e7319f9d4c1804a691dabaa5cde4e08de +size 969402 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38ba8c2a152a5cc6e266e9be19c2841ccedd82ee --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd2b8315f89cc692ae6dcee75761bd4574defe71582d32f8101490c8e9d02b1 +size 1205029 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2dd0ca858feaedcd70c087edf3be1a8bb45c140 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00902e473c0e6d073743bf7dc150397747d255f18e4b16c9312683702ecc0c65 +size 1440812 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5163fd1a4b08c113b5fa89b130f1a58b84d753c --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f4d916d0e2baad113e55ba21ab1f6e9d91afa0d5030fb5b08259a95f720f569 +size 1676974 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78a012a8d943002b28660d554100824d3f49ce7f --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:000d18828a25356c22e6cd6839438349fcea085e019754ef2b47100aa238282c +size 1911944 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e561d110907e190eddd718045e9754f45c8c75b8 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9a8252172309473b7a1bbd1d2994b8fd27dac923461db34ed18603fc66f977 +size 2147307 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_0.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74e09ce9ff8d737ce0727e9d34acb12039c36ee2 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd32ff99e5f064fb26104e362364d2709eab52e6caaff2dd4ade80dd82f106d +size 1016476 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_1.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33b2f4a01e82410988d3618c3db7e082b423a012 --- /dev/null +++ 
b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3342962168206f6e2d7d377b25f5943745045abdb4093baaf8dd2459a0fa7c13 +size 1257111 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_2.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9443c902dd6428d74a5ca6345ed54081a99e0ff --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a7a4febd5793f25c73f3352fa494b734a79b9503de40214e0f801fa72509691 +size 1497859 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_3.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78933641dea1fee3d49e119c96cd36cc69425d27 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:248ca467f1e311f68e8f7a96331b5c397537c1d428ce4e36642db730d208f5b9 +size 1739054 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_4.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..041a43a6a61e99ca9be8ae7697a77dc74e7846b0 --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd35f1ae537e4d43ec6a0a04af2dfc02d83d8f8cca2d6a2da3af2d522b0711c4 +size 1979137 diff --git a/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_5.jsonl b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3ba7e29cc9e088b63e3e8e04a2597e169b64dad --- /dev/null +++ b/4b284b28bc4/eval/examples.4b284b28bc4_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4001c3bc5cfdec6d3c8a118c971e620871e4714db20560666c59a16433e62bca +size 2219581 diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a02cdeb2d0fb2e83cfec200ea2a861e16a83658d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.2871925225988394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02880794237734816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07031750338322859, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015362201736309874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3012155634284117, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004546846231718025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10707093959955763, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019099253416430774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03252619427180376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009041180535715348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14439430798437106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0030595449553106713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.049917192299013896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012014538250113653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06757287768017098, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001414062975537199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2931917692240097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044525581749204875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10327080072990603, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017814345648663893 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06720725044881726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014407633141713585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2882066628071488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0042625595600572705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10238598559666155, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001793711398293261 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..49c23c28bb02078cc90f234ddd7a4b49c80e6ad1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4674006237665374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03675015156688127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07708396436923028, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001529013971135644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36235242098066, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00504305962540817 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11843411039548267, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001895876971489225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035725346847754684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008827277116185362 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17866943863024684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003599106083841207 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05553061893758205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012200850248834274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07294152863590639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014064426998360536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3437168036630356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004711682138202734 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11228525846628372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017420331023114827 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07337404630465882, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014496612765965426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3436003452114997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004616609203187745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11267135213673385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017806522868172628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63106064157d5ac66ca23beaaf4e88de18979ac2 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5250778439407279, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.025625496064299234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0786831611862841, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014969718758498687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3956998500897082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005131459404981971 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1222441134481138, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019020155784340502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03682232020364324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009265440871287532 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19779728011829195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038163400420330356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.057331612844470456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001226447389142751 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07302274174131225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001338647584797816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36618568072469376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004622159858156629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11353522248821303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001703018837429438 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07488527729009754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001411146414679001 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3754608054763451, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004798151749531035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11628410210009521, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017922853862293046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..07e9956ca7a2cb31170aa39b45a144fe6a0e2847 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6240971401779115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03840020245332954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07946921950052364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015734983149428565 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.41443709705557313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0052063482293464285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12413425365513392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018801730074252724 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03727374629297188, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010068745541143666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20933728572588564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0039729036741121115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05836966723015618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012404189838209753 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07340984582992345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014110667213403662 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3803567082991153, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004624202286888448 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11463438747216832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016925693943895671 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07570628299965847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015124490142500056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.39213432148715394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004828378800565317 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11802649327609507, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017907209912884955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fd84e71fa6d914f037f3487bf93a3f1668708829 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6460958847523566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03922360003785139 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07760008602513783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001320690166501572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4207626130570328, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0051343858175420766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12269659017119199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017597424238451932 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03635274690781486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008330185169416092 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2129557453468375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003851289229933982 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0577700863367864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001158809400479912 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07107029230772488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001171387599016451 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.38200909854824977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004464667845112728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11231459598044416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015694568258752962 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07356474812940257, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012438027129333135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.39729543398826606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004755911479219925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11625392090888889, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016649015391818642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2ec52c119850d2885b1115f3a8d8c4b9892b4949 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7283147266727299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03506630990313516 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0800529618846568, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014030415087213262 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4360056147654019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005237161872126708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12670702220068714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018353031821599214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.037349148159645094, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000853308549188945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2217598521275178, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003994885901439153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05951196634046783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011799130523628795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07255538123365188, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012004887370386942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.39357574348180674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.004547413362244599 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11485528722019342, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001589365738462336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07578798351325475, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012955589973183616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.4104972172441367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0048123434799543785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11989099231027077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017190774588667075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f7854d8a82f7dd85d80178cca341ed8206c216a0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.09435580335073716, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002596237406338737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.1516877183797435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002163617861294294 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.08821581167136193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020280559575042486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.0053159784831301035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00043010712261271663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.009522281054005119, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010761109699630326 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.00537375341399136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004491934033010578 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.07882470114338755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020788842929298805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.13940682930714066, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019495456097635088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.07551459506478578, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016129890419721987 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.0777715925714806, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002284643122449491 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.11130068960104114, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0018656577889789815 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.06986508795300185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017500362429631329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.16810516169972559, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03835304479839481 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e2d01f3ac55c0ab8d302706f1c4535c48474a95 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.32500435790610316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006219134826883584 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.46243909963396884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005727997203394979 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.31536194630864184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0048986014551623686 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rouge2_precision": 0.15832835758132308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004379479319821069 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.21989028843694963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00443907246545659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.1493719195270224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0034781292314063805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.25838083394802175, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005195219904829289 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.3797932387763246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004882479498294952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.24943460339721188, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003903652479227547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.28371843954863074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00558326351046381 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4042180845615538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005247552299585261 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.2742602935438785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004363579640120447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 2.6466191277757654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.15100935076720456 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..071bf50e8cf4ffb283be3a97975503feb1ac559b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 
0.616964955469801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00592579769672606 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5254416101366304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004868870996231356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5130705309379091, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004347345042742489 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3649050910641516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005264819050488393 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3044911557323109, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00442897115331052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.29741471013327875, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004142260867688268 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5063613364405887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005565487436877947 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4323292288126833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004568270840466541 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.41944130358080134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004138185147643418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5436203304123873, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005624136142894095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4628187968326305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004668999589191003 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4498297629937119, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004109002891379284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 7.5083947202338726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.24848690178409746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..09901a306e19b144df2eb400d371b9f59eeb1b3e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6407710071562776, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005336259744860889 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5360462579768369, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004707261284923473 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5412014771452126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004008094970635858 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.38414086267020836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00510054295724189 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.316436535679906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004429375693886751 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.31908027874923556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004128300468278769 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5276989567128523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005209222935383238 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.44075459787644217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045004795324398335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4439501498676375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0040343196115624315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5658065072497239, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005198931528391741 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.47242630953998244, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004540483814405851 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.47592175675183224, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003941128663060527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 9.28684502126115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5074769619297956 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ade7bd9db1d2b2e43340f18407aa1c84fee20b2c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6472264998572703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005232105513902178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5366498372396236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046402228185357716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5464856118324216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003910057721192816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.39097181878893417, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005010695619680332 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3220332631389833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004380478546023843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.32559945531385337, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004008792699742887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5313681099482047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0050483302509796105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.44300639511893825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004479689083403708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4482338075737081, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 
0.0038829126617500277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.569330259899693, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005059793660411658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4721692944626995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044818770782843265 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.47944694202870075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038182134462323098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 9.795110512629257, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.43643537944667626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..402a1d39016bd0f4762594a0704e3d0bbabe085f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6580257286903278, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005154899649610462 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5377144440002729, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004751931821359596 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5525906757014365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003963242953623838 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.4024984504821446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005105136616228292 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3256015347638036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004546014562570848 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.3330531711847648, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004179019933874763 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5454660436185705, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00507487374817911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4456656435992453, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004577146376256788 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4568474950562419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004002759515298098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5815729850402755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005083872222185427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4737226545590055, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004553659067486879 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4862086181416132, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00389366720553536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 11.239061696658661, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3211949581138453 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38d759d48e99847d8bb234e9299da5a4bdca1d12 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.13642739401224172, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019132472003375194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.02512636113552703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0007036632388253778 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.21863243644000946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002521017541213538 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.042903100190703786, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007010978773798984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.002350066060168301, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004125290354570353 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.01850848216392922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001239122346922424 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.0034599979122394713, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00029151015532393696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.024907100271538515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000690766936930914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.21742396152099233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024595518988296725 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.04253972308092736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006683881645207197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.017035317660162044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0006541825206071444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.15198121220346103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002036913011848439 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.028854953753925324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005590058998891695 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..46634bcb52a17dd37823d209ca2514e07369e0cc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.2344773114465395, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "bleu_stderr": 0.06144744886971806 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.09359962530750512, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025105917511609984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.522665803339837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004850735788452025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.14240938766090952, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026994539955292815 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.04032025283529164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014131816715617135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.23323991686673376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004275513488096086 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.06073259012334097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016141608477161177 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.0769023862248869, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018572847386138004 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4625095347149072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004342267829958794 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.11888572362932227, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001965744751513682 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.08184718121577335, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002207850839836764 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4608387006930489, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004681197366055284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.1245843999084538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024393680091218312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..76547ce181afae117a712474443b4d8a847f4baf --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.6773713320753056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.055884247961949976 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.17374754160822523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0050939685408092975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5730195532268542, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004521823239131311 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.2057962810551555, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003760812343326168 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.09140223665380956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0032890628357242144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.3074558179703969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004375074576693778 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.10509886424101751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002496931131477509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.14133526089953277, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004177683006061348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.49304734486333374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004509829618388591 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.16838171411089273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0029883563268557796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.15428774706595433, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004553930547293362 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5140723424370337, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004433969406651666 
+ }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.1825157877706738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033370782552026696 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..96872fbba97498113f0634298f742f1defa31477 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.688973732407593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0437940722424649 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.2263783160965279, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006171991779324731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5624565194412028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046331481122687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.23997327415870215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00426770453497841 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.12650521502940273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004187905052421771 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.3116380965308516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004367945574727286 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.12855310203356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002943917792986252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.18506548497150052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005182255852735757 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.47864760075928686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004626469457756582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + 
"rougeL_fmeasure": 0.19630420632254614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003553339268118638 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.20150484374278746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005543691944915573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5064505915998677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004511689198493829 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.21346988106806253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003833256489070223 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..487fbdb5ba972a1571dcc444a9ba3ef34790b0d5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.670081659519132, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04318517964945629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.2397708671243343, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0064228338007496924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5485806406380264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004720768862432214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.24712078753708633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004314381507879295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.132848179955477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00420337986829722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.30940781093399633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004367428964629067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.13327902883600057, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0029460826534397667 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.19425619955945797, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005325133026757208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.46533329832959947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004667920119273198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.20127645493389038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035809225423920244 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.21110404564884136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00569641155080923 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4921812900394637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004548259054500709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.21840163342192948, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038191209490347515 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e8ea17ed5bffd9d10bf85ec248e35e1db1c0edd3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.7150662066576599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04867244279674884 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.23129723699957813, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0062156402527426885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5532761590144792, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004718580529051603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.24413807121404715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 
0.004196903213714051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.13131503132685515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00424768261815168 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.31689123692271287, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004479228620477408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.13440771646832117, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0030479854784096305 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.19041781904289584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005317512197816564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.468350868728647, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046258788947290365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.20077123718465184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003610791227071776 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.20527402650433593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005591111023710644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.49732487082024746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004602620634284861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.21698522412439886, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003796119059919846 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7e29d01cef014878004d631a696fc6a3fdbe55e2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.013350416749489698, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00028057458287476263 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.11566374488953345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001749855882209205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.023310980636958793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000452486408355606 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.0004485308245799727, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 7.901239884774927e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.0036672786059082207, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005157087714575659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.0007679014767691473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00012819459126050034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.013285581389076958, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0002656851189258093 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.11534380952368356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017118106609243738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.02320710445968874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00043026318572661666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.011828106724862774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00022725277636703424 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.10529252247490435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001497843448633412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.020705297270136803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0003655496821761756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.009519886388116046, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0030236756925875077 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cffbdfc8fd1ab1f54f30cb15279b655c009df2a3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.18596600590889065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003645906867093087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.7000841391242195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00407563035243576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.25888123233754007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0031768498729056785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.08857846914965355, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002396373719643474 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.34773296216867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004163769887333789 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.12187611759524282, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002198007938376146 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.14607148785062837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029509545373158728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.582387391289673, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004300891719123408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.20438070771678543, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002413318992124119 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.1590760662106541, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0032411687263806696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6045971668841404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003947144030126864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.22097954689386484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0028005845900640268 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 
2.191415616124935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09680100323362864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..042533a080e8baf0f8985e53a942c1416a6fc66e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.1980304639632034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003492199137777792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.7112810316711771, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00376166113477323 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2805422037797494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0033268027640946463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.09886442937653378, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002345021679529973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.37290220171207966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004228119092633622 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.13915045372935977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0023234212020214113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.15240021506621623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002802864425131344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5764693524969581, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004032933171743271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.2169472494379018, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0025420675625343024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.17080807902531406, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_precision_stderr": 0.0031067593345175896 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6220206309306955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0037970602252338824 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.24203406785259418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0029372092841247473 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.8569076256551087, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12052575313613716 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3562560437083a75e85750543e199d1e0f768cdf --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.1933328502928586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003099530624368226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.7011350938451281, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0037491005183801886 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.27975069293703747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0031284036937637358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.09519751402031547, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020025816969127396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3723729566418955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0042697919686992426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.13862295274121508, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002135737901854583 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.1470822171154783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024663239297952433 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "non-explicit-description", + "rougeL_recall": 0.5599189483902042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004020046406729631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.21407754529660591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0023998887716300023 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.1673033495587541, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002759668112872226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6165957546031788, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0037550091734433974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.24241624159869068, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027709885961488493 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 3.009544719305422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12091914294594716 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e48382e3f3e8cda0ca0af39451ea2e33854eca28 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.18185566077745052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027072441708704876 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6878760207615766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0039399069406699195 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2677925532107245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002981957765622307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.08953701779489043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001710625628350317 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.36720908036084704, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004308436966018264 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.13292917476889163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002014006244700826 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.13720570266102783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020576199930684665 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5490721849829181, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0042054270913741984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.20379974606633744, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0022399193895611676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.15812569287913272, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0024138703089301825 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.606968571968092, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003889795075167132 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.23315369915847917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002653830698926144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.9471554297303775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09562350894285683 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c1c0a6814b7f7839b8ad233423701e3324882174 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.17429298605765095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002453866849905799 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6874234755183823, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003958252758008644 
+ }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.26156722580164604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0029559487433299535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.08498484032096375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014987715737771314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.36659589385018765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004439215725950152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.1292376475089326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0019765919490193793 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.130533611669738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001731325232954926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5475992486934416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004197239234834233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1982820935914217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002156722479988033 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.15152370925757377, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021398507103325367 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.6088523048467857, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003930409593530016 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.22805092628919627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002601257261272881 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.8828869742378753, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09032299188645451 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..41bfa15fedf1c138e389d8074cb769461eb65310 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.10655760753679919, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019077191437516404 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.12089604584843483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002329629845339565 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.06975157019283658, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009137011093224832 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.0003312213456760952, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 4.9493114517125344e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.005980525954602008, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005793862846553182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0005675109134810492, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 5.471478866388383e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.1004586755525386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017955289781684024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.1184027498174915, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023585973835660435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.06626241359825397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008570407826010054 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.1014087241228341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001861610563145598 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.10839827017927679, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002044855763678236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.06538396642694783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009096365360827909 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.010910686930411773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00030602545126491815 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..88c4718bcb03d96ddcf5937d7c290a353ed20920 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.08062063267183606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016464001390685112 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5804077018598791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004705014575654514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.13287078506554892, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017687380102785963 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.03519804726011007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009691687220744844 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.284901492168855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004424090476216274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.058019313464160664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010621558481173597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.07336501049936618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001340183479865301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.545991165694801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004636447373365752 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.12184770112424802, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014608368603050287 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.06830124095667763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001429554164098288 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 
0.5022686537167637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00445439816772978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.11269533007372474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015437562768583104 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.5758893404824672, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05552280312712737 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..47d3c64aad552106f4797dcd10478c480d059491 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.3420015606083595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00759844434394234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6045685729269148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004463808851257848 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.3340303326789718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.005770276552964151 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.19480755638535052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005308301829142384 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.33266720411338324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0045078698777430235 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.18536983565498788, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004299838463484588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.2821382753377313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0063280271310375115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.532955297379096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.004605656886036717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.27872869979256815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004804872599934385 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.298538665654306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0067989076976235135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.5279377913839256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004350056352869954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.2902240542226799, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.005189448695060585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 3.229929819791474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10809788252140991 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..010836cf2a00f4248afd132b23ef3eac335ddc42 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5037779287889935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.007188822322314131 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6013235088491181, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004702106644962896 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4624643695844432, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.005423092759830048 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.2967411625801182, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005575768515677308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.3427937666222623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0045155157665415 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.26767500932744387, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004580954162492676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.4061070236594833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.006289328700511993 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.49351648432022127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00462047959595513 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.3727832655109453, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004858552003307264 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.4402820106855099, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.006558923231150342 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.5309933184677589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004638055543942346 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4039703218874318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00498186857977818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 5.459046430007518, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.21326177813295816 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c5184ab5861441067c92d97009ef47ba8eb1fa5e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.45806774309488824, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.007318355183210045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6241468092977022, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004668663969159961 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4340669909812108, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.005456429814747237 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.26701579357766814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005503172843923408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.35329352839698064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0044599483677181355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.24713173271908043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004395624838836395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.36755677377705864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.006350405645219005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5032053457704092, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045314258456629845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.3463395333726421, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004817987549566419 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.4016479351386044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.006624569651668937 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.5561992816740847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004637830171648178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.3800941783693374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0049021744640361225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 4.768286137370153, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.17352437696704526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0af9892c986174344f71cc9c9d72e87effb249fb --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ 
-0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.4012912849159947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.007332717541607647 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.64320056686215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004496400900386177 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4004976431750891, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.005533519853929168 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.23173467180213086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005322008089541369 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.36045746541298906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004418126556266405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2256483945335245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004326347089488998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.3181899530902145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.006233483121725614 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5124106544670798, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044121171030032285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.31629484401516644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004837305063133512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.34997948174033583, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.006495798513744922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.5748853093658587, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004482155650671063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.3503718103768193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00488155342773186 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 4.0649225050991795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.14182226731160913 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d88a14d62c13cb5dc7ae36278d6c3b902d71a38b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17814038490300868, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019339544316495465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.3141329562889899, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026819443880949302 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.2117111727875298, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018438088393860322 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03845870479092204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007912707782195452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07088130315681604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015849066154849387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04599914076874335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009026810349493257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12177527865393714, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011744038036189456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.22582257646250492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020828750946991857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14701777739136027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001160350487915751 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16477812405030756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017822475337332542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.29175188388878937, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025122798019773414 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1960912017363633, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001706624400289841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.0534810991559866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0666110922750702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7935c4328013c3054350eced298af29e23102dca --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.164850593284305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002029018039576436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.28405561359995446, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002857993695751241 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.19313302430200963, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019316051118957926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03503377865570394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008530621853281363 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0623540484617142, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001548287542059019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.040864899057913275, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008899257763011835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.11907730193169616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013816606669132387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2131003193203166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.002234666822976161 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14097329346444698, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013041417812161638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.15259882161084798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001885756918977423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.26389227182331143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026742133450759734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1789179111545324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001788238357802055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.8865327529226577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04249976221266161 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fee402677c8ef90fc0c49ccf807857e7e2fa254 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17401161171600613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020881821450366707 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.29609796476117983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029221116925695636 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.20240023881581198, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019530480366369383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.039624014311144054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008900870750819028 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07078627112644985, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016722377383979908 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04643912783550571, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009627363006805538 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12735865745665292, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014154169873442247 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.225450514063261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023420645348967563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14993944604674414, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013384024265687326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16080285795577998, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019296398265542425 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2753859473796806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027696876252129017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.18742123863337812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018142650005546468 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.250874940153623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08256417332234947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ab3501a130874abbb01e2123deee9c232d6c5ba9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.15417391268004804, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023622060928554494 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.25177591335426297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003390550226163827 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 
0.1732009883115761, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022644018750802943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03643560093458038, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009248695850334793 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06270148510981541, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016134073336328298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0414745893981831, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009648670616110777 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.11525851431820534, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017153378060412703 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.1945255102820067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027243009700905294 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.13035603600788886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016318784961578442 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.1433709185752066, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021997928215724285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2347702913915573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031942105153544165 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.16116163645425682, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021129773225797956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.419872373280389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0632053559231718 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5c4af80522763ef2ad99a85573f178b85c6ab989 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.05190785653580835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020577014263843353 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.08336762757152712, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029759640480461146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.05560797193518763, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019328666401600453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.01264978957372063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006811352498881881 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.022501646103038838, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012662878267261353 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.013892451555974773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006715642301954771 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.039660746703855165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015372422364523625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.06621442775842074, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023962464441846846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.04291751591005431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014550040577974023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.04808436429768246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019131925346893503 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.07749318488885995, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002775519992790014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.05153615578362743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001791767114164613 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.5407149916214953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04531884877289389 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5b4da351b3f1e552009273be2c4984603f49dc93 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.008747426074934984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010334072389606014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.013166466939228978, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013165673788761627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.00858766610820728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008521488341725035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.001993543076039664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00032468030555408186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0037795884860438948, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005979090260362486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.002181790528509318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00029155550977350524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.006817828377343832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008550758600300307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.010456992886461796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010697198590483594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.006564407691410186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006360402903853286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.008318710239963927, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009932928437688979 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 
0.0125244294943219, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012566507407968427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.00811170366067559, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000800158042567113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 3.875581456441756e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.138882234754457e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d103b9115c1718320875702fe18a2673497b1d5d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.08751592392277548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014885876589069494 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.14641105372315416, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002277503755216182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.10170268353078239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001552854434242085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.012652600762562841, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004575918826167268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.022474938777637253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009218950292896009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.014876069868397498, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005216997847218128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.0743006100872787, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011597328400413602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1279023695193692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001964736141764416 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.08720494916711495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012395877618178168 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.0809683487363323, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013609972489967807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.13640036278318976, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021233241725256875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.09436348999877432, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014294898556339143 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.7056427126817756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.036208763595939567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bb8353363daf5523148e5085165b05f5b22ffa05 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.11704708597499369, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017208124581957302 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.192663652857006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027090117742870658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.13464216389707748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018008035510529402 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.01695970372606848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006027560326870975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.030780517480147162, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012354005199119644 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.02017226312757468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_fmeasure_stderr": 0.000714000343756126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.08787724537079063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011884855038448482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.14920400372412476, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020664312372579213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.1019122729108625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012525770397120181 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.10950085735058879, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015902682243807002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.1807018882877703, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002516295905690825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.12602710319462657, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016626827697359793 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.1951746948073685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05498403783354158 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..239005b74ee69a1852b4e0c68ea372087b6af483 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.15158367096384875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002160293878134524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.24779508708098152, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029503505282398473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.17364781357215728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020669795912559206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 
0.03517592630819259, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009362444811665309 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.05870812861597759, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015348108636689321 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.040268213401871214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009739966672053971 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.11724613742268182, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015394113402297696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1987576493460671, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002371525557855466 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.13599583814052624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001496120401269599 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.14002594651258354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002012494612958899 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.22991508782822548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027786907548584838 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.16058302443864383, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019230855651408392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.3341861618808797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07849200046628084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c3bc3a522a220decfc7458947f950839f86b0ebc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.14222703095938236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002476470176258003 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.21389332067876604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003224457836713378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.15258466894138525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002231238932667558 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.03453726675386056, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001111852850451432 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.053000124965507024, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015287742053960468 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.03686577574547315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000986211178727786 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.11196872227962629, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018972303986128753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.17347582553687865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026374790749814484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.12093827771832451, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016818007172860084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.13231796413662955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002328686040271986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.1992898897205879, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003033843371726955 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14180007256699717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020827290155971973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.5938240082605772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1039770016299349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8dc5b6bba6d2d82b48630efa638013eb51098fe0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.04866014150454847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020483076366846933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.07060422651814766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002660644139013408 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.04871798574935136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017935599450487872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.012614536951186476, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009004920229567338 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.01860558990495083, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010812577940661158 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.012358132206090394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006693685268812792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.039145972316534536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001664987060904554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.058137528796694114, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002223198690894717 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.039216854579977777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001419722439788474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.04530693688620845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001936598783228226 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.06555591615366842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024912463291508627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.045169216003218556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016700917030067347 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.4670924044746616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04946249100806658 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..805890004baa70f216d35d5cd1c83b0c52b9701c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.008041599675695846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009393646375217224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.010590450930500163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011298193688755148 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.007375281725767917, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007598214513616661 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.002035037570270077, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003375852776151478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0025466017472514603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003590177860459669 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0017789109729195612, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00023905673552756363 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.006463314313217977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007367508919664071 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.008641735031241883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009172071264531992 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.005916436260589178, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005951477346238279 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.0075302854587578905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008806851003942225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.009830760205551979, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010421634809987866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.006855742708837078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007014519003240475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.7431796693295205e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 5.3491931466841714e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aba12de563cfd127125afad735d00e66f0bec919 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.12685534513451716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018887365773073797 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.19257637263679828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021883455691780597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.13902820692376117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015977409115201353 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.016462988916567202, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006707808949734338 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.02514451000634953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009497451781410877 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.01758898085911187, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005845142555911145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10541733933259849, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001488446061768874 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.16496936421708008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018405198920376908 + }, + 
{ + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.11668382606343859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012164143612498303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.11691349355480424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017447856277329906 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.1787124652688377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002037393870462275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.1283175804360348, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014569156963222744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.5934908400066984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04577158251574039 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f45aa95ddaa95a9027fa6d3abf1b606f585f1a92 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.13945230313441068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018821753367623763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.22481790415194117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002565910341383482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1575281230169748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017363328564521583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.023062949689263944, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008856830610119579 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.03747209579391718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012382175456694833 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + 
"rouge2_fmeasure": 0.025193685294665726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007407727831149609 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10342195238697749, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013835117875348828 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.17152534934129882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019955254451948295 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.11740619203596399, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001212497657479112 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.1306018801081659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017653906193587606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.21110368454548847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00241423606851821 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.14750316226109086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016133291787871627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.3183556866717046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.047701497933856064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..45833586ec364d3cbc4db3abb48df6f66ace47e7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.17257284656874033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023886955187102186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2624694392412262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027556283158437244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1865965522646182, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019292479914873961 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0386635351793387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011380738975515533 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.058135448256113796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015225329835211754 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.040241799290576724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009520200071847311 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.13532659869410305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017966336725582315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.2128994118965257, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00228475982913276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.14758577085564545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013996716496550154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.1602504598601161, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002234879306428014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.24407970791374364, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025830465673148976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.17307145674730884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017845742440570694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.251008452787017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07741675426485511 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38b5d89af5b843809ee9f00920f1fb493b07d936 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + 
"results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.15910068424231602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0028942512572503855 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.22204167759688737, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0031930213084605565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.16004715198678016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022371069572285113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.03901258316156372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001398659027827721 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.05258690577442277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015262833449626498 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03722078217429424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009797813487457106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.12723416561359413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023377769258987246 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.18222910972085204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026727913824271736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1282770493927655, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017127973815793287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.14797898460424116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0027432966222511084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.20640960313414, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030075792341701397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.14829642927866662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020736961426557118 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.699909605867917, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10552509596678104 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e1b53173c0d12aa9c88dad96e95078205bba2887 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.04968775822887688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021399482740943475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.07040614521918066, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026795450412985373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.048912234414672136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018303249700878453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.012311807543146269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008824124738308823 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.017322028166765847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010491107468874295 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.011579301131776956, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006430202421005574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.039942899566434244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001742840540946017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.057791070453037226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022301074454849456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.039162591548067456, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014375174353609152 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.04604352091174109, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020115234889020277 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 
0.06491403804508811, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024844589212858962 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.045047671152385296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016882686974440538 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.3436540247746996, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02229209860808525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..57916f0818ed4d5d36ba0247e513b0fe3b0d531b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.006943053713544341, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008438126140001231 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.008894699789752037, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009656196544936773 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006704558536864876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007268399140477246 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0016201459969527312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002822313186020891 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.002057657522078538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00031150670601567205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0015940878813423497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002489498244534957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.005508592130950043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006491909281406889 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.0073750235680540436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rougeL_recall_stderr": 0.0008073064738323795 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005412786572883017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005765647323043862 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.006472729964406455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007966089744646425 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.008284399822459577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009017405138743037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.006217775628514679, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000672203252135069 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 7.509001958231661e-10, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.9581849644344306e-09 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..544a4023081840913b672133e8fbbd03bee891a9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.15222794945796614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001854391491208858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.259279440153063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027052349535794085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17809655126532312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018236388559653763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.030497998250497895, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007413435249853986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05425261740808977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001399269212332445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rouge2_fmeasure": 0.03601951697280678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008300101222283796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11697977840134756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012794261124303204 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2062333696623697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021597198817625285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1384741786574652, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012894950527783321 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1389877568936686, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016763555392295747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23808538118300834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025032807703488534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1628530750095549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016530415994811475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.6192080325529026, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04050520870343064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..82d4a736d1305850870d6ee981a1fc707d75ff8e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17909391212556103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021684904700656395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2986428547454232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002901154619840759 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20484942755932814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001960405666708866 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge2_precision": 0.04098672752953724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009402949935154375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07250823843480829, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00171407343966866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04757609861819433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009961877982725383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12586154047693615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001447455908294373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21783174447787615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002284390945146576 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14525233898326748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001303267897245817 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.16826083836269604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020324268022024173 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.28157870760475356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002752878166728396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19265194446101522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018360173432108782 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.545643626821724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.048869510836124584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e1d923facc8e12c1c46d014d05f1ebf67029d08 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2332474242904849, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0031768172038711467 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.29410268890498986, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002958828414438297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.21836409386191818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001989518194370473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06566711930000235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019413943466877388 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07670871046547496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016787739182338268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05650249608530642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011487340298214372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1752085713341631, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0026584779502129303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21819240455646285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002276168106614223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1603606067900483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014157142739287894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.22112816157170134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0030769042746772593 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2776979400072693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002813335835445409 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20617794726883687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018867289872942854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.123719853905042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09435758546693512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24a0440a29d3b7011aa3dfade5bbbc3360aa00a4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.22344987630747953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0038946320752308397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.228687908011972, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033634865916312945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1819863863181494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024282963824572346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06821362116644611, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023091245896002433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06243742554002245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001665626169617889 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05005886014366939, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012507484027525577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.17380086539073145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003294779021368952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1728818390227638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026287282834976822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13680266609806765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001821525871477116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.21200019749157653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003735810714751958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21587781354640423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003186682013692883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17195845477450572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023005495305984053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.5006949715383118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08071997944617754 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b55697c96f61323636f982ca24b521715e03d09c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06907034391802865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002842925157819051 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07641790711530233, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028071608967584296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05800966504497334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002055695120200674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.020323118606363738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013765538659170954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.020689214217856734, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011098261782535187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.015594437236270214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008033551932234715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.05326819078634553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002300594360643888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05823666899257084, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021806250615139192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04363217232305618, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015475509311173715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.06512227133445396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002711627122094951 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07184593801381658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026558284841563095 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05441077080277394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019346325978603714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.4203638205606742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04062156232814086 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..923867ce2dbfa74c3afbdc5413760609875713e8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.011141473368752484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012984382630292146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011356368661606248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001166446625113568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.008814606254748622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008820141996097188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0036979579695363266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00069734856115032 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0030116660387216835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00039145176963269976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0024833328621297794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003381158590391299 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.008749076310088717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010822814361618575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008702418570223684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008937526565969341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0067361341505456805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006778741138953477 + 
}, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.010561687151292019, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012526496630397305 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010693942253000091, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001098581608402249 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008282398875850052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008304343325046589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.610620675193976e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.336079313722214e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..036117f9bf74290b8e9b3ecf9a5c6d607954b30f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.10260040509975965, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016816885739169474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.17029243506464356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0024565476844548746 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11796833391406458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001679540196270818 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.013771295330145835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005405889426764763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.024597820085107556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010575366347068554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.015984403876231276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006030736806193037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08487885507917314, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012383575679524605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.14493899736320978, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020037770730611786 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09851081780240795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012485527286575612 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09525971665769668, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015577333350387276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.15944167499687736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002320656358626716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10980074384140354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015599379035532135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.7950296589720617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02515184874378254 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41abbf42a2dd7578891c06ca2844b1d7778d8811 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.13088003322560143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017327225800732847 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.21418661344518894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002499792501862042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1501712303062816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017196466104618196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.017915022495063486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_precision_stderr": 0.0006375485225038393 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.03152784532661522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001233712804278278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.020818331143036432, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007112541235966938 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0927062754049998, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011190335418292052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.15760113826039301, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018824698988046046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10744769510485963, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011180856260891987 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.1231860191540673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016113685375994105 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.20193059778802494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023296824464227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.14140181753666214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015942864602239222 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.1020316820663745, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05702217937857135 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f0ab74b3f7622b5ac10af6a0e92aec46ba4c1a63 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.12906028226799743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019154820247282277 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "write_abstract_en", + "rouge1_recall": 0.20575231428257332, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026866298708369675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1444547659918946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018306868593415573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.019627327385611428, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007723877693570129 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0319483746867405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011670173723061863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0219486272286028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000730523081944293 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09457002128612854, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013340023606244708 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.1551264273734134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020478473866823725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10633076283099206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012465424058332557 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.12068760136915536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017801778755423962 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.19339826374990432, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025250499570330727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.13526431674777373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016980499334396653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.2530485760326848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.034281822221775095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ada9cdbb05427b8b7734eecb6df36127fe072b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.10422127953279672, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020439479764111446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.15837206489083266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028160022730325317 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11206218209748141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019274598908821678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.01585990327947335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00074827880117735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.025799942301719232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011077245741293905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.017248853072901214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006648333827724484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.07759975361518856, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001488357218316451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.12105717330360508, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002169577022250882 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08351024617373054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013594205425130031 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09810666829139612, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019260722064131435 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.14946639296314937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002649765078910915 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10540334302928316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017989673027563617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"bleu": 1.2880401565259663, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02892700209020493 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4120090dca2f05318b1f175c0bef6e17f3c9aa0b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.025833527508072014, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013553627152093723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.03917838397118692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018502036256180985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.02664836221163967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012443957611582865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.003887727371925059, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004542038297200875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.006181327739501699, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005609017845600248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0040223276062826534, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003523486305081411 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.01966558062594936, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001016340019434063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.030934646851062786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014674198673725897 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.020358325637102978, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009200768723482526 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.024201111892969286, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", 
+ "subset": null, + "rougeLsum_precision_stderr": 0.0012739866254506131 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.03658490742322415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017229024892846027 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.024854830906791463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011586779799544313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.05493255544599618, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.004843136897563013 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..210a3952dd360694cedd8a05db20caf4a4efa2ca --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.002265233515641718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0003579734134523194 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.004150900437862034, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.000611002349583528 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0026710840623148636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000399411300114256 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.00026974488596675497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 6.71291598724476e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0005227590853525244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00013134134677067233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0003338217989499952, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 8.131145790626592e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.001728264703351125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00026158511522949576 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.003287044984691778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00049276938287603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.002058167115063423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00030026229552540865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0021438711710157696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00033684195204710315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.003922128301743887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0005753087418442966 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0025284149541073293, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00037733824330625027 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 6.481159719965521e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.6449993743474514e-15 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..51f603e429d7ac61f2229bbe790899ed51c8773e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01486539538592837 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014758652303574872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c97efc8c6e14156c40a09b34f10c4bd51f0e737 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.355, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015139491543780532 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3689c4a1ca7bf1c34c43fbe6b899f127606dfe87 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.36, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015186527932040117 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c06b2354766d42ad515c9d7328315c14871764c9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.363, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015213890444671287 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015120172605483696 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..50d9614a8974737e185a0ff60ebf6391f5de0316 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0150806639915631 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015120172605483694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c88036b97d741b4606c369e20402ede11d557761 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.364, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015222868840522024 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d950b3d6cc94f20248c4d67f04172bb6add0c10 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.0150806639915631 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..63705fe3c542c6ac1b6d7774f487c85c1e263408 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9f92f7149d3acc46f88e9557192f2ebb8f08502a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.352, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015110404505648658 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7dbabe5f816b017ed429295a28ad3309b5884a9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015195720118175122 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0bf10fb84260509f96dc9103460f5affe99341cc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015090650341444233 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d4467e9dec503292d4cfeaa969c8cebe8be80369 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015100563798316403 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.364, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015222868840522022 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0ad8f4639ad80b602ddef62bb578c584361ae9 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014888272588203941 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0a9d002874110f34649724ffb57038e90f46c78a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a4fc7a5583a68f866e0e55ebe8cbbb22bb69c32d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.355, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015139491543780529 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4d0507c2326a178c6ff17a91d2e931505ada9cfc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.36, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015186527932040119 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..889862fb33d099915994604d2bccbedade61a6b6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732956 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014955087918653598 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..18d20decca20e7b92556df2d8ac3322a59e84441 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014965960710224482 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014853842487270333 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f4db0a75fa348fe64392bd6f871adcd24a9d18a9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811494 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b7b60e07c02e4194d911830ba07f1f43a10d038 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356953 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8efe6847593ef6f21edf3b136da7a7b538614cf3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014758652303574878 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f17ba458832ccb0ccd1c1bcaee0836860818b349 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014830507204541042 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014888272588203938 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..864c90313b1dffc07479d6df0c34ec00f7ee5718 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811476 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..421b014a8eef6e6dac230515ae101bbba82dd595 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456732 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014955087918653609 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..50288a6e7337a85ad9577caeb9a3a11ba3eb7862 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015029633724408948 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4fd1152b08c83a7ee20827b220a76056f7123489 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811483 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e653fce7282ee5e12d61de2bfd29d92b4e7e55cf --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015100563798316405 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014888272588203933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ecbd9780e9ae384d0391fa17de33ba32fdeb5ac4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015060472031706618 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01500870618212173 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea59e9e384eab1e74598bbe6fdf814aa0ce9986 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.331, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203933 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014955087918653593 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..873b53a115348da0e89537638fd51118e9a8cfca --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456738 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014721675438880217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..191996805a86a4ef1feab03cd35b34d1dbc7af5b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015008706182121734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..23fffc217fe61a22aba78317bb2bcb77bded109e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792515 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087973 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..23bbeaa261287b197f94daf226c902de32a52021 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732956 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014721675438880213 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..60d1d6be169281c5f7cb6f0955171f9aa9a68a3f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541035 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014721675438880215 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 
1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..58d00a485b6537291e6f7e25e40236f975f371c6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014987482264363933 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01455320568795044 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb633c054f97562deed8ba00a588c20cc804054c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01472167543888022 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014645596385722694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..002de8c6da5065283ea38b5a7307f582cf4a3178 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015029633724408947 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c4d154562bf46e3964b8ca59b904d2d81b880799 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e88c79d8dac40edb2bb429b392d368a679b9cc36 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057127 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014794927843348633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18f0ce38afbcc688a36649ba451fe2d85940eb01 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 
0.014721675438880219 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014606483127342761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3edf6a75882e0d59a9244f472d3f82dbdfbda4f8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574883 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509005 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e325c88fbbb1b53a382c7a160671f80a28e0a3b8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014658474370509007 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01467127282297789 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f025fc6fcc58b11e77155acf98c14468e38ba2f --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014865395385928369 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456727 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eea051cdebe5bbe0cf4dcbbdc5232c6310f73fda --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a00a3ea8e4548b9168f9510e8e91e4f4431fa988 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738868 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014830507204541026 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ede1af033dfd88d551831d6a0b798b53a900be5d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738863 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014853842487270333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..520e022ece3ac63f6c0636ea83d5f7fd3fd88166 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087976 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014758652303574888 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4e21aa858923fbcd893a225b1458efc74744e862 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014770821817934645 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087964 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b4c4ab006873c50efc05d99a52ef2361acf50c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229859 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01492201952373297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..517df06f2d3dfcb3370fd509fb611844d9870918 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014645596385722692 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01470919305605713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e3e01a08392ecddabe40e9717df8bcc053ddb90 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541035 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014758652303574885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..68df52456a48c76cd3fbddb6dae4234a9561d2e4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014998131348402709 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e333485d988e065a83f2d4cf3d98ac0969e75a0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811482 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014944140233795018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16f77b4301f7dbe6f9afd46260c09245f1538478 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014976758771620345 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014965960710224482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..191d8cfcd25c5745fdb1b4bcbdef35e974fa72de --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014888272588203934 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e43c07b4a3d9d0947ccd287b839e0ab6cc1bc544 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087973 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a241816908a2a459a92d28c2361f5ac5c80a95aa --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574897 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b9ebc325ee27407da429c1e1c85eb50a84a7473 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01480686473373886 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014806864733738864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..117ab6f7ef83b3ef362abf1831ba999d16f66b79 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.317, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880219 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014619600977206488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e02ba98fcb2e26124ec82448c0080daba25734d9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014888272588203931 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f626e67169fc3704ec0f79da304c0ac31d07716 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251947 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013728421539454872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..2b19269f407f408d95dcfad0a256c9369261b7c8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821472 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d0cf09b4ce3731e36b8201cde347d38d1940ee81 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013490095282989521 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103244 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f8a237f9786f2b08f7606bc00d0382758ed3864c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013596836729485163 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013490095282989521 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4b6f14ab9216e1d7a5e9fa1f3a83ea0425f367b3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013490095282989521 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6f012aca57773b4c04b415bf9c25d51becd89c9e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013415009084004864 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01362243481313678 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad7890b0173accbb5af1db7baef57c325190927 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013655897185463664 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013588208070709002 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3df2b514be53a4af6383aa3acc03ccfaf4c2ca19 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821474 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01364760294240639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef9cb62b33b4e4d673e6a15aed246fe1c8dbecd7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013570806258433625 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f64d4dd39675d08552ee36797fcbe4ff236baa0d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3425, + "dataset_path": "anli", + "dataset_name": 
null, + "subset": 3, + "acc_stderr": 0.013704669762934723 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013696658778002515 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bd450f307ae6095b9a088d2f4ee5b98cda39c0ca --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821474 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013553211167251947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2193fb6665b0f3c10e66b5dce301ba5943af9388 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01345294899699629 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851553 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..5b9fcda2d44ae72ce0d6555e5c2a7525b59a0c25 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e0734f18c4b00c186bcbec3a9403fe1734103be --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98bc894b85613ab3f3c082ccc7bd2b9ae182b920 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013395739415639082 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013296358936471105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0b22b2e5d1821fdb723e7eaa5aa628d5372ab303 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3433333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01371263383046586 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4f6f7465934a8207c3eccf66df7061c015cb7391 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013499258621103245 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013596836729485164 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fef44c5dbbc8eafdf1d675c46b6fe69d744dade2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01337626879098211 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01359683672948516 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..87e2096ff935afc1309ae8ba749b9cbf06eee2de --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3075, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013326707242912041 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529017 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9c9c638b7da7716248360f2813d482197832841e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013655897185463653 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821472 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2799fd074bfac8f3fed69398c6e90e82f5eff86d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 
+1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013480882752851548 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013535422043417459 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2b873d63bc2f780020f29fbc8e87815bafa1dec5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251956 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013579531277800922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e4a101b4a2b4b6f67ec7b8d3dfabf15b24e85a1a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013688600793296936 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013688600793296934 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..80aed94c92eb015a7db6bdb555273176f1f01cfa --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013655897185463658 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01357080625843363 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c29bce920922352e18fb6b213dd93c755576a5f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013751753243291854 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013605417345710526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b32a16866fd1c457f9079b9d33698f996cb86913 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932889 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, 
+ "subset": 3, + "acc_norm_stderr": 0.013639261190932887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..be20eebe7c112f473fbe295dd012dd5ce227cda4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013471620929769145 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.30916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013346684134591945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3dd56d971c23588675e4845df94116989715deb8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3525, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013797164918918366 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013570806258433623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..67d89bdc1aea7ae25223fddd1671944fe7193975 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ 
+ "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01346230971200514 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821472 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd11949fb812ae3e3e3ff906e6158815b4e6a21 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013415009084004866 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932879 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..721c73025b8a30bc13581c159ce52d04129c03f1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301834 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301834 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 
+ } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..52ce9cee6d7a94677bd5b4b42f75b99848e53d7f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948856 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d33c6f8ca1cd4d4c7f17d78385715571559668ae --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012288926760890793 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012288926760890793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d3108dd86a86eb0a214602405455190f5377cf9d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012272853582540806 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + 
"acc_norm_stderr": 0.012272853582540806 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5b37d61cfadf2536ebc6cfee4efef650bf282215 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01249146853239057 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01249146853239057 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2adfd0a6fcada0b48c819e1359b2517ba52f74cc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004755 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012399451855004755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..baf75148e3713168159e91a1e0bbcb64b5a5000d --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2738907849829352, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013032004972989505 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3097269624573379, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01351205841523836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dd772c3011ca8a1fc2ea8399de152c660524754 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012993807727545784 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29692832764505117, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013352025976725222 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5f726fb36934d9889288882df312f9fce92299a1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012915774781523226 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2977815699658703, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013363080107244489 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..84565c8c661dbb32ecffe807cc0d7fcbf5028ec8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012780770562768422 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2909556313993174, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01327307786590757 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bd9c73bb5d65ada4fae275400d816055cda0497f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26109215017064846, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01283552390947387 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29180887372013653, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013284525292403492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b1fe95178428d335e74b40060805b36d8757879c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2645051194539249, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012889272949313368 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..07780154a815a8a423ac2ac551c6cf82d1e4fff9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004752 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012980954547659556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1081710d414583a2fbc4b5bf75cedd84b743e99f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26109215017064846, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012835523909473857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, 
+ "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d5cb292e4dd492021b2826253ea49b2693486886 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0127390386952021 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012993807727545784 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9c4578573c2cb6fca1b927a9955cc918db3d3e4f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012821930225112556 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2696245733788396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012968040686869154 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a213599833e7581988a77d08962976ca1484a2ef --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313964 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26706484641638223, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012928933196496342 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..883e007e01473aa766938661d3ef133cbb85e3db --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012780770562768405 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012902554762313962 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a47564c305f69fce8a83aad4d366ee86a4b5d5a5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012288926760890793 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012288926760890793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..000258e962e945889d4e75ca5dadcc3a2d83122e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01247630412745395 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01247630412745395 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..61d330434433b69ce020d20974df2f038bde144d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012383873560768666 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012383873560768666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..da11a444d63e506dc518911917c16e073c8fdd90 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012383873560768668 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23464163822525597, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012383873560768668 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad5c73f4b756c9b3e96f5aacad4761cdc6a7b39 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948856 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..626ce24fb895909b6f995911eaacd7ad955d54d6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012174896631202614 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012174896631202614 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_0.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0117e1f518b4d2aa880a110b032b15ac705d05ed --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012980954547659554 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.318259385665529, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013611993916971451 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a91f9bae0fbe005ad8b2fbeac80c072a2c55b16 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012849054826858117 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2977815699658703, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01336308010724449 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e61c788d1017317abc3e6e35a6d6ebb1cd573160 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01279455375428867 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28498293515358364, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01319134817983879 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4fda018f1c768fb29ba24cc801a57d5cd103c5c8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2738907849829352, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013032004972989503 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2832764505119454, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013167478735134576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b564b2cf11af5af7c145c574ce3a189bbe5699c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012821930225112547 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.27303754266211605, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013019332762635725 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..896f5d75eacb27ba0d4558edbcf132f8f9f532e0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + 
"acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313969 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.27303754266211605, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013019332762635718 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30112ac0767823b9cac7459bef57cf769905b0a9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.255050505050505, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008944265906130714 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.255050505050505, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008944265906130714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d901bba4bf730f4d8fa7d588d9b4392ba9ca91 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008844984581934907 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008844984581934907 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0fcad25885db53485211c2f6e2098138a04323b2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2558922558922559, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008953950243013993 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2558922558922559, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008953950243013993 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5ad1b3ad8dd9a7beb321af6c140903692d7e8ea --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008865199020660961 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008865199020660961 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..76b42c2e75a58138df6b608c82900a7c327696bb --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24242424242424243, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008793651516485087 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24242424242424243, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008793651516485087 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae0ca6c56423bbd16da8c5b9e7bfdb63ff1386b4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008844984581934895 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008844984581934895 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9fa3fb4466a4473c944bf32ce31fa18b97031e6e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.35185185185185186, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009799078929868707 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.32407407407407407, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00960372885009539 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..028dbd172216a8f31cdedc82ff504aab58d5f19f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + 
"acc": 0.3451178451178451, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009755139387152023 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3122895622895623, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009509325983631462 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb8eb8d8ed08d629b23c417b07b5ec0138527c3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3333333333333333, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009673016668133394 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30303030303030304, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009430140669278948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3753772606d27567d0843d17c5e1d0c21f2c272c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.335016835016835, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009685160765932356 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2984006734006734, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009388855914040433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..423f74969df0af2a70775bd99038a3ff1729b9e4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.32407407407407407, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009603728850095384 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3026094276094276, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009426434542371227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4418895612834fac5d1ae4c89f6fcd774c7e5a27 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3202861952861953, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00957415266873942 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29797979797979796, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00938504606669487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a21db0f1e8b7280afba95edf7b56194f357baa40 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2878787878787879, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009290733161670164 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27230639730639733, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009134218447652666 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d54917b1c57755ed466d1b66e31630a861ad217 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.32154882154882153, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009584091575640627 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3148148148148148, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009530150430975607 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d2144395ad515f0d19014e68c754d3a248507e6f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3282828282828283, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00963574950926216 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3291245791245791, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009642048058060978 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7aca277af3ded1ae12917848907c7837ad4377de --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + 
"acc": 0.34385521885521886, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009746660584852445 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3400673400673401, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009720765494805264 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e5ad07f184613c69596b6782ccaae224dd5c216a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3367003367003367, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009697166595752467 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3425925925925926, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009738105469984201 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1c36bc94b6d695c4130f8eacf1051735d276d79 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.33080808080808083, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009654540125986126 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3371212121212121, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009700146509130078 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f8a2816cd1f0d8d25a5b2e240e8cebd9304054ee --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2563131313131313, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008958775997918365 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2563131313131313, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008958775997918365 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e25b8aec507d52c8a09839d6d648a052957ecd05 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00879883644422203 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00879883644422203 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9d15c892d72b29266745f630b36a229a651f40 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2563131313131313, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008958775997918354 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2563131313131313, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008958775997918354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0726805a9ef67f23b0134615e2b872c0fe1b593d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24621212121212122, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008839902656771866 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24621212121212122, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008839902656771866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2477bba252ef863c500a199d55206eb86ee0fea8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008850055161459234 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008850055161459234 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2075253d5e6de1958b322de90600915910cbe7dc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00891494899149571 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00891494899149571 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8982c8f9f845b7fbed694bb53e2567e5b58686ad --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.35395622895622897, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00981237064417443 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3148148148148148, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009530150430975593 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6f4cfd77ded427c46b0949e2ead94136b44a7d9d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3291245791245791, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009642048058060978 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30345117845117847, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009433837434252272 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f9f608601c3ed589d2914544e2ce611caf8f4fe --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.32154882154882153, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009584091575640627 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30765993265993263, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009470292575831183 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b5e3692874c1ef6e35f939a0f95316e0cd0b63c4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3287037037037037, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009638903167022171 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3005050505050505, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009407763090599316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..05cbf594d179e7096166c3a2cdc981c21b65c93e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3265993265993266, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00962304703826765 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30008417508417506, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009404000558513339 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..96786538486cfdeffcdd52fce0c5a04eaa1b64c1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3164983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009543851857323891 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2962962962962963, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009369711585684292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7fb61715230915730ccd977120fadfa473a493ce --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.589, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008984425782182318 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6273333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00882919733890307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_1.json new file mode 100644 
index 0000000000000000000000000000000000000000..7a28c7453c6c5ba65e10cf61dcaf284a1f1a7ed7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6156666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008882569490543049 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.63, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008816229842524025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..04e3354bce6b6390988946834a1ffee2f797d2a3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6273333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008829197338903068 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6273333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008829197338903068 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e1be2679ddc800353ad46b2a81c685ba0c4e961d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6313333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008809638003862736 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6336666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008797928274394058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d9574ef5ddbd162ae8f6ad58adf0f404500d7da0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6323333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008804646702971675 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.631, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008811292732995706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..40de779d19da2748244f3bf7585e228dd87acace --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6276666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008827592133099664 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6323333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00880464670297168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..75af316d2ed339fadc78ca25cb2f8c63f356f1c6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6206666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008860362324722527 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.4083333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008975481073137033 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c6b9988df89ba3937ef68059cc0d279b5a3dfe70 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa366a03218123be0b3e9490fee8a04d56ee06e9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5963333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008959169522662576 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5913333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008976614094836194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e9b2ee16b6b3707e1ee626ce80e709ae278771db --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.613, + "dataset_path": 
"super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008894007408882734 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.605, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008926639623340282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1bfc2fa7b8c600a41f4d31d1de65e709f790031f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6173333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008875277637761267 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.604, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008930542249025189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..47b064917307338d54b7326886cbabfc49b4a639 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.62, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008863380835773165 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6096666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008907909838637955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..19c3c4f152225022fab56c84bfaeda5683b401ce --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6226666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00885120015653439 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.46266666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009104744524973354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..05482d08e8320e401af3f26e15d26e12e71c27e0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5423333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009097447488896774 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5413333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009098980657278165 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a74693df03942d3fb820326bb86f01bc9f983c7b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5473333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00908922749948324 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.531, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009112665923139413 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2895cc70cc4b88c55c1f1b38c5b967f0e9606a72 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.546, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009091509877386517 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.523, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.0091205662238016 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..95e908fd7b65dc28c45a24812fd95e87ec30ba94 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5476666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009088646624339614 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5156666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009125748094153249 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dea951f6562e4e18fad338a3a2473e2d753c7583 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5383333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009103358843448796 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5193333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009123403215694962 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..55bead7b03c2c0089ef904b2acaa90779b8ce584 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.49766666666666665, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009130131705156546 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.38133333333333336, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008869364649389163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0319247c391c8f83ac45b2bdd6e47bcf06e22511 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5426666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009096928229880421 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5423333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009097447488896775 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..956321d0c38d8e6844b00a669e93e5224a751372 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5913333333333334, + "dataset_path": "super_glue", + "dataset_name": 
"boolq", + "subset": null, + "acc_stderr": 0.00897661409483619 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.588, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008987709736566396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0aceaa8b98e18b740e601fb6e7e25b139b424562 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6136666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008891174310695494 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008933122315228996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d5fa7d0e83666dfbe0066912eaaa495696523017 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6156666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008882569490543054 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.603, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00893440584870012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..44c32c9925eb388782ebe9d1acaf5071e97e0ed2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6183333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008870849530787626 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6013333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008940758594209433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40558e99438387e49ed1ac1cd3b9639df59e9dd1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.38966666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008905164372580985 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6243333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008843442555522142 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..59225b8d7de05e92d3e90a14c612a7c01858edbf --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1431072ca2e3c1c54df3b5048d12b27cccd712a8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.595, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00896391565823638 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5943333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008966262991425923 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cf144b9f7c2a015eb3f54f386ece4d28b436d4e7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6096666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008907909838637944 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6126666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008895417372116205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..526e80ee2b7c12ddf12b41accea6303a0c9ef608 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6206666666666667, + "dataset_path": "super_glue", 
+ "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008860362324722528 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2312f28b4d0a79766997fa6af5696c8bfea2acd5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.616, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008881119942353993 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.624, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008845002997512752 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..262e0b990c64436a23f51c161da7eef73d6b5854 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.20571590265987547, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..20565a6bede4582b327ab28e5bcee619f9e2b79d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..80de9e28de444d89f0fadb3fbb67f417878c1331 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.31122702434177846, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4c23327387293f9052cddbf576ce633320e10669 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.3113026819923371, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e1471350554c8c80887de2a1bc5dec74d728466a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.33963161021984556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b0cb8a092f0a4b6c0856161275c2cf9dc4ea6a95 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.31340255400405775, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a834ff56cab58233fed541f2c4c6a28c9441290 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.19047619047619047, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e3b7d5aaa0c1946f8f1d89cc56894d1487d287a6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a919adc0d6b6403f30700405add6d5f6e183c28 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3081967213114754, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1801c7b235748223b1e86c6c14c982fe94fa5f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.23488400048082703, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3a156a0c3162368a83a3d65bcd971df1a3b873f4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.250952380952381, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1042049df557f6632ab77f6670783ceca4e04048 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.26353276353276356, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b78797ca15ddafeaa3fcc21625d092574139a8b7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 
0.2407177363699103, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a13c87e63cca16fcd285ceda507024211e15f01 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..71520e60fd8a846955475a63312504152dec8d02 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.32100667693888035, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fe26fbca635160dd542060fd9e1df579ba74cd07 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": 
"cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3040078201368524, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..16411beeac3a953a57fb7011da4642dfb4c46baa --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.31340255400405775, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..35375fc503e0f55ca5f1dd2ef3426de240c72b60 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359542 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2886268730041759, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c2272c27859dfc52fb19b866fdd7ee34daeb3fce --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.30357142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06199938655510753 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.24545791620318877, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6237219abe79d818b97b7bc8fba8afe28f1eda6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.26425954997383566, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c48b3a951d6b12f143723e04cb6a384143c3d1e0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.32806324110671936, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12e244e51bf1361901356ef3644fc0056c1feac0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.26785714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05971290310957636 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.23582766439909297, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..090c93cb8855b75213671da5fb17526983b968dc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.21428571428571427, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05532833351724884 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.1915830546265329, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e7bda88ff1f9a87ae2592c18f2df1414975d5b3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.21428571428571427, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.055328333517248834 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.18070818070818073, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + 
} + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0091a5dd788289707e0f48d5f2d14de935bd72c6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2558647026732133, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b8ae9325a77d3b166205d4b876f4bee0c2308ce6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ecb92d7eb2f29c038e534050da7b2509ede1cee0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": 
"cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.317639673571877, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a46432649c88771b4b403716953625c4cc977e2f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2716672677004448, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..084ebb7982b00dcc92dea3c1a4f7bfabf90f571b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2880952380952381, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1cb0621f87701c0311caca6dbd2bca449fb2f80c --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.31761006289308175, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..09c201e65ff60d07903b6a4938be3e35984dad7d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04923659639173309 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a73fcbcdb0db73ebd9b200b86d5cb1ffdd3a5ec8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..80046eca43a60fb7219b2aab7684c3bff9dd348d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..74e5bdeecc92cc7e5344164b231876464a18b46e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d919428c8a4ce490eb141844cc35d60f7508ec9e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..104a0bc0633c5a9b8a9ca8ac41140b2b76753304 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ab5d39a5e4b1f7bb9fa47b4a4d3698dee07036e8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f3bdb03fe182938950eeef2fa26f706ad6c54f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + 
"prompt_name": "cause_effect", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a000cdda93441837d41571754a6c89499cb741b4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04975698519562428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6443b23a8345a2416e528f9253f2537cefb64125 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d820442df0ba5ddd80d393cf17567c3cfa57d6ca --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999998 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..22e6fb82b82e20e6657c24159c978fa7015260a7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e320369ce45615fb03ec419b0e90dc1a2e54f298 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050161355804659205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..cdb347232c55c5721845834d15f26e8ac481c6b4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7c64eff1a8b6771dbf4801f31285790413c8211d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b704b9c9f43f3e36934265b97de3ee772497310 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.0498887651569859 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b559b0969980cf47b7498f7747325c38624e287b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d54139b20bf1ac28d3162454858e21999266d36c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..42b473a341b9f9ed9accb36c17e26eae987e2ef7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9115d293fef0a182f5dd239539f3bb5d192825d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..748ef7ad592ba8a4273bcbebd73c5178d5025518 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bc678438cb9b84c51dc7680e260cb3b633f24a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05021167315686779 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4aa5bb45d7a45700efd5b6d9f1cab82b4a2acfa0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2c1a91fce83ac1d61e3b5f3f58d51872f91a6c69 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..caf3b939f4b2a4914c100bf57debe06d77db0307 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f5c752a5b24d86af0d258fc0e1456080dcdb0c41 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1fe32ae99543f91f823b52cf95d8dba0ec2bcb54 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_3.json new file mode 100644 
index 0000000000000000000000000000000000000000..978375b8874fbb1bfb2210aba953633719e5aa78 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ef201357055d75259803e62513e5378b972ad5ab --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3938b20549db1f88d4b6e57ea0ea0601b511f9e3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8799c3bd1a28d604d49109f366032f32bb35600 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 3.3348308138228138, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0542442268004444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.15695463233811197, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0013172199856712327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.31753062729975684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0021570780173147865 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.2057045204344029, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015278039224567346 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.054730014529234865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0007967615111152051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.11321119898453884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016182734674237542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.07227415925772734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0010191143299311432 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.1355356394919694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001147779810670209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.2762000750744859, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020067480630458333 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.17810439394384858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013638022161662619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.13583123172424444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0012122638205754406 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.27559480017616955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002087623902803619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.17817139272595273, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014357182624015643 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8aa36c2b0613e25572587b8d339a13cfb6058ce4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.124007942744359, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07045741934241276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3538932352903374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0022379532621590835 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5881592755809824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002734526700250523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4315579334995969, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002196108406977491 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.15240494036724936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013595392500719097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2605145852434897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00217480883356232 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1874037992560188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015276804768402203 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2466966884536201, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014576935302306722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.41914793158989233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_recall_stderr": 0.0023677125845129993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.30315812846780715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015402915151127962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2906120521359478, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001962484953478978 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.484233118629712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002583500073418052 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.354643328344842, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019921434871566575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..84a5b20c0650082c82a0c74b447c19b3820434ea --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.873416618205943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06451768534270613 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.34827807118661636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021235627642469048 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5915788110263536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026271815009548663 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4285046970746608, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020742383144155244 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.15293244786496132, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013754728634272706 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.26790058462792615, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002298680167945361 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18976901148520062, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001566892088457795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.24630747239149695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001440271745251014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.4268276282189634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00236641291461196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.30509672166724194, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001517855741385593 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.287048775958497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019274716695619075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.48857994815747946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002609384753544022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3533423486463186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001973114674110923 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..16943ec6c483cd40e4f6a590e10c6bcc4aada761 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.24802349891997, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06635722949244867 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.34180817756753284, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020285133445260843 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5920459286923339, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026236762956107166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4237555248539553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001992167963719159 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + 
"rouge2_precision": 0.15289604438013946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001375777527347721 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2737065876148546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023716671424749572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.19128265567370034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001581453769153973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.24394200099420235, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014272228885483314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.43011826387213586, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002376768370329661 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.30417906792765287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014988806302313118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.28376532763365797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018837986492293225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.49209992765341304, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026597190806332866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3518251181572721, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019444219873398726 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..88baed6437f380ddcb0f32ae3682a25198694e73 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.259603611889512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09137927162900815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.33191481035446385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019317351081010318 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5860564434153247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002586714587351925 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.41457403908903073, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019274435714256614 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.1491204275015992, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013513647776642052 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.27220476893326745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023984250517094926 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18797801160921448, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015801875991192818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.23674148673246934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0013752390609366295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.42520126985132733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002377551920288881 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.29732435133761553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001482323108912372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2783455423171209, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017880786684828078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4922469101029398, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002602328441781233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3477933389836102, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018713097091873625 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..059fc3d6fcf3736d56c19f4a9ba2e45b9c440bb6 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 7.083478384911307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10406723270307067 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3214279899425999, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0017937724757931723 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5794690131338354, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00257830350864617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.40449455948498675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018006323163830978 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14499572043902326, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012779901757949512 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.27026431913729904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023571465046724562 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18412349501175196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001510781529338931 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.2271462950868112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012925654415635542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.41645498805365133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002379296311255394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28734129485544363, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001403833180540554 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2746492057015669, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016737991466862709 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.49571786150582986, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025524072970345467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.34568503871928113, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017485361441317436 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c13157d5624dd2d77f7867bbfc8ab9f82be26f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 0.5007977834063807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.03795381558072733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.10661165424565706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0009177614571421352 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.21742813713741768, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0015639823409019026 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.14034402421598652, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0010836222389849536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.013032347739898609, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00035990837418407645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.028194149116957987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0008205075711069978 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.01749967959480122, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00048519612999649395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.09463196140112043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0008085663634196383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.19429758597333047, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001444769495062812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.12481249710425146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000964737385169164 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.09025046396276212, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rougeLsum_precision_stderr": 0.00076036607720926 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.18536439953811315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0013433343943009591 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.11908206350989242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009080994588157214 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..180a3e35d8e40e1a1fbae5082acb87c8bda2a548 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 5.912367109877884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.061836973143818245 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3128102978451566, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001691715951507062 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5681459084996038, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024899844904499157 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.394742251409417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017032945676370627 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13058114965570497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011618657961389826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.24495367272360657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021854464652663118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.16624694535550544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013782062542515164 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2101887975592876, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012001358599358798 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rougeL_recall": 0.38833816310299923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002272167927909678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2666452952973803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013235129810010833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2597848984036747, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001569416089582477 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4719852089818159, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002427421024485974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3277807853351648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016501024982922983 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d46bd74db71d5455636ea2388f22cba89766a75f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.6948355547841905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0511979514463551 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3163535969719062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016603427152068276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5834971336648034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002519250385447222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4017351734479823, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016902369451423883 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1381896709382772, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011500719976283826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.26453724451723776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_recall_stderr": 0.0022668939643765884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1773205809463223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013921382173492855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2163360334247349, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011827261695832954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.40582280287415906, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023312901734070086 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2761947809578966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013335952149149215 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2646418877094466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015454852649096642 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.48824736439219957, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002461897429922055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.33603536027203956, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001646750924419848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f3a1368aa718537189f6babfc9492ab17fe1782e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 7.079344515403131, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06095387919902322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.31434504165137933, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016756850149852798 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5838649601706869, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025270103149731774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rouge1_fmeasure": 0.4003285706279974, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017150724139835165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1411501512049683, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012071435589295404 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2710239140164862, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023428408193831283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1814061261607152, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001458658344245726 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2167987032269629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012303868538522155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4087432941457496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023596629155231627 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2773948472348103, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013888021320340568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26527285008292856, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00157683567546418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.49306302016696496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025313515796034575 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3378608633527859, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001695654483915732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..671255e05d91346c06bb113cf9c1686cb78c5a4e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 7.3036749892250326, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "bleu_stderr": 0.07515670477596327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.31515384991950207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016788769894757376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5890522804534821, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025220913561448603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4023616713620362, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017219623211045963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.14276401815964648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012323791161579336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2759462068171959, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002391724415993567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1839557774098788, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014913102819955194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21671602881600943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012338874458127408 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.41184759065700366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002409432664953705 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.278135454552631, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014055461648461573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26652069267704986, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001586801161274008 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.49825625350122094, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00253173094162978 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3402770443245213, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017117460741257117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 
10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..db02b4ce75c5faa0b5c4d3a47d1d8f12cd9afec4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 7.252202833742519, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09258081742959207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.31332420313697207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001643155770755078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5876744067602289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002515657293017262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.40056764017959046, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001685883519704545 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.14257489924729352, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012041170840367955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2769111512983329, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023686465359635234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1840109955212137, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001458917708562315 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21603612545582793, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012167765328144225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4116394939841814, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002384213224350567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.27755157420468274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013836819536529941 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26549263778003923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015751904507686763 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.49799879824695253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025632163763559136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"rougeLsum_fmeasure": 0.33937537443965576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017048672562046127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0def11d650d820018d2c67c2d00c112cf8cacefe --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.675468702405354e-98, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 2.5576140191327723e-91 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.06825, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004585546668182722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.003749985635338865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00026048740343422104 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.0070392305995167406, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0004846274711079081 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.00046666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00034636309927530367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 3.063572910771552e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 1.7771220236267286e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 5.466015466015465e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 3.1579886935067115e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.06770833333333333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004559937242192579 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.003701609566962797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0002566134309674225 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.006953552388838532, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00047845164190825134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.06752777777777778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0045570416656537435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.0036533251906324437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0002523074490538275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.006877766213052355, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0004727965098252039 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d69ff6d7ac8f8036ea120104cd3daf086e26fa23 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.09194884482583202, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.018000796645724612 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.10846695430848712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004476787852682004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.07403964823256537, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003217977904697661 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.07620341314076311, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00314933874193351 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.04014965767231649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001936657761303654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.03382193035815861, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016247551035281998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.034204603336784774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015853396117442324 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.08344446002840113, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003725636670151101 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.05251848706263357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002326892413736413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.054364245776715184, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002273440248490721 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.09421039833488701, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004060934406491245 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.06112231322923796, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002680843364295191 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.06338498822890376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026438955448359253 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3c744220582ab628ccd997deded58d455fd63aa0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.59707844485191, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1425795749931428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.19136701846801105, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005379543595527197 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rouge1_recall": 0.14581299068539502, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004242345755430707 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.1494166605053438, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.004165824722062478 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.08162398770498766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026060437165329924 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.07003897501562158, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022428114359523145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.07093954631020417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021919661337184986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.14327202198692984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0043408686018408405 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.10348587771415337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0030734426965391064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.10636913992897945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0030096187848934145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.16186299066230891, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004753003923288998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.11938311850161805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035231116397566264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.12280164375185657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003473100919506952 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f38481b51fece0c74500d0f7457724fafa4b5680 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 3.5532345468202933, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1633344563680152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.24121792609367423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005557178035436053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.1980561640610698, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004678758581243681 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.19980679600968698, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.004522960796291835 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.10899469492781635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028561009527101845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.09627564067307154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002532227746801326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.09632029448061452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024366646851858957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.17666987032282006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00438764652241939 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.13985008744550848, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0034011563714505364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.141384148982077, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.003282216691402862 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.20064541312984716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004834286841759822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.1615193895639558, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00390851563052825 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.16308829854180357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037724131070999297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..07297bb1962a6d50aec50ff0941a593fbd117c17 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 5.217629954237116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.23284930274397692 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.26662745576267594, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005575953278388338 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.22983869250345296, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004858036351505812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.22663036657248486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0046087693412430126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.12167903294101802, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0029190844237015766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.11277983027021372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0026762229693605183 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.11048253025142175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002540966652479063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.19277110900350442, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004362200883339936 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.1616071987301573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rougeL_recall_stderr": 0.0035572917110778075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.15912782439760098, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0033432276357100712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.22055060611521593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004823378796586691 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.187891178897973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004094656427625295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.18489747608203236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003859601516136119 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3a1ad1e6f2074cc20fb12a4a737cc42a8f4ecb --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 6.376693171193141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2748726388858482 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.26448067560699023, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0053005540267231106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.25142750063623637, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004979472392281411 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.2375382569438019, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0045304403027307755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.12190247907289396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027751227368017595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 
0.12160621956065581, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0026839402002743567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.11427201259332177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024437856983114473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.19188124181418145, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004147195755453536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.1779950318253713, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0036462875523432373 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.1682205810105168, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0033058315417317363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.22052854784819315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004603014131646932 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2079492427367012, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004237911852406221 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.19617441412979775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038379731820601636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bf4cdde76e1054220e9f8fea9052d90ababaf23b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.8175837539659996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06903661804515053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.067410137720777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0015267778764287776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rouge1_recall": 0.11905569111444682, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002272837057404917 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.07913875454413057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015242869094641899 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.00855571479752271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.000610070681102783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.01452701665726237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0008619364610158222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.010022915068112901, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0005999619167956302 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.06588530481318243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001456682483397516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.11730596366118605, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022143035280371703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.07774011765647057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014739968390031794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.053281568752607505, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013584259494467741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.0896663423829268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0017758029746831937 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.06060524038626222, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012344605903209444 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..7f1d2848222c62dd4757ddc80cec70c0e7f6a883 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.360023014480657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14523312599938573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5368002507342822, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003197714150333534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4204347436029918, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029932330301144583 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4457059970163705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023475038584115206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.25010730577128015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0025416736419787912 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19320140085672308, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021059841419488645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.20489136085595536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019458972766888125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.390181985500488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002842772643373759 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.30202346410754144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002382966736707436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.321161092281206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001995161634007105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4390382418075218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003082038398447118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.34172069570320107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026874775375555484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3629740875330838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00225666081685001 + 
} + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d688d2017c5466f667feb29de1b92e60bd0bde12 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.284050272128148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1770460818661209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5714641277328295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032799480051059426 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4468451656698964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029010255062063995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.47571760194079554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002265550724177173 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.28407643182119585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002743346128512875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21834852344926514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021773726271045162 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2325284196471626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020182406639818696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4201198181536731, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002974299270315645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.32583054256145155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002409815129643757 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3475057918065784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020325138450125527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rougeLsum_precision": 0.4715445246123803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031935178217488633 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36737757013379746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002675859339354407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.391612619922272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002268675240750965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a2d240f85885197a72f00dbc3e1dc75d6f2d73de --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.26351682307982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16518466746497154 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5781766953393467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003232493897043414 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45846733751843627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002862839752183954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4869136869814224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002252377471622215 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2936437583754312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002766357156065721 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2296605325605603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002239048819483412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24388713793667496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002081168246828599 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 
0.42323041568862063, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00299893377825116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3336509571396467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002463831973937884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35476450327695513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002131136688683226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.47879240286661645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032211942725843233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3788603945042402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00272506620717064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.40259898572566205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023428037926685654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f551805844e34b1e07ab4049e2ab76e2e8ccfc7d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.568456206535862, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.22714934644025087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5824109780846385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032128656177573896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46303711017335764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028550395782162385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49248119559722, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002261462995557444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2975497604665679, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002721358209608908 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2337393664335435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022535476965624083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24852828649672148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020894492308823178 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4258973373500384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029570811260566745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.33738654578439453, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002480938975354226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35908240651080353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021535460141111546 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48186572666282207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031731113183941597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3829876385027774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027332507970680904 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4073362598730636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002350552518906019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c4105fd18633c49f0cc7b71f3924ed30452733b8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.41705650902542, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.25051935957566557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5768002584328835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge1_precision_stderr": 0.003210936196026848 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4663927464698937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002827189059265761 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4912358527300658, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002175083911938509 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2929623409468226, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026747147184981385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2340255710785021, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002212699322566578 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24634621400768708, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002025833005255322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.41839619423807317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028660522019280242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3383776885015685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024683861897523925 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35597711356903294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002065825901549598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.47696170422006223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003098280726576402 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3864090320075692, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027224907209281047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4066143622258549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002273545252564866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_0.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..52e52c798064b218b67aebf62287ee02aaec1325 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 2.135318771277489, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05149736456182392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.1300442556023875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0012997860884800998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.26964198994481503, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002413258698941157 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.17219385386945554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016039551103560214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.039727029516438334, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0007171629939039988 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.08739502207944705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015891465126372196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.053555609070310047, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009542998217676876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.11658569686014478, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0010550984665999475 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.24489523558752985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002084234926217945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.15509057564768253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013250287969709912 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.1085790664749436, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0011429046649663122 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.22674868093509162, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002209787876074805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.14407710887166456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014318610459664793 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..be30093c14e497693fa59c2c228491e53506ad4b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.514185829033243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08644240018036378 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.37271218983917304, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021586668011346864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5972780988107822, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002538198650027097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.44848731359904886, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020321502995675426 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16529227064321578, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013977761177152697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27152870296975423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002179427881235387 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20027753375836946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015441978236127502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2586035514720966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014969615123517218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.422455743313377, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023558444341864183 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3132180262669656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015462995225589086 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3082635717959347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019666465629011544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.49456210205094986, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002502870066737165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.37097935121646175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019395659335613548 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b1a19a966b270933fca3cf153387d1d41489800 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.3652204803444965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.084081338904228 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.34970551268520844, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020964374351578907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5664575487593494, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002630711098096289 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.42275422267896534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002039352116930094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15364365198250277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014033191424566054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2562044764289271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002268323282291491 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.1872354549740021, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015796108546654892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2474806927681275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014727021825347633 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4081431372540134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023551278568623465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.30097802629669945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_fmeasure_stderr": 0.0015401844960097636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2908623165977362, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019170217364269356 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.47214399804412605, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00260021880713798 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3518120413336342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019523611899019283 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..83957292d6393d57293f29fe4fe15616be218236 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.4968698400953535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.059669479772880306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.34117325753136746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020233228481327443 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5604137274546103, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026266097618435577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4147491979933095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019885367884029 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.1512868929321076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014148299890455783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.25600210025217546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023581582821936968 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.18544033119790024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016196353447881983 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.24337336103065083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014497505785590113 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.40654419517533735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023761049290174237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.29753225476479545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015402025665262522 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2845042264341882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001885327286169968 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4679542291978592, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002638793268605149 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.34597699574768703, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019510235256301094 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..82b1e60c9cc220a66a12513781616c8187cc201b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.6317178163787345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0876389482196388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3400198376127372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001969604534522521 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5624978911685358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026055056008207375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.41464793929679983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001938651237761389 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.1514447932523371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014010545789579145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2581999705469884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023705600248679587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 
0.18624803563290596, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016110635518348864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2418798336527905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014203318517984557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.40664147075304874, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023808852507882106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.2965625670926111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015189535046102405 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2839226907614591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018528730766754502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.47007701746039365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002613092569755996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.34627588323332553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019163771810995604 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7c8eebbe439f12ced2ace781eaead40553403d13 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.491534980084141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10379177513611697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.33728382733850976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019389929582716602 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5613995582020262, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002595310912421587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4122007802452186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019059990682215938 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.14882470581081966, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013513137828580892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2562146590122059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002346866630933058 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.18362949663211528, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015655061377427496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.23902186517429475, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0013782846724426727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.40466863634778255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00236488455920865 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.29378677375300194, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001476945763349084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2822952417456622, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018260149187799492 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.470397132239544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026203121351988648 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.34505218091652107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018918651643402265 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..10feb9e3be1e00ceada758dc3f509a4e0559a38a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10101705898942855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016024170779631586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.25441720721083605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037811710248573083 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + 
"rouge1_fmeasure": 0.14273720741484003, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00215614367639498 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.014796714574159396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006925399675854941 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03874020571430558, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00185089897813793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.021087066079578522, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009789482177719121 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08142707182033677, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011693922318445696 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.20644516180144928, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028652502483865186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11527495670864521, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001581234925824072 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08118614796808597, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001299485058038869 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.2067297549645945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032511919359005962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11505369846922088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017783784436516575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.764704482900491, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07729220338781524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e14e1eb7af48997a2e2332fb8013a5a225c8f9e --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10639113371900555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015564566569882924 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2612998397001183, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0035416281966467018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1494286022448456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0020905023117744753 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.011201230051773512, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006022749049849383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0281936592385339, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001591047010891972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.01583906203117334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008546628932044431 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07481912872892685, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0010039125628398355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.18571610402684233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002406722992733317 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.10535331505143768, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0013577546916502855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08543958378845312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012424221154866547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.21159077593083697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002933760670380391 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12024087462673269, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001679730935315816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.5933480280129028, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05429786159159416 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1b78a7a72d4696e3aa9068425ac25d4f5c3d1ec4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11490099337353599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017855066978262773 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.28416098353802643, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00403393246491068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16179129667309153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002404708780391458 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01814304612682325, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008381266313809256 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.045832024333924644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0020855696544412475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.025696677971347164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011719065575280197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08448464202437457, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012405791092172244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.21062724107258454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002888253274044044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11916137004159028, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016721548286313965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09202870499262696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014247653846688566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22909088132492106, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003296324605941834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12976468768250315, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019232886149564547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.956802800637642, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.053844707729285034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..74bfb4bf29a8562c309b51499da943b26b70539d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11944990921004701, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002075349245199532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.28605812303299377, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004658807697952179 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16516309621402628, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026871657655052933 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.02287253200603821, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009561691761087596 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.057624882236011116, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024533377849923813 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.03211728661220163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013308066074301423 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.09017935363635442, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001527608100576742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.2174725953720994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.00355507141242734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12491419978896767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001992774375798432 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09579245704657499, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016893680911935236 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.23081299236685363, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003847882937168944 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1325926800413563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002179322108142391 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.2524115713328146, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07566638507964296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7cb571a460594a7027708e1fe0d5d08bc367bbdc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.036672752688489164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002280550360708834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.07104357839010789, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004348885899507683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.044505675987276345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002633675660123755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.00723200667268081, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006697782682481475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.01625946634947881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014906194095147963 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.009527033234766386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008493752428441728 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.028537184733923405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018341458712957833 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.05427164298658868, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032950923739180006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03404085416130508, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002000857011564909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0300121458572667, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019272475921858912 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.057470387886084315, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035480273356727466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.03592767335941958, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002132797533713022 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.6706678456407895, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14282899844134608 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a7fafb2c7a4c0c722342194865653e57140c9dde --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017130559457731738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.00016538869567667977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 8.34686515261994e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.00031526656966758077, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00015897565882719525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017130559457731738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.00016538869567667977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 8.34686515261994e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.00031526656966758077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00015897565882719525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017130559457731738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.00016538869567667977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 8.34686515261994e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.00031526656966758077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00015897565882719525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..22db300c20f7c6647e0914d922653556ea5a76e3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.18114983702463375, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge1_precision_stderr": 0.0032933592986375145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3436733855851609, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004317974276915552 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.22203482324927531, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002846869346449123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.047446576026624596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0020034775467036724 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08991408171222039, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028845441646191392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05712816335799515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018963073568218371 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.14097151038726158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0027300858693928703 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2690913907957878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035417684042767152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.17285149090538388, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002306787503569375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.14261278916768066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0027978086937486806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2733257622994336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003865650205750047 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.17513809375886846, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024578450883517545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.3356469238799042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08798372854463395 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d37169fd86f84fe7411ca92d959b00c72da4e549 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.15000019598004125, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019021792570688687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.36983583808020726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004482161415274735 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.21086782423473222, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025534571718774162 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0368473261642897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001172295467736883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09474599631418981, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030950408198772202 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.052391077282527426, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016560324357599706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11414094299528126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014353953204332421 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2837425807321581, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035949604065181845 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1607709135525665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019498240736311654 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11932299339443851, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015966278483692021 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2967588714120258, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00398015313663571 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16811226782770253, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002178990811580748 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.1464792246976607, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07488613082754325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..52b783db5b29931eae6d8b64b22e1a9a16a77db2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.15639600620674704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018335197504392494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.384927926107643, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004418750266865289 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.21985072281272192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024623644197026796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.040354659501515545, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011772215563549183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.10426062090763166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0031736366813986676 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05747020905233346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016740650674689041 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11784071641372797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014063568059494577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.29208769322286265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036018446554112815 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16596009416926394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001923269555875606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.12311951167098538, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015736137729286763 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.3054920255583516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004042312320168184 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1734564395771864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021637914826953624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.3300715197661055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.076927398815874 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b4681c376f49aa0580e8dd42bfc69b6243ce0c08 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.1560692539475909, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021493143139258548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.36907923753393884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004843054637600467 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.21450007180041575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027403785317734034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03862402056888545, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012274248154787231 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09630094014738012, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00313494655980441 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05413486443607653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017007510337812338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11715734774896432, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016770553946149032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2788115939785124, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003945546036537552 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16116371553163497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021510860622327255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.12407625422586686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018426611135377202 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2950692838761085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004308493244658286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.17063343819296709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002369199596245067 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.219342171346122, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10669483486054532 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..37c36fe1d4e6260be081c711bff073b4cd23477b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0465851524353561, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0027356501105498567 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.08936680319887794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.005013916901074763 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.05633682692560753, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030812766489717136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.010250954815284269, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008867140781829498 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.02226634854406678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001849713945167963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.013394347210809258, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011084639218436085 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.03558590757936969, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022148829116658956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0672125091911251, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038250623727023383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04233274207639188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002338602629818879 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.03778568077281841, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023309139307629827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.07189209907306558, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0041053780258082095 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.04511518475148196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002494953102030446 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.0148782549474955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.145410394761497 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e538203288bd1a522ae2b5f66e2e168938fbf73 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0031814398143476946, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008461438789843281 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0025059491300529427, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006912202190712276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0027092066961271533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007219195635749328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0003440741648288817, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00014037543336007386 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0002614759454382096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00010902516863349196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0002940707111925786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00012089379086475229 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0022868109507954127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006237767036215238 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0017473662230373889, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004792256951751278 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0019169580279472128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005161311601160227 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.002390441596878889, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_precision_stderr": 0.0006439443967438869 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0019002132238758686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005456581356378058 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.002035329984614452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005499679200350618 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.122977737748436e-42, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 9.17065601470076e-37 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..104c97b38d065588316e54576125d512728d685e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.17908873736938333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025998582918028913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3401581574130041, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00466124187806079 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22142206258867178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002657386115409225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04001978076164408, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00143853172997629 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08351989855551167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002908378017045409 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0511787638415587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017012685833384458 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.13280826032727655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0019720134198053826 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2547013277658435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036980006484927316 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16457376114227693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002030624196291875 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.13820058290248866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0020236107693674434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.267820674383363, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00406330233863675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1721074292499855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002183282479578269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.242352554457821, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06303290180001715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e37fcafacf7bef1d44114e384bda9403b15ac2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14242240386934715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018474131654211898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.34972417416826374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00425304183688617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19996151191164502, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024642136191649943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03180784324852304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010605401760687887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08122973807701163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027530425274732025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04515071736102295, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014958437813087835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 
0.10785508103994479, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001342675196432051 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26778299938242467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033228224655886393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15185456372717573, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018177177985044526 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11366701902108305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014987342312601044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.28182954712893066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036825745017303583 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16002746501050483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002040753526176795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8494036251126653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06618274112679547 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0eba155cb790e4d91f53d19f13efb1cf9337f415 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14299555153351842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019414837646538163 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.35054680023331225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004405959066068025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.20066139374359412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002583892537384957 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03372540037348309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011433968826317544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08536452458478411, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002896468085770253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.047730927310845786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016012182287873963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1096711314096099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014240979490601532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2713048964277041, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003435168695225616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15423466735433622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019141237080501433 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11432354351222646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016135722781519138 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.282399261929163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038458320187164767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16073844532804232, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021770573542344426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9553979785993556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08114119801334493 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7c6852c10a3e9933a50de1316b24487e0e8c42b0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14095312565957024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021252189798371952 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3355112558235932, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00482038827448099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19424816162568462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_fmeasure_stderr": 0.002758326610864678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03309503575773881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011583105646001772 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08361606756802129, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030274265177690917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04656621187743751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001620760377451161 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10750740692234483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001617470504570499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25822344513710505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038342113677037708 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14844786782549954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002106590246901777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11272654966890937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017739663038892394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2704314687293562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0041899251242752955 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15564298400864224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002325750760252049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9899134724216982, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08189694158167818 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0eba562ddc392772bb4e39debfb4656f1f790041 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04182867272383347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002516912133041032 + }, + 
{ + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08068317129523372, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004630110172525636 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05061277872693584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028337392524855837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00821242826897234, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000780085704496626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.018229654639556718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016649615737807282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.010817994039374855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009898135477985153 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03237406484505732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0020316037823294927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06169464366711305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035477374875163793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.038452182498697945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002124389719631779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03434532775935572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021605467450954603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06559190633881043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038270052788935026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.040956310132902136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023102690323821627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.851414956048288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10142038717157466 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..e7e59cd1fb12b966e7a1b35d969e4fc2db9c8379 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0021529907822185413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006324831746531657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.001764574432238271, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005172389485732619 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0019051009413407058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005593343846105789 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0001457415441877151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 8.410282821934284e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00011802662746058974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.837858900511585e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0001299594149643802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 7.503370260000825e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0017522974182971845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000503946684193751 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0014399120468157657, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004056116387189239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0015504765957310557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004402385576724586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0018058994766162238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005178578773672206 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.001474217364139951, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004127036649374616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0015923123485654278, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00044995087651658854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.3013943780107486e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.513754776072693e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1d3383a9807ea5bd1041b286a1469fa3a92a6511 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14742845406065047, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022525058672854016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3423037012370881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004534073547427449 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20038284398674655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026502877333823838 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.035566259003931525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012143665358549194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0856761804632457, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028762266574800066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04876836268401361, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015994140460390957 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1119655682266218, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016851034973666573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26208954500477866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035709505116761774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15236796425987564, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019755777625985065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11645877000552814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001831460190174745 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2732569800295104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003986118877385603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15870323247009147, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", 
+ "rougeLsum_fmeasure_stderr": 0.0022161037551435207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9361958605276772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1105276321242612 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..17752d0f4e0984baf3b5d9b6f03e79c6adb43e44 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14311406793133874, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018301237072922785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3539643364314241, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0041947517615325515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20137857386382021, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024449071554284354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03128359088712783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010905195100737691 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08031060486303311, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002840064526422232 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.044490758416455556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015403655337067337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10608525021287649, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013726454652157288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26468135425429246, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033595462893916675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1495663368544618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018563556646033327 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11421808592242337, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001519079728610017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 
0.28497861112523026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037025598565927275 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16108455797657517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020640892705846995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.8413841516657772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10867724653105522 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..db191404958dab2c4fd2d922bb40f67d4ad388dc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14180365518935828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019007951978877876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3486026747587321, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004261696866882266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19920399454150123, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025260772599993864 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.031146297678497057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010625332283172775 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07920383582526831, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00274094299078919 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.044164071082009024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014977618687604835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10604824573625245, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001386014229937149 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26286006784713417, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033079692185273243 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14926255759990167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001862261876173521 + }, + 
{ + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11354760855679208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015422259389720333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2814350135656724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003681070167391697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15986703304551134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020835266037317474 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.7406142615504723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07696728998860818 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4e8df889b7992e860553e422e8acd22b93535427 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14423414216147484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021959372951004916 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.34261604167692117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004689675276520268 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.1984246140855233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027300551089189254 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03256635443812794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011365296591071454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0804693811178453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027940237622737594 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.045340440062370646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001542686638928427 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10661260527903711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016541928452697811 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.25522999799700063, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00362478715746941 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14687329421715054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020304285556757055 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.1152311222340281, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018256115213476074 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.27538991990706024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004032468272508086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15866027545322184, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002268964508976531 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9355476748458424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08372337758759396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6e2bc88d4eacd25f2981e166da2263505f714ea7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.044405906375544715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025610959506775525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.08540051490880955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004853700270964368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.053882370203528755, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029733530298338136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.009550693224487655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008726966214714558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.02007700952508465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017411005950916892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.012207544410113281, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00104718267565319 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rougeL_precision": 0.03383880268801385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0020073366629025043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.06436025797802344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036713369177924643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.040443470438169406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022120809378912516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.036232835041777094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002133037107479806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.06982777383177885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004049740359330814 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.04367643240984686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024209022639727494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.9480714926440517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11225116755335138 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5055b31e1b1b44bb18f2ae95668762249e024843 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0029613486934300065, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008017954910988925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.002604820968979451, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007205582057724681 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.002683283542434088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007198699976945498 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0006160191869252509, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00030225046555707244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0004629935054463356, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge2_recall_stderr": 0.0002245599913382063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0005146148795680421, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0002458736058430443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.002211450919027296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006050217509925478 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.001984170387640971, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005699295324941954 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.002013235956184291, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000545919825941679 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.002437420380568345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006809965618635652 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.002143562481916302, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006129726717806505 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.002196929788975592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006019250148189245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 6.583932428818449e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.862037022971363e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..303ff58d43aa87aad471f7e4dd9ce7fbef924c63 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.16879708539303456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002317898014536499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3723291290939063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004545165883948288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.22431631903155347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_fmeasure_stderr": 0.0026093322298271567 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.042926431575461076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013931045314890859 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.1010845101974381, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0031251208246711737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.058199865858872574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017490056690093988 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.12782464697639043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001786854774861224 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.28577217256570797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037793340502424826 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.17056577152255134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002052264214122958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.13266264249564455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018939748248399677 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2973642224201739, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0041420819681628265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.17737257514409943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022595548538614863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.4757752294894693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10759359573160089 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7f84a0da3c0ef93198267405ac815229b606a295 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1361829095273917, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018976552774942676 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.33393036619688554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004242354030815657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.19107539113194497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002518376366873991 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.028698540492772314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001082902961563565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07241022779028632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00276886558899143 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04062921012242493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015228641462241134 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10299681457974053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001372759455933598 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.25531583728222634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032867969214619296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14491822198289625, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018471762207363737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10874915835982801, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015461930795802182 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2691233096679109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036642185726151262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1529660415125503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002085977228826524 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.7005679889378829, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09964986448064858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f9191089b15579b47c913206458f91dfa0d6d5d1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.13249564318284043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018709338649646882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3246622264280915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004249757242473774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18596744227922282, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002501821848962091 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.028665504454117075, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010557844669082574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07274913333784847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027319168569439082 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04062641993018198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001489738006922335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10277571882616827, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014001676277520247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.25399401920963616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033229190889309318 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14454037675210504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018835020887859987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10648169866167753, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015420217510554847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.26335591514014295, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036975443420346143 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1498182518022352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002092342016369401 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.6591704296842291, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 
0.058952940117381177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d30771e0b5f96132db0d5b10bfef4bbce5c286 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1321970313885188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021585317256625847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.31326781741517834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004809417225791706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18225506964350696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028014365519188978 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.028598963986615815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010992959325129637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07104911617756425, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002803269832027392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.040076565671093634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015348459428170054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10064574048197487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016306171384224302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.23989193498285913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037628092205002047 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.13893464976445405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002136322118023521 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.1044393755169441, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017305664557089341 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.24983012145534195, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004050169780460673 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14439166349049962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002291465287441587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.7550272676426535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10693852053384469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..904eee6be9fbe09d627b73609569f18896535a32 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.036052583256726584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00222465814162892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.06806165902172936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004214887088569051 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.043595060973618746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026168296236483567 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.006971532253971467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007060259100817015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0151236735555749, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015108099841256314 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.00916357714654539, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009047152239114784 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.02826611508296682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017443681779659868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.05303756340984109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032948227775611407 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 
0.03393069801696403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002029922326331422 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.02934032328626938, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018132080168363025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.05552522032023831, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034822420283327185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.03536876258776992, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002130719555450794 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.761689921419185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1328182623834428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..02a83d0f2ba7c31403501dc2c7f1afb9c5b681d4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0013223146686700555 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0003997094985643063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00015232935691739325 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0007072965816124758, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0002672293328922938 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.003144654088050314, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012266788586519496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.00034926050249932806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00012487917660613927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0006215332883020128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00022231592837332874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.003144654088050314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012266788586519496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.00034926050249932806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00012487917660613927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0006215332883020128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00022231592837332874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0ac95ac1c9f69483c150cd9fd10916909b37f1b0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 9.584748891646283, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.4634311680433123 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.250581062033785, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0058239075018178725 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7062158738344604, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006429129435489434 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.31552138662440865, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006271351560477954 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.19130230064405784, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 
0.005388995993524528 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5459773371219434, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0080205859504265 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.24351260224849494, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0060038232867892245 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.24167318855021985, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005722704187225168 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6860409065042077, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006645065801360722 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.30520915879086846, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006214719219843871 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.24469422917506942, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0057794266304149345 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6893060160244424, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00664328067734422 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.3082991401991823, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006258332068067462 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b43c2f0b1f07bc7f352e023d8abaa3dc96dca12d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 9.533178886935461, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.280143031253315 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.36248265473655833, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00786742415876683 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6236177733252095, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007294999942203647 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.36387361635607934, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0072313190667876864 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.262274764045787, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007192741692019697 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.47184937877299354, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008337489044044633 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.2728457121204912, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00696791537970479 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.3439193525878385, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007678026405379061 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6036216040299907, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007526769831516323 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.34999255403663865, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007254981296861731 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.3478926854036015, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.007735989195619495 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6064434019351521, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007514234152920643 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.3523209043106269, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007250830483715573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd1ae47cf7084aeeb5b898354c838096831e44c6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 47.02235168348344, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.7717853003889126 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6572202360495694, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00684754645297467 + }, + { + 
"task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6314346295705424, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007231765242823908 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6186288635304461, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007000082166722139 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5015222551527236, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008055814149315598 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.4939358543250565, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008237670269354857 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.4813163134151628, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.008040016805301602 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6352718835824829, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007015561715569228 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6161858038815646, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007440102458339062 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6025255165585022, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0072208910046901355 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.640484622899746, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0069959465065261486 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6188983018867796, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0073956327995358855 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6055217771045465, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007171276947403789 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38890a00367734f9c02989448e9f7d2eb44d57d3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 52.9579720847619, + "dataset_path": 
"piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.3100172607657463 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6766298604013554, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0067056488771926465 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6598734365934378, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007039834715151128 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6471778364388423, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006873790000568802 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5291084288377738, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008003327478399361 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5238663129877955, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008115248816933896 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5130291514276495, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007995711764359356 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6572059275393904, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006902013951713947 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6451122696509598, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007258135995140546 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6318467907423595, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007102064438778346 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6610004390006043, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006869316294428751 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6472344224673601, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007217696942084704 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6342013853383822, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0070565440039982155 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_4.json new file mode 100644 
index 0000000000000000000000000000000000000000..489060cacf88c31c31821f6d5ecd560896f93c77 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 52.34178177222339, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.133611224110179 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6802288907430989, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006708946985682678 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6684627731217727, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006968332579326068 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6536521914658334, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006849076962719914 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5329584588559811, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008028711159823622 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5312726398193803, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008141583217459525 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5191957202312659, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00803537497169239 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6616061710878812, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006895248485381961 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6544242595472203, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007187512365355003 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6391386680743565, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007074861598280453 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6659368424074638, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00686752783122648 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6566926468449152, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00714104853293446 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6416329037978986, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007024709271452905 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b9c35124b7539d02c9f53fb498ad36bf6f2d006a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 54.45931011034328, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.3566590232163973 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6910659299797782, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006587178887176958 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6819893555935073, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006829836530109502 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6680605340297567, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006698004882443747 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5479142345751371, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007979754345711795 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5461321733007606, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008056987802607279 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5349097989485111, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007953481702063668 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6748350327218241, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006785040185272315 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.668942921143986, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00704009869784028 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6547986538369945, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006916766018240786 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6778670501360622, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006756650353609527 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6709060733280635, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00700381652397326 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6568291500377871, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006878708628824482 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ca871508086ec3e7ecfddfb151448bf2c50202f1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41f626b36c953dffd45f1100de17a8006c98df0c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..59d0d6e3afb761b87af4023889984cc4df63be70 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.4929270946681175, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664656918145945 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.4929270946681175, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664656918145945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a8082176ee4b806375a25bbf3a66b65b9fd34dc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5065288356909684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166482959521097 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5065288356909684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166482959521097 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b44aca4167c6125aa16daf8e1b1caf079314e2c5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5059847660500544, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664988455853328 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5059847660500544, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664988455853328 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e196c498700b6e3654bd381e96c7b1fedb4a63cd --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166557553076037 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166557553076037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17c97cc4ebe9f2b25e9b40a545aa9c0b1879ca24 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.18526507984327809, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.014352687013672758 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.0207974489505957, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005419055674694086 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.2318870955901451, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004096738969172235 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.036171445152019484, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008434569441985617 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0031982235045390026, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00016202149012648599 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.041020991401829215, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 
0.0021462417342201236 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005621669068984276, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002680503297844867 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018948626201834524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0004633920176847882 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.21626027275928003, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038232690449784353 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.0330631932561623, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000733927602740061 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016844577935812375, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00044192663287105345 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.19686515382910386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00370184205433221 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.029360548512863772, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006872127389303035 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3974ea1a8fb3c063227cabe263851953a320b97a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1702000893250766, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.022602277898476263 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019738604164709916, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005943670794712055 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.21655770189995802, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0040458352158600835 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.033921577431504014, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.000851851348681226 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003127993584163729, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00017993685583142814 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03773492345933376, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020242951172695168 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005440119305280117, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00028436946832994327 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018344912777774225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005160040189181007 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.20486712491657955, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003806037253425624 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03161334313286401, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007473887108116619 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016140312647768006, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0004983217012086031 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1850395128939851, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036539476740178115 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.027751716986245508, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006946521749936248 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a061f568caef299a927896fd2a366e1721a8382d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1503842124457421, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.013366858191649686 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01879881841993818, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006724234666717016 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.20311375385070707, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003999202804540118 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + 
"rouge1_fmeasure": 0.03135729952678734, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007915594925290304 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0028249484242329798, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00018807827347922434 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.0352797748367778, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019648415800267776 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004831334688117916, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00025723946213493105 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01766775823250998, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0006294794523191221 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19315703981184032, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037946160886462503 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029545104979527895, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007329506313779751 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015392857681802947, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0006117781547052657 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17259808203436278, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035891800128922888 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025479267218651926, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006476891534007975 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..68a4d5da33b18158cf0c228eaaf456710a632606 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.13272684019612407, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010461864794627634 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01850944530673799, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006860099150056603 
+ }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19748557905607972, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0040233681628607295 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03068502702468395, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008064587848542262 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0026003584508943025, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00017066530138983938 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03498197751607171, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020301674433790403 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004497352970943405, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00024384382231740025 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.017220549482260407, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.000613096462513708 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.1861971663320917, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037489583224773423 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.02864531316521238, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007274072742712978 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015199797557400578, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005931310311220563 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16911250346695345, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003583275698147175 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025128864550950536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006505763970428052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..879f630085f8c37f8d9e5c51d415cf737cce0318 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1116223813797144, + "dataset_path": "piqa", + "dataset_name": 
null, + "subset": null, + "bleu_stderr": 0.009741200478014737 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01686851468849681, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.000499194047802729 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.18775401200901778, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038484179354708087 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.028877688849418524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.000724243387986705 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002223786720488836, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00012230483001260983 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.030293584456631554, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001817796896011939 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.00396783046089762, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00021343834712482054 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.015625504769488287, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00044811071614759033 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.17632683978230812, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003581217055366121 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.026778467249332305, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000647417434268784 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.013796234932842528, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0004109358448293982 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16103835770466796, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003451182928178216 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.023672570832917728, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005880376849600522 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..a4dd3d8d5afe18c939a37bc5f3a9831fa705ca29 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.12258212222097367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010470680109155984 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018668170976776546, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.000825403324535186 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19105972479056632, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0039771031374790326 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.030015026802667398, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008029789068806333 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0027317811603090264, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00028543390592784026 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.032512896092599414, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020795575853301057 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.0042412326403525056, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002561460325558005 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01696418124382772, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0006649299596873597 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.17869693365245193, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00369968586124689 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.027601201325242745, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006943269476354326 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015314009901949352, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.000662958634304753 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16508907780803328, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003604116467893131 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.024736080266790614, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006498006943115617 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..711b9f8136807c60e7ed1e1bd657fb14f2a009d2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..100eaa521803eaf69c525ef9788383e04faaa913 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899177 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..daf6af22a95f6d73e413ed2ee366d921cf3a79c7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.48748639825897716, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662170084916892 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.48748639825897716, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662170084916892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1a4e8d3cbba8aaef03ae5ab428438b248a672d5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4776931447225245, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011654208652596473 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4776931447225245, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011654208652596473 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bd63b5953932196c1dd3677bd8acd7d2062ed8ff --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738868 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738868 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fd0a574fb48a3d6fd9aee01b118722fc3067b52b --- 
/dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4967355821545158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665575530760367 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4967355821545158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665575530760367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..37c46d4c8734942fb77c6818f72c5a88f833ca51 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5669205658324266, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011560864423151377 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5745375408052231, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011535468840824526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..229262d36601f4c5e534cb7a5d16bb6c360f6f03 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5495103373231773, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011608491028638191 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5549510337323177, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011595157509775765 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8142e21b236d31411c88dfc6a3ee2f60182f8ca3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.529923830250272, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011644913435420153 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.528835690968444, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011646407809944715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cfa452d92ab22d515c381adaa104c47ba5ca190e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.529379760609358, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011645667565050859 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5397170837867247, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011628961491718635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b057f33d40be4db6bfdf056d58c21bb5f23ad4a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5277475516866159, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011647846656062251 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5353645266594124, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011636607860111557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8c3078f6a9546d767debff1a96d9fdb291eee7c1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5348204570184983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011637500993815848 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5359085963003264, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011635700809215629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4cd58e02ec9a2ac551e4b620e11cb5d83bf0bd57 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.617, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01538010232565271 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.549, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015743152379585533 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb90c8f9a10a610e3d3e419952c5527e34bda66d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.675, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014818724459095527 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.65, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015090650341444235 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc12469eb1f7528309a59b31733ff7f44a30443 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.689, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014645596385722694 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.684, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014709193056057127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..48d54a2869fdf7ffa089a7f9673cdf89c390f014 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.696, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014553205687950436 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.696, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014553205687950438 + } + ], + "config": { 
+ "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c3a1320405716da39acda072413030b2ca003868 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.709, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014370995982377942 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.7, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014498627873361428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..43992204ef20d4f6ae310276740f86b0db0e9592 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.714, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014297146862517908 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.702, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014470846741134713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..396b4f51736dd90f21c5b933bbbc61e51c5c0cc2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Direct Question", + "acc": 0.866, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.010777762298369683 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.791, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012864077288499346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..067ac7510a2bd8043d11326f261bac3e75b572a7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.9, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00949157995752504 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.87, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.010640169792499356 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ea442bb464fac0bbcf29c239d6a8c2acd90297c3 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.901, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00944924802766275 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.881, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.010244215145336666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..bf9190686d39c17939ad2290b53c51be63cc28d9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.911, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009008893392651533 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.896, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009658016218524294 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..95931ebc2847e0d2386935bc844d555efcce81ed --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.904, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009320454434783226 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.896, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009658016218524294 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..15214da179f074f87950d08158892e8793606561 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.906, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00923305200078773 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.894, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009739551265785127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8232e915896c9ea7191a92e0306a146c397da279 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.46, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015768596914394382 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.424, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015635487471405186 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0b2e58e59a1f98029d5b0616ec34fc8119457ec --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.457, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015760691590136378 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.455, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015755101498347093 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a60cc2f736bcef52c2e24e2a333e0d6fe3072c15 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.543, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015760691590136388 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.523, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0158025542467261 
+ } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc407a768e80893b2bb5d6414902e9a226160194 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.57, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015663503610155283 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.553, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015730176046009077 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2fc036d1142d4b854b1f22af7863b88a46b9770d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.565, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0156850572527172 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.563, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015693223928730377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce420f896ab58c088b7c98f8112ad89cd8d405ec --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.579, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01562059547530132 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.56, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01570498795436179 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac5aa62f3b17804bddb6b664585be272520d0b34 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.534, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015782683329937625 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.463, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015775927227262423 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f4ad4d270b40f62d865f3434df63777bb91696e6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.387, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015410011955493933 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.375, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015316971293620996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e5dd044d01ddbd0a60043ee2439fc807a9a4cac0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.42, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015615500115072957 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.401, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015506109745498329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..287e79373e576ef6c3b4d3534d56b77b477387fc --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.42, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015615500115072957 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.42, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015615500115072957 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1151e7ec1412cbdf62cdeab30171be8645e6a976 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.445, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015723301886760938 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.415, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.01558903518560463 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..52bf180690a8afb762aff8a13a78992b890d95d6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.462, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015773547629015113 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.442, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01571250721186421 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c13f17c0afcb2daf6e2f45ad201e13795b5c213b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.583, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015599819048769618 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.499, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015819268290576814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e71ef1f098106157d26d379c0551ba374fdc3631 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple 
Choice", + "acc": 0.507, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015817749561843567 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.487, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015813952101896626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ffd7abcbb637eed6a2dae9ceb3e3871038f64e9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.548, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015746235865880677 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.521, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015805341148131296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..474fe2d3c705c4dccac75fdc49a8bd3fd49b025d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.575, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015640320317040105 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.564, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015689173023144064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..d2b3f7d22379c37afc2447dcf3f25a1ce3853a55 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.584, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015594460144140598 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.572, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015654426245029288 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce0ce700fa43b3e0305758fb5898f914be00a29b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.581, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015610338967577799 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.583, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015599819048769618 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dbbde13499bce024df42e7b3d972aa67687ee6ac --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.49706039551042225, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562232421541944 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5221806520577231, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551049647290309 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b6d06ff6f33703d2e3d5d71af447c88717cb8b78 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4853019775521112, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557435464292914 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.48957776590058794, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559920087347783 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7083954c52b1bad933e6c51c723c26a63526613c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.47888829502939606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011552120807053819 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4863709246392304, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558135970599896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..83a10e668220bb7627c55cde47bfb79f86ef4437 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011546234813777406 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.484233030464992, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011556682042196382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5286e17c74a2d01f060e675f4fd2369d2b1c73e1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011528611805439891 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551591851683333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3783a58994895892d1f2fef2cda9870b1d924081 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011528611805439893 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47888829502939606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011552120807053819 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ad9cd9afa8db5b01b2d5016f3b78143675c6db20 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.48957776590058794, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559920087347776 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5334045964724746, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011536599118298177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c21b92e7e7827356e7adb46f61bec9e29d986806 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4906467129877071, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011560409019420362 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5125601282736505, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558783570737967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ea3c016f57019826e07b01fefd8de1d3fa4ed58 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47728487439871725, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011550494192008945 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.49438802779262425, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561703928784335 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1416b8bfc24a9e3d8e565d9fc61de4c83e56761e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4820951362907536, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555016408505476 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4853019775521112, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01155743546429292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..172439ef910c920da2738a47faa45d077eda6bb6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697235 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4906467129877071, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560409019420367 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e2b677d0542ca2727691fa6dbacc12650b8a86e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4826296098343132, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555452669106632 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.484233030464992, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011556682042196382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ceed39e8b7c23e518079b9af5a94e0b87a2831d1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a582543671854cc272446abee7059fe222b2bed6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..46d5257e418628891428b50173cd2c6a94e35df0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ae153849607b92e64c2bb09684faa1da9c847eb2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a6731951ee3ee4fbaaf27f8963905c2ddcf67c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c22daaaa87b556181281443854e2f7ad44fa8609 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
+ "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c2e5f22f42d76b51317451d768975224c846ffd2 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4879743452699091, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559087533800687 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5125601282736505, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558783570737969 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..46db7572fe19bfdac0768d0f10f53b8fdfd18dda --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.48102618920363444, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01155410417401969 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.49331908070550506, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561400034509398 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a7188fd79e88b3354dd25e1fb8291f280c43587 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4751469802244789, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.01154813982307477 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4927846071619455, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156122826464673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e2a5263d76ba8edaf5c80dedb16f3713cff8abb5 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4740780331373597, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011546883081384901 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4879743452699091, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559087533800689 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cced2f66e459004ef69852c1465d20a1b871c32d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011539022035111228 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4826296098343132, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011555452669106634 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d7f7062c01352baaf69ea1daba2253dfc5c8df17 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4719401389631213, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011544210396951669 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.47995724211651525, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011553138977961008 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd43db8a9a8d31d51cc2c7603c6ca7384e98f43e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.49438802779262425, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561703928784332 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5312667022982362, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539803085637724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee36bc5af68e172f0e81bc834e531084d58f8e0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4917156600748263, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 
0.011560845076525718 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5056119722073757, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156170392878433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b012dc9d9b9997be6be7aade50586629be2797 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47995724211651525, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011553138977961008 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.48743987172634956, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558783570737972 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d1635c56940c63c522d778ff534ff5e401f7bf83 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4901122394441475, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011560171163157397 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4949225013361839, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156183605423878 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5139701cf8589b56bac8713c35fea3743a4740bb --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4879743452699091, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559087533800687 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5018706574024586, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562351329083271 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..617c7e7ea85c496293b88aa4315002412f7c0e3d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.49438802779262425, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156170392878433 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4938535542490647, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156155858904076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..841b50f09bba9227fa0cb8006abe73454fe985b8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + 
"acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4620938628158845, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..331ce6b8b79836bbfb2c9b5204a310ffda5c5461 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2539427980e36717e87b147d05c3da53d3a1cfbd --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_3.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7c02ff1d94974f67642570547c626410a6aad272 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029973636495415252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bdfa9ec89182d506d49fab474f8908a40fb49872 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e3668c36bdfca08c258e353f78fabc9c10865ddf --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e625e8ca4a792105de259ddec35efeeeb305f5f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317194 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7755e91825cad2796a820cdfd557afd79a84a02f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4db64d70e2781860826573c614c0387c7fab7d32 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9f7a68067114c9ec8625f38be83cee49a5d55e6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9fa4cfb0fe84d8b47a8f6a04a9de0a8b6bf060c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029992535385373314 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7bb86ed9ab9aa1bd928eeac58d974f17db9c8b17 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..527eeeb2b3de32492cec019ce73d41649d372b1f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.44404332129963897, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029907396333795997 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..68cf586aeab57b03fd49149a48a0d9b58af26e86 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 
0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e499d80dc5b0c0ac54c3327655c7d1f99f0cd41c --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..65da48996cae1b1e0d7560fe6b646ae4bd648fbe --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529117 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_4.json new 
file mode 100644 index 0000000000000000000000000000000000000000..19c2ddbba21bc7acf4dd3370b290e01e9810e087 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..99b00fbdc4565ee4c9a4bb22545142ba7d504282 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..89890422a3714a146ff84d063d767a197310e5c0 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029992535385373314 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a385d1e219b477e5351214a9eaf696b49ec08b64 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1a1b42c87f82d5c6c5b90c3afe080d908fb90a06 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4c6fe8f45d74197879c5241d143a572927063634 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2db3b40ade2db35e45e47c3608bc935823b708a6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..910256224c1cb5b7b21e6c9677e478c20e5a43a8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5b4915cdb3e9bf1302e81516a9dffce994f45de7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e079d6ce22e522dd165183a8d4c1e5ba7af50f44 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..924726d7d3816239dfdd3b79f3347462e695dbd8 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + 
"acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5064c36653160ee8ff73c02fa85f6607c1e77d12 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..65f44a9005f5dd7184119cd1b12a888ae195d8aa --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0214032d37e15e3fabb29e89241e4e23c9b2f119 --- /dev/null +++ 
b/4b284b28bc4/eval/slim.4b284b28bc4_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d6335f4fab9d27a1fd172165d68c12f14d6119a4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367597 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0aff441330560868f49eb022c65e167942ec4349 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367589 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050170094497697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6b0af2471a59da79ab47b424ac6c7c53cf958758 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014043619596174959 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7e601b00a786df8f77fc23fb0aedc931d72f441e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5240726124704025, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014036189665395129 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014042813708888378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..85694f092c3bb81ed48b5290f178f07c88f6c5b6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014043619596174962 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 
0.014050170094497704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8156f13e2e8d591e6162ca90d7377d0f5416bb59 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014042813708888378 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140451261309786 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38fe170f35210533199cca87063dace02f41ec4d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140519560640769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..82a915159e2b0c229170ac93d6eec40e9cdff444 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367585 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aae59d0208a6960a0dc5622d4768f22b7d8eca7f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076908 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fa231e4625b008f1efc2cbe004f1b789dffcea4f --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529022 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e80085c13e8f3f30caafaa000796d32efd4caf6 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048278820405621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d0ca941c10bb267257df1f0c0abd326319416403 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ad6986695f5e8e3278fa68e950b862e68050e77b --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 
0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c37cf84fc6a5a4f0d6193f53ceb307b3aa348d4d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790513 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dfa861fc194ff8972826086f5ff4e743e81deb5e --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5303867403314917, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014026510839428743 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_3.json 
b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..360d111577e898dcdd461d451546df2695eaeb0a --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5272296764009471, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014031631629827701 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e254beb234c34f38df9ccef7a16c7783da4b4f54 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5288082083662194, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014029141615909612 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a89e1e43326b56aaefca4d8705b96f9c27490d84 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.526440410418311, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014032823874407229 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + 
"acc_norm_stderr": 0.014047122916440426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8956c81bd5dba4d3c24c9717d7b7466e69abc862 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049516 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..de7e79f40ccd8a2bf42cd147d0ced2327dbab8c1 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5209155485398579, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014040185494212945 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f103a6758b0c5e522addf2fd29ad90fda8d50395 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5240726124704025, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014036189665395132 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..69c83caa67f706070765dd93914b755b606dc135 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290391 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c42281574c2f2e3d067b3763cd1545d844dd8bf9 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330349 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1e312b669fa26ac0b601edd844c0da7a2a31d735 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4846093133385951, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404582678978367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_0.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..58349122ff7ae31e26b2adc2d5581cfc5f13633d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440419 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_1.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..50b6a8353a51228262dc254a59f2bb33be4cea4d --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405621 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": 
"winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_2.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a834220c11cd1a6a72418ca13e28da43f2b30859 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014046492383275842 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_3.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0781e53590343cd1bb18937ad7039c21562a12c4 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5248618784530387, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01403510288362775 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404880419985932 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_4.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..68b57dbc1411fefa3f1915f250f69fb3e2b8dbff --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5232833464877664, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01403724130957364 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405174596179051 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_5.json b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4c1c9ecaf5920974170d5e2c521a7047966fbda7 --- /dev/null +++ b/4b284b28bc4/eval/slim.4b284b28bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5224940805051302, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014038257824059886 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5169692186266772, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014044390401612976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3679678307227526ce7a2d17535f779a586b5b68 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1464020b44049f648fa6583f606b9e0635dec0e40e589854e45cacac48b7c76f +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69c5f4589c9157a29d9eb1157731a67a9beeb1f4 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:93bff76d2dbaec8b0bfbaedff3973a77d9610bfba83cd3c68ef11d57b9a88c8d +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec2f682eab2dc6034bb43f5bb083dfe3b7eca774 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec9dc2cd6f4a8b51a660047cf910db3e1b4e4384c9bd4842a248e8dffe8f4ec4 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19bac14e416eb6a6a8be01fa1f94c7333d21fc86 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5c11d6cb150aa8555ef4ab58078f56296be22b0e5e2102568b724cb815c1579 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b32d167995da19722c75e3e6f512b2738f1836fb --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85171d39c8db890d0617b7abf5964ccc5324054ee1eb58463897cc046464fb22 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eac3e9446fb3c095b4490e37c002b8333d65c556 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8406d36e4d1a32d3203c9a652c7c48061632d6606909e93bb669968e136464b +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..376a79363caba4d286b503ff0c7636c1645a2e20 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e782ba99c9aaae54399ce15a76bafa42020764c58789878faf82cc7b9392dc10 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac1184c46756456aebf0c85ec660b60cf50d795a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d53907cfe1539e81ac86c9f70c3169bdaba964a2cdcaaa12bb391e50add3078 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..fa7aa86ca73b727d95235d345512fa5787d9ba36 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d67f91f88dba628d781be05416d7d79df0e45db3f804b9a8f89d064ba881b056 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67e221e9aa50db6bbc9e2bffdaa232adcdbc57ee --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b3b95ccb352674dd958e725a157815bc720a7fdf21f70bedab3f42624b0a372 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af9fcbb99e0c041590160d748f07fba438c74162 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a98e1a0b041b3d5ebcc8358f4202b0f8fcd205fed4121fb64e791df0b8baf7 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e47d838213411a8aa750e183e9bee6d8355e788 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5f633eadbc96d684b93fe8c44d2775ffcf8baa09682326ee038a341017add1 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edcd76c439ca09d625c27f400e0acee52ea651dd --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c48ee38a9a71244701cbdee46db14b7b423b8d0b4fec71fa27b00f6fe9ec9bb +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f968ee805862d5bd2de1ac9b47125f0fb835a6d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00af6a39e00e09cb66d17e10fe5a9f6112692cd54320d5021180340b59e992e +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df40485b120305c74d513096b8d4b170d3ba13c9 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5540db49540886883d95ed11052555ce03c00f55ae278fd02dd205c36135572 +size 199058669 diff --git 
a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63d8b1a42ee894e93be5f4af609ff77a9e55c34 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae3b2a02f822d212e9174921e3535826a05a46fb18f93506e95c9f574c2ce2c +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcf161ecc2de794d78547401a914430f9c2c4de0 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2619c91f47fafe89c486a0cf4898ff95f95a2c11c8220a064eeaddda42e1b80 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acbd65e206b37703f472c9bfc666c28dee370390 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be941fc14fa3fa6ab6df6ad7e8c89d74c54c06a73f4d69b5b594b8022f316e0 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97b0d3d841cf808ef805a35260a804d85438e7cf --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc1d74c6e209c6697d2b30929417e2648636e9f468d7a72933286000ae24aaa +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60b7638336e9ec652d92bb5e0acfd3bc1869c7c1 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfb4132bc84cf630786e6aa3c28117e3ad1f8168ffe06dbc4877232847286ee4 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8c1630c1dca8762084bfa51ec07dd27ac649b58 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2743fc5feddf34f1897627cffa31165e071d267d00953c3caeed13ae5e722d3a +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d31c99d57ef6ef714beb28031da2d3b211964751 --- /dev/null +++ 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b508dce4a9f3587f66ca7c78f1c9d0a1e9369d7a120c49329a8401e81d8732 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad473696087ea348bed8a7d37a6c7c937f44986b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e2de60fb757880be367d71f2464fa7c3835883036ee8b633785c927c56c55d +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e41e7aeb3c16fb7c5aad5409fea691cd43ee28de --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e4b510dfc3bb9b06948e296b2aa05ac6d2887cb57446ddf7396095daff3b3bb +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9fcfcd0223511b91cbee03aaf5afca33d6b8756 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507c4626a29600893e52525cf73806aee1283c29d8bf2bced940bde89884ae9c +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71390e46fa97a7e11b4f1a506ca8ed8510f17858 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7029984db8c25057fa371edefd92d7fdb1997f7c0f415faaff71bb118a571b +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..186e86f93280e76b2710ae245335ebb90a75a88a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679ef7ca6ceab68bfd2f0e2ceb5a5f3b243ace2626ca2720b22111303dc8ecfd +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6d2794bc157e4f25ea7b1301173ba0195752c0 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a69c55295ebebe599d3da8d675f44e17a5e5ff1ac41c1d0e7fe4d473f672ded3 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b286a912fe63fd6de330c87c14c290638db3a8d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c21bb9a238b85efd6cbe063dcc4b0c718bc3d72679ffb0c21e3af8010fc146e +size 199058605 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a762e53fbac4a6c210b2496407dbcf22ce397f07 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b24f64c936e61ca8a850213548960f240a16e76f7c367fe7a838c47ea901dde +size 199058605 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6da6f7c84d24cef40557e5d4a8f52e18a99140b5 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9e7b889c03fdf25fd00bab87f466c122964e387d9352c08a42fb98e88d213f +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c00f528a5402192f85a3b79715c1150759ee557 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa147fb185f9d4cb19a7461227c43c1660f67b96aba4d7f698dad8a6dce6096 +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..187525e8170f5ef997a915b228ac505bfd91df8a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43fd5213284dc53ce88de486e315f2a608d7e1a58333d12fbe561958e7b914b8 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c11a34415ee3f8583150fbdbdf25022869672040 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb54b876145674898af573cabea22bd9d6fa083d9eea1a05f2ddd9775649312a +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1aa29d9b330fbecd3a4e6ba2b62374aca39d6a19 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:78297f9f308f69bb5034506f0d368fa2f13bac9fba5e447fc05d9d9ec5136c18 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cca3cd1a03cafaf8ee79d9703e00b42cf1c2738 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5825a512033996d076df7739b2f7afcc7ee0521ccdd60ff2800233b67580a6a +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..010334a00fa4c82eb8dad1195a50dcc48d0c72f2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2175ee33dedd9c19ef3aaa7208f1f240180f6267c956bbf9bfba5613a7140d32 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65aaee967b18e191942608f9d4d8e95e315ee558 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cf28861838917fbd04685f1268f09c351e6936be213a23642f737b223e2cb5d +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bec35a0a49a118d6c1a5bb3d7fd514563d30977 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cde88b4b73b77a571ec9f60b0c601c134d1733a86226df520511c4e8480dd7af +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc1ce76b5d9a344585a5caebd3470456c80a56f7 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d119f252bb5cca185dfee983db5a293cd5600fd0581939932048028d00ccc29 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54dbe6379bbef9d6eefa72595ecb92e90a2de2f2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92b831d7920ad0818f394752ee11a7285a1c71aa5e51b813357e496802aa8ef +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..266b2a141e5a402b847616d24d4bcc75f87c3558 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d120f000c5e390e7c662830026360e41e585cae1233179a15e194164b29842 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a5ef3a329f72aece3ea99ad46f97649fb566af2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8177b09f46962f20fac65ef1df54af20d11063bdad1b2b4974c89a579fd4a1 +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb723449a704f2229c8b5baa35fe11f79c80c5ca --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b761a6dc6ffc5fdd0156636810b1051e97e9c4b1f2d25e15c6656e020457ac +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54ca846429deff02193779abac5d1a2f5fbfc6cb --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e142be7e1628473f8a83e464155a135c60e31afe43044e318d1cbc8d13ce099 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed5b36184c783a044f48b61d33af8ee34c58a8e2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b15880219d0ffb81bc90e9088e50ed037274c1dd7983d5b129e362fda74897 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54ef843e5822a81880e8bab995f327492124064c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f05182c8d6138ca62a52b7492ab115e17f401cf8804571503e6ef3bf5ae575 +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e60c311810b1548b3fd36fc03948765afacc1f61 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7091a06e59e0ab3335f5bec3435e4ef02976b46aa36580cc1c93d2eb8a03c04 +size 199058669 diff --git 
a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cda25e032ef3ecca290858f275f96f8c362c3437 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24184d37980d977b7c6874d8cf019c18ae7c87aaf557fa935a5c76b2cd55bcf3 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..375828c2453238212697d1ff1e7824d8adcd4ad6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9dbe3f380cbefd51bdcac52db005b28e35cce839835e48cd095cf876808d277 +size 199058797 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e714b09f2621dc4e27ce575d627f9ec59464e7e6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:973516734e06564104d0ac4f564e59c9f1ac8dbbe9d4bb158d9580bdec7b491c +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..524957b328620ba67995935b42f625b7cd691165 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0fcc3376565b678a6f07d25602ad201212a37e0f6b1a44b3e40229b88380a6c +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f94754f4fd9c40a796e39ea762a1280478c225e6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067551862dd5810b8df595c9bd476d817b79787d1869fe510795da6b5203fff2 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..196fd9f11988432722267bdec8b07446644523ec --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d01dd8927a0c0e8a0776bd7ceca66c561e00636e2d145e44ebcbc1d50b992c84 +size 199058733 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6a79fff0e5852c1ee38379679a42e3fda497106 --- /dev/null +++ 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2713cf3621a10a1abff9dff2bee58adcadf5e4fcf21a89b2d10dc4634b30237 +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d70bf7f7f42e4703612a40214c1467268f804f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5523e6445414482932f49a94d3651d744cd5e74cb9cb970fbcc0adc3c11d4b9e +size 199058669 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee3ec7ce0947d5276f6007aea2622123ca93e568 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6538d0f799680a9471934db1d1ac5e9e27c0f715b9126828f658ee56c6ed10a0 +size 199058925 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8833b00adb12c25545cbe57b11f0e9c4107f569 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09b6c2589b64cebba7a28061e31c7f4ee25b9ee17f1ce289b59c77b320806d0 +size 199058925 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0e14f0bc30118eae2bf89dd952894149d7917c8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d21a0436ef8ce74ce513aa89d6292cddc0514391b01bc6e268af592e8eb94aad +size 199058605 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2cb6ed101b6ebc7ebf74b74a3e9f7596e97cab84 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ac309a4cbff64a9d538877131e2ffce2d7d407904aebdd1ec8fe74f728e3183 +size 199058605 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fafe55293a49649fbf654597b2d0db359689421 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2171e0f850e05c77d5e98040fd86dc8cf89c2c0e18da3e0bf4ac80e4e3475a6d +size 199058605 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6149ff676e308f2311259b758c0fc02df038a4e --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51706d103c1aadbc38b550cba6e23fbcd901d15d1304ceeec14bbf80dbdc3d84 +size 199058605 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7038bf7edbd6c5e61708d77f76f227ae0cfab9f6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c6dbd1b4c8bf0c8b5bb72d213122127b02db8ae779cf4d65d7b8e013a379801 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dde9c7a9a271d376e018c8e16177cf81cf15840 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1960629fd59e0f27ac102dee0d35b87634c9e98c5186534fe919875122157579 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eedb5c3eb96a94825d41aee622d3b773e4d96391 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e9a57c2427a5414998139239e147a75dac2e2ef71af22efea343c43fc497aa +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d21404fee6495acd3c4203d3961ae25da4ae45b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3441c96fff5f13b49c81785754b5d1c0c4f1a019f00798abe41cb1c61aca04d3 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e970c8cd87a890f24269a9851f61f863227c602 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537db468acb5e35da1817ebc7fceea234e9f8ca530237d57878917e1266a7f1d +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad63b292deefc81a833b6c6939f5f5d657186c9f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:333770899f673652d60c630bc45ffddd9a8a8a74f5e994723e156db1a1c80e37 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32605b2d637c3d1f31e6f8ba739967957f888aaa --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd168f654e677751a6c7b63bfb6f77b8840652c6a45cd1ec91961ccfd0a55d89 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4978e5beb507d7252458f29027bf2cbd049210ba --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72f88783a55a0bc0606c25f8f8064860a95237627d1abbf413195fbdb1a0314 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bad1805c76da7f1352ffafc82b65665b6a1ba22 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151dd6c7da4f80b4de7871eeda4d46046e85fbea3b0aa3c729ce46204f945703 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..560b09e14ca117e6b8491269450ee00134cb6db9 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e77e9b46a793802f5ac7c3b05b5de48dac235db2802b606b9ea2124f72cbe49 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9db7e1cebfbbf9e8722dffaa16cf88901515b931 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56290d88f26fbb1d15787d8cfca2c7a241b0afefc1be1c79b2406c0d5c6eb11e +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6cf41ff6a5b50f20ac97843a52bb5a05c358407 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a280b63378408c9343b53422c0a83ca46f6375da6bb43448c04832e24bca9437 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3d75b22a3ee097a258d098c573ee151458b9f4b --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7016fe5613a0d762b27dbc24ca1f2c4fffa3c46530bc6db80d47340ba818842c +size 199058978 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e2a6fc2a9feef5f7f86114d1e17a342e4a535fd --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca5b43b14d05fc4bdedc43e9a526b57a7e597137aa1daac56a80438462296df +size 199058978 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47282cb308ce09c7fd9f37c395b2c41df75bb6ce --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fa8f95203e10524499eaef7ffdd666f772f86c1070d1b3b2b82e26d645bf04f +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9832b88c07bf9489e4d385815601a810bf030000 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3f69c0b78e14dee6ce60904852414ca7aee71308f07c9719fd97ba1546105d +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60af5b366efdceb75c6073b1cd71622652060ad5 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331d3fe40b9277d8acf30542a69f78fa19fd89d71b2bdcf95c8c962706a1a098 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..017bc9c580fccb0aa726bc4d0ec47e12b5c75d58 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdae3669bd8d0d02f885d92e09a360115cf4aca77fa72f58afa092e609564bc4 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0acbb294b261552c0942b3845f455c4dec96eca6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824e7d0233d6f4dfb88886ff658f0c7e646127b4faa57ae2f91b1c04d54779f1 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f25b19dc097f452017690c12c73f94da1cf210b5 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2df949ba25c7f0bbc0cee5cf2e426faec4f65290d2e3b1670e37868833d8ef8 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a395d268e0146fb775f958437ba23cc5a5cb678d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c45a9573e51c842502e5bbf555a3dea048ecf561f20d5af3f79b61254279e3 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67e500d7d390ad7bcf3db24311d1cd59d20d1a55 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d913f5c03b50b1953d1037950e94e08adade7f628c740fccc161e8e228fb87f9 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4f139949f11517eb5121da95763f6004feafd8c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c88080ab55964a524d3c5171121072a9d229c4c814a4007d172ff597f1d1f0a8 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63de72746627f9d407c7f4b2bf57bd1dc9f58f00 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a560654fd34f033f291ba467f3dca2c1605bb41670e64cf6a65a9841c9abf67e +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5844f28fc9715d278fbac8b4c872038515f808ba --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f7703e7a439f7ff45828fad51f071d53ff86a06ea79e5afc6bf79cbf4a1c8ba +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b9d5d5eec4eb73ada8bfe5df44c289871a18a1e --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:33a4ccbfdc778f6396fd7f6c55bc1260a19d8e08fbe0571125ef4dd307349144 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e460ec173133f0ee2653de2716fd4b0c0de1ae45 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9821dd2401e425bddb463185f613a43b09aaa24c13de4f3a3a432af43b1c7eb +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f5590cab8e26b449303d29713d3b31614a4a9d3 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d52a5e6a343b2f284e982209de9b03f4570f9e452d043ea9d14c8638ccb55cb9 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e5e7523f31c8bb4ed5fc06e220cf2664815e31d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c631d8d4652a17e7c554f1f6493192fc1f0b0256bbbb6b56eec994a6ea46984f +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed9462efa79b1373d9edec89d6f6ff5bcdd63261 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78c5b032964d68e8080717759e0bc8783a518c7dc71bdc64d51ba88ba3db192 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f0a9527bb14c4dd651b858ef875b425ceb13b5c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3ad75df609ac84505a0cb774d901baf6526a0cf09278c5bd3b1aceace645ad +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dffa102aa3e4fe8e8c435801123f5be8866eb3c6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48939e9888b8d057a312144923633d5b468c20feff90418f51fa79320918f33f +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03aff72c7842af8c9d38b0336832f9ec998d178f --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3220c505b9f66b8825ae494c2251e192a6eb021ed9bc345f0205ab9b987aa4f +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48b0128c4eb2af8c2933d46ae1054f8609e7909d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8580f30824c9c83a18efa3d7e146a986c253c4da847345780e3e41edfab69c18 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74ce51873ec4f2d23e8b08e34e8c816f88663968 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbd6cf7e96c251217a89cdca2c8c6fb864d00fa30f5954f01d0a070e0fca9194 +size 199058594 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc314c48ab11cee1807870c32d59a9258189ad21 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3561f20561f8ba77af83783514067c445748ce6d30fd7b4e63ff55b30d9f4031 +size 199058594 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5684379afa1883c943a23d37ce6fc2ae113b7f7 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ddb9647ffd54473596033462d2e7247259ca37326f58c360602e0639135aef8 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..238e7ce1418c43f465ceee47e386b76f145c9975 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e34c60ec63878745c105d38e42acd03dd9df959382835dc55e79820218ec9c +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1076e0c119a6308d71c15eaffdfe1819117b20c7 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c25da88bba7b782ba073e2a80332615a5c62e682ef1c81c542c6e5171eb95a +size 199058711 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..008b158b377e7b542eccd6bcbb1d62ffce053cc2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c0dda5c55e041fc96a8aa208787bb70d2755a21912d368e5791fe202aaa178c +size 199058711 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..853eedac6721fb690eea9da18907db5c19f2ef6d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868fd94e635511e94e0393dd457846c3cbbce807a8384f191efca12cbb4ace7b +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..983241f6107315ff71c24ace91de5fe88f16479b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500dbcca8c872ef0788d460989713002b53ab69a0c23e5a649095a704539a344 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcb35fd1502f76a2f4d08f6a32ccbf4e34d1792d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05c191445cbd95a0046a6dd2938aa32698a12f7b8ccd9fb9886828233362425 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b88906229ae7fce22e462daf9ed401bb0ce4496f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d8e872c30a2fec1f53fcf825a3ab1ac93cde294883232adba8a50ca46c0b94c +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72fa63aee5ba00a07e402a36cfc4f25b7a59587e --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd3c996f62d4b4a32020cd9197049a7802bf99ee3ffb0c416cdcf7212b7fb46 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f4e21c012fd2b71bc559a3a94dee5b47dd012bf --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6d93d535abbe55db0edfc425cc7b9d0e68bfa34bb9b0ab56f7a673093c4dae9d +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d7708614a006c2213e5556b332c29ddbd6c6b84 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8757fd9e31b4aebf200c6dc97435e80f7e908ff3f3d2f3c0d7187bc88916a22 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12453e5ba1dfbd221d28cf50ecaa48926f2a52dc --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790360489757dd7774bd0732354ff46c311c952b801d40aba404cd7ebf93ec68 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6014810f6250e026dba493ac80ccefd2078465c4 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56057a72862790fd945cacc77e0f45193d139d3f1d329d7fae4b4e4b02336d56 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3323d6af354be90d111f4a15540ca68f35592dad --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b02afb461e1ce8ad701499d715bbd5f015296aa4351660430179e2104d2681 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d77d8b3efab92ce7d2b3d5a9221eab83fd328449 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24bf06ba303b53b99436ad0f67aea2cfe32f65d0d3560a642ac81323411b20cc +size 199058594 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3b1fc7e09a8bb49029d7d9136fc8e4b8e39a670 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c48cde2ee7f19add98820f433044a6ade058a41afec01a3e902202ea5b56f244 +size 199058594 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2a37303daccd24fdb6068f42f245a787daad198 --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71841e327b961abf11bb61ffa7cacada6572067a7317b905e285d356e3bbdff4 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b78d249a03045d31397ff63cfd96fa7b8da64f74 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2661c5a0cb485b4cc863e4eb0f71e17ebb1d9f7df14cc3c9e6635c1fed82e15b +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7dc8d53814c61c2db557bea662f45a2afe78aad --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09c8d0f0c4d3945f57cc10dc6bc185cbe7ebc0fea3c4d93901282860b0fcd9b +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67a7b0067064317e8891cd47abfe720bdbb486d2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c770d270acfb8be8bdc6467659e45fda0a117fe128f3c0a875a338f734979b +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76e80dd0abc11d218cb18b60e552833c64bb5064 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e5b56e12408cca89a79bcd8bc5d6fd06aa480cdaf294936f9931ad7d6488947 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd7e8020434a55ed0f9df0628f1fafb002e70d8b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da9e8114960b029fbca0d1cb9f86b6e260d7b3558d32a774598d4c4e0e1e089 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..663e03541c3cd83e25bcf127bc307b1af6f3c2ba --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4619caaef4e63e6b5e6f125ffb10c871f1d0a2f8f084d4e727e29498b8ee18b +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d9103d8ac92fe5d86505f167e7b1849f0b1d445 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450c2fa7f959f54ea395be4a46708d24195905d1d99d562892910058baf79cec +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..922d7c3cf02ca2eb943686b5cbc6a977061bef46 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbd94e5e37c883d20b33a3fff75c9cd58597834bfbcdd51c3123a389ea20eb1 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8c6ee99e2f38924fbdec75ef5626f42ca730f7f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f51ba2b0afceb88f9c29cad7b500f3bf342ad4aa1a948f2ecf9bd5211a63cb +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5da30d243a3186bbb4a125808ec65a82c03b043a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:840475d6340fbce63d16edbad4edc38dd3b3cc25dc1c28586a89bd54af2810b6 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af4aaa27f83183e474819b1d2306353fe3de7a26 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f2939f638b3b7119d2d96db6719d5d9bac3b7122f2aefad64f715a0062500dd +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d19009b70ecd9eddf268a09cbfffd3a5b0f86f9 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee3060e1f8d9f2b2e3a381a25fece0cf3fbf59460007ad964510486a5b987338 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67e659fb22fe1dcf465ed9235e81b090a26a002c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ef0f426995d764da9b77749e504dff8c35445509fb822128f4bf245cb97dce99 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f6ba33bf1ed092d39bebe5a96687e5e0c2cc200 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b76269619a5c501f1dfb1ab02d7c097e1592f324209f49a8faa9721d1bd6ea +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d3172fc739f1bb8295eda67fccb076766ce237b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9169221cb85122d492e5a0b1352c14b6d916fa3495f710709a9e2d20247794ab +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a85f19ebc4e8ba84fc4132abc0b3f9aa49e4a7f8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf96cb655a40b41492e4f51c7dcf8e36d7c010776cf29a67484eaf31c7f0958 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..627883b7b9f7a17c49b1e419adae6bf897045f49 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43abdcd1417c34e72df536ba95a88d0080db79734cf1408a88628e14350d587e +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ce3c393e295feeadd0a95fbf71fa1ec4b95789e --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:236e305aa7e3db01c598d599955933cbbf678124e92d12805632716c5f4f1ac4 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24cbd647a852d72609523a46242f6a0732980d6f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e8c70d19c6a2c13534232859e63e308daa73b181ba8f0c1e8409d790928f30 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8972c01b0b8bc1aea7791c25c17bd76e48a8aee8 --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f721639ae344c4ecc33bb1e2dc9ca57411bdd95400a9272633ce5f326565e8 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a92997d0e6b2351f84e54a0a16a4dd4d39f5239 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:710b6a0f1db04c81152a9f8253bb563c787501924339b3efa4ad66ebb517e9cd +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65280d5481004a32551ea6789b9dfaddf7d85166 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a78d1bc9df376db5710ba315eef9b3ec60269c15b9b3823625a15626c707ae +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a531c8c4ff2c8e1bbd7ffe7643ae60b5f2b345b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca587b178392e6c3fc56fd6f5cdc0f3c09be6d7ec63deaa4617e17b2cf71269 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d47a3ca65f4489787bc2aad91fe6d815bc788372 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91656d43a4467c00f93200e4a43cd091f6aa4f77815a24786a75fa5388d3809c +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b4f9167e9f0021d18ebfc94bd0f855806f31a21 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047256dc78807383bbd78a09f978978d145c9fed9aba70800f88a5f03b34227d +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..866215a73518f16dca87437ca35f4a30ad195d17 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc9499599d5ed333068b1832e56a413dee4e0bc3541b3cb5f7121b1deb486c6 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6728c46e03781daeaeabcc1792d88661391c9c40 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5cbd5f16dd17e4e5d3c637f3ae22146d6d874be5926f4272bb22f9a3869f315 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc07ccd0c0b626c2bfb2967c0e9e041e7dd6f67 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9d3660e42853e601441baacab661f8bae0ef5be2716e6d1bbad8c1bb24fc3b +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57e341558dbbf8bedce052fbb4e0720da33c10b0 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32b961d81190f657ad87e4c3b28850a6119d0b7a65112e644ee9e595a60a09c4 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3539f81195d00664611a4503ae45fc341352cdf4 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a89919835f4ae597bbf740840c44ec924c03d76dab33b833f7deaadca51b60a8 +size 199058775 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51201dbc69ef1a3fdce170f7de936deb7f75429a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39b8225021d29ae4149f38c83db51f9f41540c6986d43235be32b3bf2598f8e +size 199058775 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54fbc4f908d8202cc7d06d3c8c98fe0df7baba08 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c46140c4591d49a4d447e2b33a8cc7941870996e7c1873d1299f7f42b312f0c +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ff79525830b88fa5885e1c5ebeb943950a2d5cb --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:582f8a9f7c21ba3591b05aad4cda32dd312914828472f1c40744139bfef4d8f7 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24124a52b8b2bc483b8e82cb26cee5e5d66a6a43 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94892d38bb3806cc9c2bc212693a8b66b07127a159f9f3fed06f1e2555387aca +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9da22137a172b7508b33d677d311c3f176368a2c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f004f58423ccd17b3565780a4df2cf672cad51e5ac2977a648df95532298ec +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b6d8c920222394d0ddfc4f84e4fda82f60c23f2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b256f6833213408a82fee886518bc5a74a3f8fc139a20a32d215bfebaa328c6e +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe5559539bef203c705edee20604ec3792c5a64 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84929b93f512188f66ada668295eb7f50d5189966b9e8569809d440eab93a2e +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed10174264f1af6dacc40c1a43bcbed860b545e3 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ebe50372ad3984c176283b0b2242ff6250af46ffadab590f145e9f72ad5b1e +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..777df233bd99ff83fe1da6e31ce12de9adfd092f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1814d03df557f3ae5ce05e0a5d69fe767c52c617a40692efe4a9b21c4a00ecee +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..711afe0a537c9ce0974d60c8022e24d9082772a3 --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f8c2cab8eb51a3c0fcbb4786520b0f5f9c717e864ce08764c6e0bf3c2ce139 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efdc1a83e0624819d57ae576d5029e4333045d08 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d40e7d95afd2806433c5bf020619a69335f8f978fd294203a95505bf4fe7a72 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18e1f288da064dafe7574a14cb6351942c567f57 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72ea5aa1a56ddbd5277e33133a0bfc8f0a1bbe928eaab68839df5255491f3f9 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2f64a099f17a4ceb102914f7eb07633e403aee2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01533324471614b644adf27a3a4b69b2c5f1b93873eac26b0454d6a41811d302 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1824300959e0060394eec2107774f5109ec9517c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:656ea17d49e1045ddd35bfe3cea5eae48d5ecd320956a8420e3c954721b8cc35 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eae531e82fd9f8b49a29d11f45266b3dff80a92f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b1076bee8b14671e5a0db7b91bf4a73f2766f31473325d230cd2df87c3591e1 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54014aa6eb9f15de012020514ca6e25f0eeff731 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cbb38eb69ce7c0bed986f8a12b6a4581ef648525e8bd7281f6449ff49c98b59 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe4829e46c4193fde6e1dadfd27fb4c90a968c1 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7063874a223a79c30af958e55f896d5e48112b197f2f1661916b33161cf322a +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e515c493a7830b395ecf45df056d1b284eda66f7 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37b2de57513780bd76c16c6851376ea2c71a20bb230752926a90d5a39ea4078 +size 199058914 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c52507c69b3fd161cfee7fbbf4602c05c3dea67d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7478b2a0b6591a20040730ffbaed5c7af05afbf25ee0a79e8c5d007862e65958 +size 199058914 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a867d90ea78471687a0e9e316bfc58b23f0bda6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6757b60be3732ce7217810f3e23abbc79faf2720f595ffd9955cad0636678eea +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3578a982724b6acdb7a16e94a1c21841bcf0348b --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c4e2daf7d462512e42c3d11808f3f223d4b489c9da6b0fa7500e522b04ce5a2 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9b427bb95955b10316112d585b04e674e52d77f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8afdcf5a7256c3a5f89f4abcb8eac27247229a9a7203ce77aea253a57ce0996a +size 199058711 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20a9c2533bc3691757d4bc031e7e41a33b320953 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3ca78574fc622e461fc6e51cc6f44f2819bad807755c641cb9db2dbec72a9341 +size 199058711 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b67f650b5cc082c81576736eb6a3eb350753f985 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc97c6c90849d46af9010012f27418d8ab5f7488bfd2f1840caa4c3dd2e50a2f +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14ce2314c6815a1bc30cb1c65aec53278f425590 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b40e1d2e3e09ce428955cfc4e98eea4592d331f12a3249208759dfd48edc4f +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f88628e75e175c555b7f665317062a2ba4eb8723 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dbbbffe527523b86a187a28737ec114d49352cc54be05c939b795b13b8882b1 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c8068c927fc284cc3eefa7600e965a1b47ba03f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8424de0789eb1cf23d517072ac32136401f3af7c46eb1ab5fdd279a95562fc +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dad1198270a1b5870bae98f25bb589d99ec07adc --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c96381e19813175e49322a547b823f07d797ab64b8fed8ef9f6d6471162f51 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09673988312f9ff9d3be8cc97fd3bd54cd5d4087 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c56992589e5ca7d871c4aa5808a7f8958e62e03c2b0f62d171179a138d1c32e2 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1482f7a6c19fb963545e4af4f558a87fd8359053 --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a123071b14c96ba5f421e05de210361c72e63c8da3ad7797347070771d87f9fa +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97ab42543820dd8d5439f6abc48c5d2874915233 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96dfbdf145d3da368aa0705fa74b4acbe84d3990482aff478d74380fc5b862c3 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9286bee308e12075d9231d35e73ad86bf120245f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51b9e97917737c6016b478fb6e853e06fb6b6afaf34c94b4eb8f4d8246bd2b8b +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ca86176b09403b235f49b2b3c706d952b46d57c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88eb520449d1b396e80ec025e4111fc4f9d3d15566705bd834ffc14d6f1b6b70 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d18a893cebc9580ae3061b6da3ff1688b24da9d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba90672f6b5cdda71153f611ab3d9f015bf329f0d565a17546911e76789fc11 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa171a75a285724372f56832c964acecd2941f33 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b390981a3236bd00141c69cf094fbf36b8d2ff37c9f0f1d3446fb4c7306118 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e01edf6ec13a16cfa2fcd888549c300e814e642c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e76eabee385c3f7c2c9d732e54cb9e6d8501421683283b35fee828643f572ec +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae826eac739ffa86ae058531b64602ea1a26935c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1fb9022be7513eaac4017665b35d99afd145377140aa2acb3007f67cf7dbaa4 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fef967eafab50680454c6fb49623c2898f72e30 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6123647f9a92d83debcee2300c110d896a9b8fbd4d047e11cd71debfeaf55448 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d2825d15697ef6dd7b56474bff6bd2fe3b17800 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc0f21fbe4d00e3520042aa9a8e5a570de72af594060d0ac61ead69c69bff8ce +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab85c71eea10a73def9dcfaa898a7cdcb0f1a0f6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c734eb465c7b2fa1a98f25308a81f35f2b802332bb704972518093c047bd0ccc +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8959e1c8e05fa74a8fe0a3075571e2826da9acb6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ce4f9ef97a2d091fba49f43d5052a1e452068364742863c1257bab1d1f35a2 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cfaf19a8e941f530048853fd776af6c6db0d699 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5650befee60de2b672534b390b5e27f762e3314cd2f6f41d4b7e431ba9091f91 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e4ddfb4e2d09c63a91f8db9cf3f3ab0d8234c93 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cb1c295af2e078672cc07d9ac4b9e8ed1cd591730151be342bf5376d160160d4 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03baac82cc39743a2a78874f822c8e2cdad0aef4 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8220c1d61c07a9b0e8a6ae513cd4310458717290cbb517139ace92d8428f7ef1 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..799eff2b2956966a378635e916d2a401a51e544d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b449e03f727b87d6815ad060906ec28c1ed6c0e5014fcf438ca4dd1e96f93b7c +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f95d45efc67596fc24937d49312d72a750f9d266 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298650b5c1da33d13cb57b36102494596d8945d9ceda1f0474664beb2c6bb804 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e107a360e10e7b831d4f3e36350550dacf35e9bf --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7158331d32c00e53e11762c44ac3cd619eef30fbd573d5ed012f8bd5f7eeb11c +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a22031f34fbff5d9ee9d37a6bcb0b7df2343267 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:250a24a1a3591808a8e9df802aa418594e1eeeed441747d9b846c00a7f4d6979 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b9ba8370cbeb5b2697999cb9b6977466a6b93b8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32844d241875c1f2f57d0308a9e7aee913da87f928f0bbd1367d45b30290565 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4b8be79397b188848d58a0f84e0f0d0cf81ebdc --- /dev/null +++ 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4063cc2595cc096b71767b13959322d525f66b54367089f2763a12a2337789af +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ad2bb69045c67850196bf3fbb5c3dbd9bd13816 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f5d0dd64ceb2383dc21e49f198e1f821c5be927cabd31390fd038df82b7e89 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5203ac4fdd2781d7633be0f8ca412350d59d3579 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa37b0513e9aa8aeb521ff449f6ad93fd980cc002bd0f8ad08588a270cf3aeac +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9a24d3d46155962b6dd2bebff12c4fbb9ece1be --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c8de29e170ecc80447fa3bd7aa5e5192117837b9b767164c7cdd491e671fe1c +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b03ec8c74b06255f3fdd78eeb8b01fa68fb35644 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f013aa4cc10830101f5bf119de49ccfd07276d9b6e9582752b8627ec897cc7aa +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec43376c936f8623c42d7e84b5cd25346d7e0426 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fee41f715286d95370bca28a4dcf1a58a6ec571d5786e58d2d01352dacf2a4 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20c9614c2b4f8ccbd77e0acb581ac482968c0b01 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92fcad1da9f812a94a540de40bbd376a2a3e3c59dadab2bf6f794b2ec79d394 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb8168e4d47da03447431453242ea28156c6facf --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32851739f6e976986140552c52766f89b8600c4be111564f09dc2f3d2eb52b6 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8e9a3054f92b23ddad9001071c78e587906b721 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13c42604c3cdc467312fdb0d19bdaa18f55518b0178c448ff4431a235d059a01 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f2c903e2614e3a9c3b5d0c37b0f99b7650edfb0 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:181022439a3a2c3cfa9ce4b641defa206db15d026809c22c291c718e8f8086a7 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33ae24714a6cd934b4eaabf3f0d2b7b23cfdbdae --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b75478dbe6b59e5c039b562ef77294dd3cffc3a2da415849969ce3d3e771f6f2 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e4c8303f2d41f6f7667e5beee31a317ba096cd8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b95d5de44c5005e34bb016fc45bb332894a0f5d92c97e6215f067cf49c7264 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b4e95a3c8cf2b3f6db0a0d16937ec7d4638029c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f469e3b48975bd2a53339ee212598b507010c78936759f582e0f0d1eeb488dea +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5935b9bcd899b84f33b2aac0bf41538d75e54c31 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:59460527953d9e308214ce1c60c2f4faaed32bbcb793f1654dc09b95fbf02dc4 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddac4d11f4aa4e4b90403e68521312b772d48209 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4827ce7c85806516fe188a8c108ee1daee76af4a89ecbf58cf8d75b38b3031be +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98ad367137eccf67a74b03b9dcd64d0f1dc90c94 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8986cf4058181dba0e45afe078820c3f8fb00e43cd64b0de00dd57f3ce2f412c +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99d4f6c8bd17c00dcd5438928b7c90f26fb028d0 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f9eb4d1f1693e67c69cf4a66b39955e8086301c7a928b79c43e470e61a5d39 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..757a2d4a2dc9d7c2fa0555435a9548b9e8046b09 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414cd52827766e31dbfec5710b2f61c7f334e4eab42f313e325fc525220fcce4 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c16ebc637e91ba4ee06320fb9b87949937cccdd --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c93cd6d9d01e676e3a66f30d0eeaf5a7b893898d7eb587087e585da59d2ab9d +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a93521d100391637ce102cfd9b02ebfb9705537 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:330932936cb21a0a35d64d5c92500b8ac3b26a5a2b7a963469a4464900ba8d87 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b6742fc34f8f9e51de34471d673bb0b4ff2bffb --- /dev/null +++ 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6dd3cffbe462243dc511b355435dd70d07b3a59cc6ed20b78825e1b56ff637f +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6517e3bfd5da9526f6ee21e3da671163b7582c11 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0fd8cefe0130b6c1067491bace34fb3bbb49adc178e39338030298ba5f95f33 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e21b84fe05bf2e7a05b8bdeadb9a6d1244b1d21 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98661fe8d43cc65cc80783f036d04069244028969978b516cedb2a00e6a05d46 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34a3ad030a8b3694452b35580de10ac9d09728e8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84729b20d383a69932b88d767538f05917f764a4aa1a6cd5b28ed7f58b90697 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59e573dfbf44b65f30ef65d429026da33f2e63e3 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42c8d833ef4698c1165dbbde76a2fef176ff32805f6081206574a47be97f732 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c699bce590b3c08c8044a637a8808c9d55f3a34 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:599fffa6ed04cd2b19944a2c4cc9aeec9cbc37470c049fbc8fafb4a4fa3d5bb9 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f929160b69085508ac9ca12a7356efb0a1cd8d26 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0450b51841d7b944951766f83538fca5f99d774c6a644547b0693fb7cfdec9cb +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf0f2d918d87b9059c7f75b56425db6f746df94d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b311372142bf21cdea7f2f2c1bbf46f032b48c06105e3b8cbd2fc5227da93fd2 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de4ba359fedf182fed18c25b46a9815765f83c84 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52d7c9b1bd3327865a394ef781301ba3bbc365fa1f141e94c1905777504c58a2 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..801e7d0c0542f535c67515c8ab234dec96f1c31a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00ff4ca420ec9ddb77c39d43f27c13b1b75369e6f97a6d060e5b2d824ed3ced5 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fce2a4a4f51204763c7e758d98cfd21af35b82a9 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c79d43e52deb28a36f2f1945c973f82e48724dbfc163a53b9f4409c6adb91a1c +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2125e569fc839a16a981ff1bd0f6ff25fdb1961c --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99252febf6d22ec8e53479b6e4c2a3612f90109478b946f9cecf91d60a579215 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e96c39e7bbcc976bb1bc7a8067e26598c32d7136 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3799dc5cc07a8914825ccd0089e248bbf4af42413e5522f195bd413b72873e +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1362f2c8b1af12dfe9d516bbd0803c4192b37c39 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e75a20c3e14513dff8e1a744bc804021aa278f6fd6f08799ed054dc679c75dca +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cb142b1ed25a5801a310e50fa4abe2653b45579 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad3bf5e85e6d1b7faa19e2f3acb6e5f348737d6c9cef076a84dcf3cb7e7ab00 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8665a86c0f56c618ff4b0366b94de1a8978ea6cd --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b58de451e63763c784a3fc35803cb38b7d574b4eaa2a75817032e26349818a +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97937dfa8e5012c77bc342ff24fc87eda020eebd --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a1e22f136daceb5c2751d62a271073c1a04a5e173e5906983819124565a218 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ccaa85c54d12a95d7aea82b2ae79fa4848d6da2 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3896c6d77e2960fbf72ae9e4078b4f1f729296d65e5e2741032ad2006c24d5 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07156020d87d02f2311f55fcc8e4662f061c9f66 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e14407e06be1d733d0e6d21f25c158c306759ef266e867c99651c4da12e0f0e +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e1970157fb1b5d7d92fee1e85d35666135b0bda --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2ea79be2ee2e2fa224ab877fe77e16b737d7f9bb5f5fa28f3f4904dc5ed89e7 +size 199058647 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb39bb2d39cc49ff5a6929c64f9a4d0c8b7b0c84 --- /dev/null +++ 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997e18305edc1ac6c4d15a612973fc11af6716532514584f8e7b458d912a1adb +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb82d247e0d642528619c818f2a08b7cfc389972 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9028d87f96adf7982a84ef9618ae4f27c1477bd3163faff4b9984687ba1454 +size 199058850 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfb1ac2c0795254c9f15ef6567447eeb7bdc88d6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19381302c7dd7731e2b7b5e6b6fa116d1d472eb31fc43c45f899b609b4eb0860 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..851e060355245290c7c840aad680665987442306 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f58b0724bcbd0f6e78ebb429f4a7d0287a2308a7b0a6b461e7dfb5ad307eec1 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42ff4f7d35e26a791ef1c7b12c33a2ddecba51b6 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0e4a064ea299475501075241f0877ceed02c5a73f54a9bbefa3eef1bdf1f62 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f803fb66545b91d58ab2c8bdf4774a73d627262e --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:171c912695e74852e3839f295c64e5137bf9a7b46312e0fc9a41a8df1fe95fd6 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5834a3cccac720c14de02a87e103a50832b7722 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7db17b6fd53ae23397d295c09db08a56853cd6d3c49179983b07249e490aa9 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 
b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef5ccd41d71bf20c993176a97109d69976245047 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e28b05e73281b86a51b721f28fbf5177b2ad649b7fbe49cf4f5f2cddfbc23c37 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f1496f6c0a8845303770548592025c5054f4af8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e35bbafd8be1238a8db9dba816ee728f012d5760ddb0873df5b534ecb479754c +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b14685018dad1dd630c8234d7032e72141e7971 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ba22584074aad36b77d7f7b55f608488889bc1af6a275e18f819b12fba0c23 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c52615adf540fdcd908db71c6133805addc81d9d --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0258976c0266308cbee8f34560de6eb8028dc3239f231c38f55b74c24de8dab +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d243ae000c1331f4b2da26f4f9ce4bf0fa211ab8 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20aa0fabdfc01d5133e89b783f8f1cc49dee7a88a5cfd659785dc1213522add +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09287587070237d31a0341587eef6fe2b9929bef --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee43d5bb9f68a3be6db950e02b05ea23bb40dfa20064e9c399c29f7f9b24d80 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..229463f1c84b84616368a5dc0572b48e3b158814 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:28689436425e5bf18f1b2055fc2b9964320bf1687cf2446a45935113fedeb576 +size 199058722 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee29008b900324ff12ea72018af208cfabde581a --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:661fefbd35687e3728d841804ed9fd642ffb6ecaf75562045dfd660d139c3586 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73b0a185103adddf1335d39c99b85801bf7e2d23 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d330e3357e4cec78d222a65fe7d58169bea8d59baa3c19a3a758e368fa30801 +size 199058786 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bd7904c42bf01596e579163ea831de71ce18063 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31da45fa4d89fe5e179ca6d5b2c7b93bd75877df39952b7af0ac952c53d9e2aa +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf2f403e71070829346a49d122c71319b6452393 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d1691f2f176fbb7d6fe86bfdd63440308c62db5f0cb05fd7f585b38c2ecb4c +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a980133c9049ca957276caee1cb4e7e6de15b8be --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfa287952df306f15d1c5d302f1e1f6cb638a5ee4b2b821e804db18544415f92 +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4281bfff2bcd369bec64fcf67335a0d71f9a6366 --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29a69faafb08f5a4067e2f54741f036cd9278838491a508ffa65af157ab427de +size 199058658 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a5e9c679d03ef2d14da9390452ec23661a0f834 --- /dev/null 
+++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f28e864b912870d1894cddb04f290b010597577b2fd104f6bc085e3987cb96f +size 199058839 diff --git a/4b284b28bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d6540154a67740f1b930dc15263ee7bf407839f --- /dev/null +++ b/4b284b28bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1b657821a59da29506d0f9e8ebf61a68dd2c0411a28ad7968a9a80804facc8 +size 199058839 diff --git a/4b284b28bc4/global_step80108/layer_01-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a80398b2894d20473f92caac2c331c9c6258546 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f9bd525c278faf018a53cab0fa547d21a331bc2219d46f9be7538d70880b4b0 +size 167511299 diff --git a/4b284b28bc4/global_step80108/layer_01-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbbe8ff8a92cf3d1fae184b6026336fd45048dba --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62ff6753ccd4f404c5c7cad7d7526325bdc330762a2ff90db12313017f95f3f2 +size 167511299 diff --git a/4b284b28bc4/global_step80108/layer_03-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4cf7a38fb9b1985937a363bb8bac8f0b6855b32 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82806ce552e21b5b09ef15623ad03f2a13b386bb52bcc482d2477d63b65dc50c +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_03-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e54182f697983e923239a4c7c08904ede662766 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62f8e87eb384a537e94a1cad31c12fa7f7bebce6db0c9960e236a4960ac064f +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_04-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4488a00d569364648259e3dbc16161017eac6c0e --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c01e1370915ec4ccbb27294fff29b25a22537c0c7c5d968afe8ddeaf907007f1 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_04-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9c7772592ced2de68330a07146624ea974fb2f3 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ac1134e6dee45f46a00dfc8949e5aa13abd5c5cf43b8b4697dd1797ce1adf778 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_05-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1115c042419bf8ea082a4ab3a5cc84e7df7a8d39 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89e9f7973019efb9dbf5aaea4a77aec66d823dbcfabd901bf5e862e0c97605a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_05-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..002575bdc77eb8353c793b5a043d27746fea9978 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:382cc714d14b5373c830b62ff4ead1845af2734e720e359e24bd37f0c0111f5a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_06-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87bb3d28ef115ea4a4cf8c16c56c4f4b9fa375f7 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:306e4ab9feb75b7f41e48a909b5209d16c9fac8916dccff037b39d43df6b9035 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_06-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf4601bd7e7b081964d56f769e1e934e1524c9da --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b88eee4865fe0da4eeacc0c4bebb6c3db382e6cb0281d766860ffb8dea95f11a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_07-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c80c6a650acff5e464c8f7470c4ff2297cc85db --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bce92140f33820fb460120da51ff22e053c97fa94c6f44118b5340f52478f24 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_07-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..885361ead0ce09b6a36ea3a0bfe464b95ccbec11 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a467fa7a21508a60a33626e2bf7d24914574f60e9e954eab52cba0931c4046d +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_08-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f258785918a50ec673983b44c14ba0734109958 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5883a85e911e7f48352c0a002ba66ff63575b463b0fef37e3c42a9f80d6c0678 +size 113308931 diff --git 
a/4b284b28bc4/global_step80108/layer_08-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e33882ca3c8ffb33772c2294a945009e0103e08 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6352e17e3f3e7257e97de247a3612cc1ed9f0b35c5af4d6465a2d46f390a1a5 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_09-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1291550516ce074ce94010acf38361958e2d894f --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d05adbb2948c415a1c80e24d36520d56097d9157fc2156de64158b3756afe8bf +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_09-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af77c99de0bb4ad89bc19dc8a311ab230a18822e --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5acdf2ec7f4425ce374499202accbf5f2577b61d1fa7de58d0f93c7f67beae29 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_10-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..425b2584d452369b877ce91d25ad946f5a44abc8 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d21eda38542a3df55a7d1b16cdb1f0f4dc7c0e0369acad6fd7f341f2129cccc7 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_10-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bf4914c6bf448f9429640d9098d071f3767d243 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4eb74a00cc21cfd0fd70ffb23292c794a95f876bf62bb3ed88868da6d2d8331 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_11-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42a43cd4f6f98e503debb9efa5c99427c04e7adb --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e125469a4df79e68cb9e7f6fd2717ee138e58a6a62a33704f9870df709695a4a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_11-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84d891d6577312d0cbe3dc8f74c8aefb4073c22b --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13002842130c9a415c2d59b0236aba9cde5155cd1f7f768eaee86178ee70e4e6 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_12-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_12-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..c49213ffb394abce16e694083d3cd4ba8ec4f6e7 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b93592d8129aabc011b7705599aff452d4982218bda385fe523f2e7c2230bf +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_12-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74a11187e5ce52c967787a03fdc1bcdfe22bacb7 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5b2fa3af13fe9bd9ac93446d6d53a4353b98fb0710ffbb6815d4ff0a6ae244f +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_13-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1b14dd34f89419b41e3c3f10ba94088d2a00f1a --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b63e1876df02f5563dff85e40ac4151c741aa2f4b2a69a91613d0514eea95b +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_13-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df8969110b06de81dbd9daa283cb7a7d6d9d9027 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a29a0095bd8a6f88509d4ccedf7eccdc688417e84ea3e1efa4d4c59834a2864 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_14-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..326aea7952e0de2adcbf5c5ccf285e414b3f8245 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab82f66aea4da7a8e87ce5f7c562fb8dd36f562bb234fc5485339e15e20bc383 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_14-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23e543cfc907856c2c517b5ae2fc687fcec54be8 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01b5b36890afa90d43417a15397fe1e4342da3f95f14cad9e9f0ce97af45ada0 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_15-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ce9e93632183ca5d5f71866cc3161403f51d920 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f6c45448230dd9a111a841bbd35daea65f649e58b6617165163a95ed7380d8 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_15-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f819803fd8def43c50157779b930dac192fa7092 --- /dev/null +++ 
b/4b284b28bc4/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66274381cdc817e7d4960b7400653fe8d4acfc2ce1147a3187cdb54b9dc06264 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_16-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e97e8c54748d6d8f23df8e058d0df32c9d770ae0 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b3bb329e847ae5693a708fbd1921b49e994198efbba00a25d6a274352648303 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_16-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b5b01359a14ec8e19c002796808aa813b7967d3 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ced17f9037591ca2bd0cd6cd967927b81138c68da64961afa19214981de0f9b +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_17-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f2aa46427eae7fec366f831cc7a0c11d7feccad --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77862dca2acd129ee4edc3b6473beba08d044f7332c9e74adf015c0fa5e88ae +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_17-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5a658bdc376879c51b5273f16eae1827ad6855d --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6ac701588c0e6c92b73c01b7b8778bcd91571ba1c7cb8464c76801f1786b6b +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_18-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e16505282a1abe3735a5d7770e2d06fa3d88af9 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c32dd6c4ed437858ac8088851d37b6a977af593a6dda4075e6401b06da08eaa +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_18-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfc2b4940577e479c1c30a69aa9cc8f6dd39806f --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cd9543b4e9e6d007ff772c970f3e212817b3bdb669abb7857cbf22710271d16 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_19-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab12cc6481cbb4d6dfe9458dcd382115f4dac7cd --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8ebc5c637e76f83328c98a6984558fa319d2e73ffa3b4c657a0a0bf28477d611 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_19-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02ed76d8b37e1efd5a17ac919d610f2d2f0ba757 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f754d1e6a6fe5ee1f3ecd1e03f2331cc391486ceebfd822f25846e9c9bdacc +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_20-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcc422fc5604d0121181c40ad55eb212cf14ec41 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b94c8f03aec97122f6806701a17e51e3ff6dd24717b87c70358111786a0438 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_20-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8524e5169fa75e8d9b721fe4617f1f2c95d7193c --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c43b0b1f2c80d1757652e4dcb3e541f86b1099ded8ce2819a6d40176936ba27 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_21-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80f3161c9ef99a87c6ddab6e0d19ba1e1b85239e --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79df5ab698b1448dec4ad019dc3e86346b14e49e2b7df8bb4649ee3048c350aa +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_21-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce14ff77f0b6f06c74959ac03fc1742fdcf938b --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f75b1906f497563c443c1f560d54c3ecaa2e8029e309dff5a59e1ada1de78914 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_22-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1c814415d597342b67c68ce5e3c4d5820d9e64f --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9581581f6284dbabac2913d8ff8e9d5a55d92e2c443272eb9af86e8a49ae7d4c +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_22-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fba32657c5945081ed51019fbc805d8f91e4f5e0 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba481d155ff8992fc70dcdafae36d574961f17a59eede9157d6255f9cac7bbc2 +size 113308931 diff --git 
a/4b284b28bc4/global_step80108/layer_23-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d6ad2107ba37bd279dfaa89d7a5230a0835f72a --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28543a43646eeade04f384a8447a17528882c0fc3f7a328aceb3cf520719ded3 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_23-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf74a27e8ef467418b590e68a9d1626e49483ce6 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74867ef43183402b8ca6018cc57e1e5da61ed7ebd2abcdbf11d1bf3e4231b5ae +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_24-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e76c5f439a4bb6a2c70667048c4d806412964361 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cad0fba93c39ad32cbfdab65439f04a9a26dc50423c8262ddf959791b4f687d +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_24-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e13875dfc2bb48bc00ef042cb56242a145dcafc1 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea60a9ea033586eeb8f09469657a1936ce438439e380d6e5739c176cdcf22e63 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_25-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11d8ef82d5fc9976d75ce07e85d29d97e3000f61 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49377ec9fa776edcd6bef42df9b429426f29b06a7f4d926f1864647fa67b0cd3 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_25-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e95b340712ff8f044b3226b9836889b009721af5 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0803a41aaa03f3c5cb2299d359da6c805def9d6174b550fe57976b244e7aecb5 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_26-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8416cd534dc8b3124bb2fd135740a7e06deee7c7 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b50bbf56c3eeab658716722658ac04a4e26df817707a8b00ff7b6d7a491c24 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_26-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_26-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..1de5204099569ed6c854987fe1e01a6c91ce0c94 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dbf335318eadb6d730d9726e52a21be845ecc9a475eb59c56e09d68c211bcb2 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_27-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07e9e93412a411120d7358bc29cc13460b45002f --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd167684f14f7a02ba147ba882945dd439ec56ada58944d6e0fef9b436a8d547 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_27-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d96b26552008a679bbfc4ae1e83d16159b232ef4 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fee19ecc77c7dd05d044c817a9ea8ca18decedd493393dc73834c9a25d93b0 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_28-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03ee298414fd607afecbe130b7221386391b6a66 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7132842ff527575ed004936237485e259a0022cdefd6bb8142456de7364e85af +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_28-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..958e65f5bd517ef1fd4ba9f1d5fb8a47aff9e778 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:784eec999d3d74d0af4834d77e38748ef36ae8d02c501b80244bf38d48a6d4c7 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_29-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f9f8ceff1415254cbaae3ee49c72090921cdbfb --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd83d98bad0634492713c584871d8b8b6fc451dbb0f69c38683a17ea53b6158 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_29-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6504f5b25267e6b195e4946d3d1ce0d426aefe32 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:258df3c5d42671a4460ead065b62cb3f12410a3370222f7c18e4ef2e0086837a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_30-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7436ad0eeb1d4031e5fd1501747bfb93992060ce --- /dev/null +++ 
b/4b284b28bc4/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ab7abc167ed09008c6c44daab8e43c01ce80ee0f75179569bc29cabe7ff742 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_30-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f15eac80f8dbc7583c68f599f5687eaeb13f29d --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22e59f26818c842756328c08fdfa5d9cb68df111958887535a61f7e158551f2e +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_31-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d59127d4cf3b4ec52da6266d51ea2ba20bfd03ec --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e0db2049566bcb79ecb6c16cdaca819db4ad9e3c91baf9634c4c2acacefc12f +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_31-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2916ed68f61bd719c45ce34ffb410e220fe193c4 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf175bca037d30b351565eaffddddef4ae658bf9789da7d2659ce274b9b8769a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_32-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0521b10f899048f59f1d600dfa086fe12fab760b --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d47c906891001076ede4a313183c7043bf5b4621a1b6e199c7826bbcb38d778 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_32-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5381b79299a50e93b25f58708a6f753f234dbae2 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb7841b99f711ac07cbfce0239c95b3402d769fe883caee0cb9e606c40d1f78 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_33-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..973475d5f8aa897efdd5cb031bdcf8abac568f88 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a4d429ebf8cde0ae0063527e4c2dd89aa3c98aff36143a09b1561fbd54b055 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_33-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc108b21a63b19f37fb2dd9a3ba9a06b7c91d477 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cff2248742919c7f84ba56b14f2e8c48e61f0bf3bec4350d67b070e7387514c1 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_34-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69141dcd2cd4b114c320595ea848143fd58077bb --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02661535469d88dfb964620ece709631247ba14339bb352ff06eda4bf8e9deee +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_34-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e1c777db5c0c6a412995c9d26924b3744214d4d --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54a05aa20f25ba0a24a71cc28b12d632645fd0fb90390fb73fbef1720ab0c4f9 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_35-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8762129a5e0cadeb4951cff4170475bd914553 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d9a6cac0aafb843f6fb2844d06a56c232d8b8e5089b0f34d955ad211068304 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_35-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ec186130508e67bf3c8c895d87778e3e490e414 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eedef963dc24ab827df712fd63c5c2d88c2ce3f31c3598c135bacd8cf8bd621 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_36-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e43d93ab08e11f915d700d285e08b38f2e8b71f --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96165d7388efeee7efd4b87b9a31e145cd4380bb0ee5786e868080b37071a94b +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_36-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2414fd3e0fccfab2c08bbfb6623822958b1417c1 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b41a32c03705139068b5c3509c4059f39d9c16fe2fc65147beeb3e83797fe24a +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_37-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7977b00f5c7b250113bd8b0643d7530f2bfc618 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd47d66a6f5302bf88de729ea85422a7bfc6d52b8f4cd98dbdd9d979ff371e5 +size 113308931 diff --git 
a/4b284b28bc4/global_step80108/layer_37-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41b7845175c6f59aa76c41ae88131a45fb23fe0a --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af66744e4b7a227b27eb603049513323aee501d50b70f1f45fac3cdd48b2abf3 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_38-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c91b49d36dd3b52f288f24a396232414031df14 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1da1fd51842912b180ea4351eec460684f8d971cda4fabb57fb8cfdd03be17ff +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_38-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d6e5e3ec14b0970e3e0d2d8b2985574bed5fdad --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bfede779db51b117bd9370377eb2b228f774fb53772ccbab78037ef29063395 +size 113308931 diff --git a/4b284b28bc4/global_step80108/layer_40-model_00-model_states.pt b/4b284b28bc4/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7940e5642b9027eadfee2027dac848a9a68010d2 --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b916ede3ecd26b6f70f6ee285841b9a149e40dcbf608c0b4c887125d3eaaf8 +size 13507 diff --git a/4b284b28bc4/global_step80108/layer_40-model_01-model_states.pt b/4b284b28bc4/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca3351e8a9ad377d925b6be2ce4d54b7c4cef0ad --- /dev/null +++ b/4b284b28bc4/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe467707eb8b4323e7f5dbd9ccd34b2d73738975e651d241d64f5a4aaf609b0 +size 13507 diff --git a/4b284b28bc4/global_step80108/mp_rank_00_model_states.pt b/4b284b28bc4/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..853a0efe390d93bf0de1dc4ae9838b8f8d1e450f --- /dev/null +++ b/4b284b28bc4/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:582c70ada663ffc24b49f0bae1b0368bb64655cd69d6e0dfb204bb3353d5b81f +size 51443 diff --git a/4b284b28bc4/global_step80108/mp_rank_01_model_states.pt b/4b284b28bc4/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02080a1b9e30ac6b543c465ac72a81aa7d772522 --- /dev/null +++ b/4b284b28bc4/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54910f06eddd9ceb63c7ce9a408a2311c0ffc05e966cbfc32492e3da1777d2e7 +size 51443 diff --git a/4b284b28bc4/transformers/config.json b/4b284b28bc4/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null 
+++ b/4b284b28bc4/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b28bc4/transformers/pytorch_model.bin b/4b284b28bc4/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..af1d76d3a7da32b2e9a8d6565260fdea34d58df1 --- /dev/null +++ b/4b284b28bc4/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c78328df91eeb964c8b1df17cd23b06a2d7d74bb0a5592c1249de64b5522e2 +size 8781203669 diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3f6f965b84a31cdadc6157908c44812014f2b45c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4273206525263921, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05364575256139351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07516332695488044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017494053619516534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3000002487080154, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004693559983294075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11230037179769856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021352190368007454}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03590952594711019, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00111479086151588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14620648751654972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003234756290211622}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05344453588119793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013515518560690634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07168459834878929, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015668817269601622}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.29090649446634975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004555120095447256}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10771019147913463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019671591552995436}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07152558346509422, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001609325107400611}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.28799432197952096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004451937577964067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10717155022915739, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019927955387855968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..812b2d20308def780dc300f73653b8b1d74ba071 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5663724921835591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03793270967595185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08217512985928499, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015069260165856669}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3920117360421087, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005374085547490371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12756709737220784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002030468381881967}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03886435424085205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009357648884166175}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1952944224324245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003818475633981706}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0604895960614538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013006193696080512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07729829270693146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013303444161253287}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3723170212800364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005024550440352146}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12035361435811785, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00181560137606144}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07779518642109406, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014090714637609509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3702218835789373, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004905029541255394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1206781467869011, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018904175388126346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fbc8697256c2858a1ba2d53f7cbdcc464e85f56e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6313130510239234, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02727704631142144}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08219107059651697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013422258523153536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4151352004015792, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005332508949374494}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12940251519432414, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018563230360622849}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03845495568474405, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008134611533833349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20877882384960775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003907086651198888}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06086364336249341, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011655118907789416}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07626492600480415, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011790764283155961}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.38715675983971815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004836346504741043}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12014889628673367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016294871735385334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07790995067901116, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012592690527861467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3934974316637387, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004930425413445172}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12261763705509457, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017345722248776798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d4f14e4a85199fee98366bf7e22e35aa24f277d2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6636681020720647, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03135011211987113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08335165133186997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013566594758271408}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4363611958728497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005466144892220358}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13179598729950448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018608653117238654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.038821951607757095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008309945118711307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21925719264824975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003975211126463013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06172653863702163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011849569250187196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07607458853736868, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011617831771301823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3989579329521947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004797365715247612}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1203852422828686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015949578691764172}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07866244698502353, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012631336576740743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.41007264952780736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004984839846868383}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12430003630857168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017268968711559307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fd79574d668d155ba38e09d7094a1cf008cdb06a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7445914925255956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04471373508927592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08335134599012016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013601845542222193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4369343435538318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00545111390697167}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13191715691415712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001848451971487058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03875848226958462, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008040225568103941}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.22226421540491542, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004032909658463521}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.061883789388597316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011527436830992247}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0754476375575414, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001174153361661276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3964743307100715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004828941243739242}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11940663728897792, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015860574558582763}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07858636401791905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012777097204336968}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.41038517941804836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005000143163857465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12423777278329896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017238674224559544}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..22ae818a83dbbc352040c5c0d8ac8d2aff4beb6c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.8005310739494581, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.033724293174082695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08338013300594908, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001260130864731573}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.44613389749037924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0053462691621041685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13259315662099147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017356987872365276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03871058510533021, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007593773025926396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.22607554732619956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004027973657557073}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06197974009288303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001091686612001906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07523239671078998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011063220739786807}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.40282994585844645, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046937441735825924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11956303513601427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001512266881380029}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07851137617969797, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001188072848467214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.41817890606333163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004881701441444398}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12469867862475097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016220089487202947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7ee83ec6d79e7c2f4211e2558288200720dafd9e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.17179568666388068, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027588098305790145}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.14967776591519644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023741585126890443}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.13734703076547916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020799156371082523}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.012844648594530184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000953695060736493}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.01366618248229483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014161977749096548}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.011184314741657642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009686746779174165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.14267159823314632, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021803516732606496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.13040116723987202, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020787261762492937}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.11556695658268057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001653530715898047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.14166916768123394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002522392923923697}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.11682503514042476, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002049865434311089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.11016382456596313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018129599146147113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.33659811457161004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07179334370346768}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9e01149eaf04780206f5c145dcf3ed1638d6f6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.2979124156767924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0054955941631360024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4891593464751427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.007011942792981163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.3274178122458887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0049286280426292985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.14521802379162835, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0037085106298636638}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2447744707424197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004896941862479997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.15888598086244612, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003356976364238643}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.2295708148490674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0046150064407066435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.38128693273071246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005688750994881648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.24956913127077388, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003890488336637862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.26398378931320715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004954506205875008}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43612713385035845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.006362125054891411}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.29017150876669406, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004434001029770126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 4.108172938460409, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15815575820636524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb5d5d0a906eb24bf7c93a717001b05959ccb2e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.4187592773209161, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005626717003828703}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5928704577888301, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005099101322054699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.43240476392568056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003817266189092974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.23096493796741716, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004461915637136568}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.33273997806822503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0045935501380386535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2342525950120652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003402274339943465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.3309444597203636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0051028624371683194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.47140835027491623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004716481756787479}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3391427202561202, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003536978206670861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.37021425910295114, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005128361770056663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.529932604416665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004898567623492216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3827941384172546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035523699230254864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 5.777777933818976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14012798168135124}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..efbda5b98325ad134e10e9f52c454794e185cd65 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.39649918511817167, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005575303775752109}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5868851616945063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004874648360119763}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.41628159920481106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003759423443598214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.21990121411366634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0043685198653270335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3314774836213164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004469810663401372}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2270035578776477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003326641885288619}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.31489793141614947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00499101931869006}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.46789790447333446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004497579692650062}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.32821478779416957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0034635730509386077}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.3510123362567013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005090798191984392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.5229838686288929, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004678146069876245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3686101872995796, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035306120812830873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 5.50955888983536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19407341974434633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ba081e32227b8241b8e36f8322d22f487060e2c4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.35672902435293663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005099748319399167}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5855419342513171, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047751903798185485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.39437568045135746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036213014676998953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.19346061280634166, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0038984749084759063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.32689522978214663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004424054434433312}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.21128493584064525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003094624637133662}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.2814695831840839, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004516835082055319}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.46812215627206843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044823471544487535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.30988671994943884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003261405607838547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.3168249874123057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004673199828417057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.5242590494964836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004606375146707341}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.35042208286660137, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033912455978920217}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 5.135430012754724, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10515298831784857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4a8fc04f8e3c618c4757398da93ce958f60ce038 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.3346764742386238, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004843721993493443}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5735003817601863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004776228919481996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.3764579585424841, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035986469316141526}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.17835412437343545, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0035647689800114026}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.32281685329216003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004493079326800901}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.20065053485376905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002995889807999223}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.264408350747097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004225202781211489}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4629069290187871, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004517741150892403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.29763435336229693, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00319184324141869}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.297705821504551, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00441559661770478}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.5151776462027118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004625313103403918}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3356555748821186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003377968614381854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 4.852629241722782, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14804180093942532}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eb426c8c7c27e902012d3007207cd17f823d4191 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.15369360248720013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01910359117683677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.030770028240182067, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009226075888278027}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.230051193857904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030536564547568117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.050674302559418065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011850389149770726}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.005257320096320131, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005237467608889611}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.031456692666470484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019898714460596075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.008207247449935154, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007072767342602839}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.02948342463609005, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007987803660190042}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.22664687188671498, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028881333653311973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.049014322701212684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010406088468448477}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.022110816162663957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008133150658395728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.16487262586384596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027329966443987454}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.0361766232091386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001070618326400763}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7c713e70debc5a783c3b0f8861885c5d7907315f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.2681634787853089, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04998595810874735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.09550865383429513, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019686712411162346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5542119160082642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004657384009324578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.15190412398303874, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002523824368707439}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.042165060237563064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00115469645580311}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.25868024994419325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004235643426537448}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.06685938813989681, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015427103067306345}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.07997084816197671, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014665506771249248}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.49629192822486257, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004293241239132906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.12871999313910717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018578488402400592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.08397023684370912, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018152970797848846}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.48869658983291286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004444469008391446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.1333199688923633, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002328093617925018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ea9fa1047f2eb7a0b3b8bb7c0a7d12812ab6c430 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.8205397209702776, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0673137206688154}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.1291177089470445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027994673830584586}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5858999609001735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004575569928066966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.1894275171118333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002566906047548628}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.06520674856202793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017537810536778107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.319077314103273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004289568372222579}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.09546217670504362, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017012398458140122}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.10404670276255262, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002280876939330266}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4958646402293731, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004496077189792039}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15348751527744423, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002000176462271699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.1158438971486494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025491115811417464}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5272294008125672, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00439142331254594}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.16985387548623407, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023697702811214943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..65ee778f53a42233d9dec4b19eb8f33656a05fa9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.8545076876817848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07724809572451992}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.13600857468800703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032198416630618205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5594347319631547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004736205368048093}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.19184315592400922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026570113202673156}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0723944116238141, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0022142669018531003}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.3143589440307445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004352566272275511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.10072973751222168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019070871002960905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.11104879023945975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027919267110680514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.468994364527055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004573707755828136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15619428997394516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022182380723355204}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.12268948388502472, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029619856495348173}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5051081506672119, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00451999806892129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.17258810787266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024527387178268248}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b5ed749677c7e3c018f6cdc86672ba45abf424f9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.8462916594144025, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05355432178643776}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.1278346302045783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002830129642681041}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5523726285950913, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004673512208644458}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.1860265502144425, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002448379226213876}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.06637198877707197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018025537365178354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.3123595828710712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004281424928475089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.09678025171714386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001657069001811618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.10317713408980647, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002276858012143727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.46461791835294375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004513282830193284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15098496922208457, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019252783341197905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.11465363747249216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002512595497125225}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.5001173935227217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004512311417204082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.16714030208262487, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00222548922751469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8a6a35599dc3dda19254c2fe16f2c01591ae4f27 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.8681612789604296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06823634030163615}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.12245978780125266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002687499326593076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5431558597151445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004799564920689805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.180067960659294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002349653336178239}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.06531606229983922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019418038810328894}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.3098803783037246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004421579698743007}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.09493898136075185, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016863474758624926}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.09968277865275298, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022756173885421344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.45735672460251464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004628702328621956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.14667555612439104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018835128549316987}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.11042759700784825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024350506323870297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.49324760825101704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0046636387400625905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.16241611578249593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021493351407070567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8490513a93e45412d8035815eaa866c9f3598b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.041187412322809, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015212733720666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.2722316696175966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0046313887753788}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.06697983816620594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002031025478858438}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.013070747205979933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009299883930244878}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.0784073545551269, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034778014934171812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.020385877678818807, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012767680070777197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.037903415422310224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012109396727457977}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.26240382375978827, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004287520636463648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.062415738929841806, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001651359195900617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.034753268390260716, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012974419802824465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.2358071803161779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004143971381818325}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.056710032049994176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017539781237973047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.4008552013254772, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05150329890936936}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..008a4149cdf9c1448f61726519b81b510a48ae1d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.15982099926698054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023693100913810337}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6818447127780973, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039531831866198575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.24260640014350202, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027768727339828343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.06975853570163917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014283398957334861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.32360264018596285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004059465670996164}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.10697454965926212, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018050822143224363}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1216236757997378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017202747181496215}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5577387527759773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004272666872975094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.18681341391416414, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001991917246485152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.13510402818298697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002022793072384455}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5884796713341609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038365814028463647}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.2056812857329856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023946587191271823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 1.9140039012454733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0890847222319399}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a26cf602f3a62bb4cd497aaf9ccbf024e8ccb0e3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.14341756746441994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021572189793251386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6692546744897384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042151038244085}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.22138147276109751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027276614342971843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.06513694044684888, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012285955010832561}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.340485606672825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004440352246050117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.10198081937332562, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016820967709998602}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.10787155464448445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014894882606878578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5414697804415199, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004443875807815742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1686362992157118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019210388579455546}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.12338580056305985, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018753197047131456}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5852144480947122, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004054572409003386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.19080393600333767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002379667991105986}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.0435403008123805, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05599898561038878}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..781168d977b4be882666c31706b357bbb2591669 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.14272679805162844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002164859751576681}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6487556417334954, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004293557109036889}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2190831997552054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027341558417022524}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.0668120918836395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012982874018482465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3372622132047622, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004484332483533757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.10398704909605484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017830520112251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.10846308776944723, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015579592414390658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5276090858620072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004477426871140599}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1684031326291484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020124658427430444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.12406050272782151, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001899638347295273}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5733012952773476, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004139080844482678}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.19074419384356142, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002403668787230532}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.1672251632248507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.085436950676859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad0d8ab24c5903671ae14bee9b880f1cb78d73c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.14145240880709448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021670886996082153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6252122931764315, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004362943212513885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.21502708033318635, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026834110246252534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.06554379446551205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012763015030346867}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3230079259138109, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004400185110309783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.10086843076345688, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017128031184130191}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.10695926781407243, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015582110755479384}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5039167223908304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004437183394069731}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.1644008070500405, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001977392152542587}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.12299929966219343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018989567005332623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5541344307335275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004167377796893304}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.18738101319708167, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002358418447535664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.11735852420537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05771649184585253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0682be4a81511e922c8d705be3f2e97b920ff5aa --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.13156331057584722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00203140416680327}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6063200915125521, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004442245861735415}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.20234254736965623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002550195794881746}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.060544387595184664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012069679093256386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3113139243596638, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004491647148840445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.09425690323781327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001647787208632052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1005384257823588, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014914329572150095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4915833625433358, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004427689382576967}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.15605859797727822, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019039344200042776}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.11521013016106826, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017980094224657457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.539127179437604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00423506881193511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.17736851108434767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002250984265206207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.0204780863218708, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06557727097019353}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47d1dad75f569ff961b399320fb49139554ba300 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.01704208739894692, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00038134288013957874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.11377181108508606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001221955325919493}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.028025138472397872, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00044637295791461953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 2.243966255235454e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 2.24396625523544e-06}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 1.3383655879440027e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 1.3383655879440093e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 3.843511432044316e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 3.84351143204421e-06}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.01704208739894692, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00038134288013957874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.11376898775808511, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012220146777960862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.028021943208707357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004460769556010694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.011845314198379468, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0003148449817594768}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.08190788638445848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007545756315167773}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.019321658838726875, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00028489753193219213}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.0009597827582021916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.787222007252637e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..143c2338590c610a36622d61349290497e287437 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.12338574759046983, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002589887344409682}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6456549100912315, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00481962570332347}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.1889848661578333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026135009154337114}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.055329645026635225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015917418388580041}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.313132985326238, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004251686217898649}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.08423671425300502, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016535014543450363}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.0994133930281887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020701300317446116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5475096620248264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044971516232100136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.15317825327978427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019859291190526406}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.10855678843630082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023039457528804285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5770399349468458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0046936823206671025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.16658141546860505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023468522672181577}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.6165943378598948, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06490921111069572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7bdb9dc6f5db3ac8a0d3a0ecc34160d36dfa527f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.1436735682332211, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026579719365293812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.7218682078589462, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004052459422366133}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.2217914175746633, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025405101764270433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.06775722832824736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016552029301074905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.37598071720158144, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00439962213590862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.10461100159118264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016599147829125997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.10707337342923572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002042276382879633}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5690619918916721, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004240683333110691}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.16606908225684747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018185139704651083}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.12837715690106088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022626789530070107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6569143874299237, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004022385130191463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.19923289796510618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002225786554401813}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.321779562659134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09538695554747774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..43e9052f77a0ab490b662493bcf01c7e5ad2c0eb --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.12704646175613926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014965307460908653}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.7128153505808225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003930554377011592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.20833619042301474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002066573477323779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.05894052317262764, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000893949418071198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3735214163364516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004317456593719202}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0979814053469226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013226281454679096}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.09418094825751869, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009581252383436105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5641669222725268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004154206174788438}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.15603166938331334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013588402692121038}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.11490360635669425, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013800805146195233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.651781997216155, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003952076475725171}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.18866870584363496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001917270059093571}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.36020603763585, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08996406440773841}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f29f74bce78796f9304ce62edf2b40ec3534ca1a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.12364000397771846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014761152355166514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.7008467587934072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004059296791116624}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.2028260190998294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002051576187767249}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.057702782901645155, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008602811108256085}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.37151697900109665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004354887109733919}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.09599216387839389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012765491377424094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.09198221409882565, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009571046005731393}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5557672590175016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004224535855348862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.15237152706065096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013583120131860718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.1119488799103195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013559824797201349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6409089457417603, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004037392997113196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.18385704643499215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018931085048969525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.2779805846158383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07170204680218034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e0e9ed1fe36a24b5414daffe72f52f95533cc89b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.12201036458298717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014806840791434343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6991506268276161, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00406189988321713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.20011806334917795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020113768045419067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.05693913792040207, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008440989695321365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.37256288020219525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004406895757444443}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0949140921236195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012601273301899127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.09037011023468404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009499539634538327}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5528331022129467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004235035612235216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.1497544344956808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00131856546208141}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.11036350095750773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013509318154878004}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6389934623997965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004055163370934182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.1812738472805107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018629632662416327}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.3867607776529205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07410446411824859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..86347eb4f0c473ef53f48e10f17c81f4afd14067 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.16426814795441422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001963248697761681}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2844248414233626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027186243949292458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.19362563894365822, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019089814975787524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03345718350937418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007942980850551589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06070606915521197, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015158618040175397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0397636720249358, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008958485875661834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.11667958865167334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001269157880124843}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2109918938185312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002085058620844339}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1392443163337954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012458799774916409}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.15221459831627412, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018171164600694356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2643817230667675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025380641829612694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1795453015126057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001766867622602647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.8179420568974458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07211632534304736}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..039a16c06db2eb33046f2e4b06b3773904f8473d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1640591875496648, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002043851896737318}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.27910878442257525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002898171924167552}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.19183391025210358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002001582947927051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.036188393072658635, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008561684568777711}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06424101549232061, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015449018680878575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.042828281320032364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0009516028511268984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12217744504962826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014008167708105845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.215360249932899, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002288377671801132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14451746994762324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014033068919906031}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.15179552758825512, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001877771950198993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.259675418847687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027198844155353788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1777996809470896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001845289272394589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.0675085561659943, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05768833984906303}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 
16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d49893658ebcbdecaa2694adf61bd48446a60fed --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1797105018003507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002084859264572737}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.3033184413058786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002905580486984753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.2094263617487459, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001995618924716392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.04358087823681392, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009101606204514657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07652593936670125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016877876980321972}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.051091351936466585, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010020514723555068}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1327550835742571, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article 
in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014273687313135473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2317387220648072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023325790748031853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.15636202509062477, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001396569835369375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16605597026357832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001925105257047686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2820819142889093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002744826890973276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19393075382172492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018522616086280982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.5483385377460994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09290482394504518}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..d7f4b2f2178415f67b1106acfa11a0bdc9dd1616 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.15943864465828597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002433310645340882}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2568625432098374, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003461904472445565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1781210798922743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002324914677271187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03958355651727602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010187263362724825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06658917213497038, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016962168841314238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04482124580351551, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010283219957219916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.11796879235762016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017626613778932401}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.19527007389438916, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002725370936396766}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.13242278894615536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016618685155073802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.14827800010457695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022750345989955767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.23948635929233136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003251371946293944}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16564510211599384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021627154072008667}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.8242304451124287, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11538371340257694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57a1f4ab013b11b3cddd69184123585ff2aef12b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 
0.053042765456545536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019869824388181845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.08545779450760739, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003005122083966226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.057482791279950166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019817479295706616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.012522243430672486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006528832775101719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.02241566754614324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012219610814187573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.014133038738676246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000694610933915733}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.04003813076428063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014800060863990802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.06635053217761794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023888257381984475}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.043580733989129754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014829182403567982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.048851682561389824, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001836524393749679}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.07862148294811239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002785921294419508}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05286546828976805, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018276666697922153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.5910509935292934, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04934236043571074}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7efbeeab2f61d36575df35254a2bdeab6ff48478 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.008816748761467577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge1_precision_stderr": 0.001006265857562293}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.012997624430774934, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001263899892629013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.008704060149453078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008478567145710981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.0017435776785760806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00023533716916175653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.00314320442872562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00042253899620955503}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.002017912016348425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00025853872404257736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.006962484807564813, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000844699745065237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.010341885117026819, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010188524693231964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.006723293959394975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": 
"Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006442903081428328}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.008338960824215367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009673737935465667}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.012211796140355531, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001190300176358689}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.008149504005547098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007911718409998427}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 4.297671653759199e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 7.785583349989124e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1bd86126c3825f54d1a184113da6b1a6bdba0424 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.08776029171301661, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014617422175453763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.14351167622490646, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021269699450294326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.10088674481147626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014869970606006355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.01087054186081413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004430740620268589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.018812282252899392, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000824752401096914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012721191178839145, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005085457582114725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.07683211106766431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011750417491257104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.12945760989081237, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001895510928096148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.08932496723516803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012301685164588575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.08173215113769984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013420824668087022}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.13446123935161725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019889224163928713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.09415637820011638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013706067956967248}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.5431527952918177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.032104868441593624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cae482087cebe2e726ce245af1dcef709a058a24 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.12905529214461575, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001979491067372841}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.20839846358975436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002894759247759541}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.14631930516448613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019879191160377323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.02375449052867389, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007917088515716603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.04007317722615956, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013050110520617926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.02711335795670395, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000808976754077905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.09534534429433016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001414874985280666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.15863360428057022, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002229164497978125}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.10882491551274338, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013993629759518366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.12009612909743919, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018400178492989732}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.19446607740504704, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027009360475646477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.13618798520877723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018366248198839138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.5569271440681214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06403146716065812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd5e9e2742042f1e5bc00401e19d0d96f4e81086 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.15985727072408007, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021342704211066905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.26171420796191186, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030128510679455042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.18306178424233208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020567981461233308}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03897138221781143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009589153266609357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.06532718295828605, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015904932736581785}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.04454347983688272, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000997348114141261}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.12182921478329327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015428600740151884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.20506194499128635, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002398145917032908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.14055026158657788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001469411947880256}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.14876297631779306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019916419469703283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.24429220231662616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028317427638311745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.17048750507518073, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019155865863537292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.588542343303483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07430586558531979}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f471c52b7fb79401ac78d3dce9654ff2d275a6b4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.1414603636700371, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023891742509933684}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.21860881858220305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033065287240296643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15392307578099246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022338320016352573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.03540228251986772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001012362770647821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05680577059138669, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015537958394163272}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03895725296191616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009703362893100186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.10910133183002717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018029976378664805}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1734226719612005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026958607966266557}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1193487893330064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016609278652029178}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.1316568373218448, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022375099093730815}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.20372540420257393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00310337674849783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14319992212601101, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002078026313828863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.731309122566432, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12856901986784722}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ffbd4b7906d3f74094086cd72ae68bf6bf25be21 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.04529026039810705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018683347017330464}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.06985563534380705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026707350721197195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.047383213342466105, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017667540426735513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.010911973549852157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000610311148352081}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.018743643702495896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001066843466304922}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012100846520590037, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006347499136665145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.03563453127314495, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00147733951442123}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.05588201065370463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002163744865002187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.03713795436852138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013568378761987527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.041985085768643404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017408930957524085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.06471195868753603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002482277880366169}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04386313053982596, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00163848796865913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.4824783196971761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04463252271552567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9ecc166abe8c3add948b531d505a1d0e351c8f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.0070764690349056645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008075324699720278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.010036534764679986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010720722462779057}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.006986757193006964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007246985354434017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.001854426650392226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00029520732928741403}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0026116976110932373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003696132049834325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0018127250079443033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000246838125245964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.00560973325227454, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006578100417105013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.007965123538597381, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008557620706126542}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.0054709515272040035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005631353830893648}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.006686937698828636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007687710652974154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.00941357446197627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010074978588991074}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.006566961010689614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006818902290682459}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.1023797520667392e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.3615613872249627e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a317ab15869a59e7f3d140c8c0d2a140981da80 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.13940638469106723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020766596739565735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.22027868660038527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024555989238055657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1545325646812761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017491504584820998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.02295354423344447, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008596166860623005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.03516541724082877, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011192218045994813}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.024434310470910135, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007015086906141807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.1093757017485915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015570564427566638}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.1788578203459584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019850983591823343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12236419999758073, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012480669383126359}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.12869616248408597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019249850326154927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.20402125079021746, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002281736333276174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.14268473186817657, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016026623127596423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.9508280856877127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03110425637842823}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b65cd3fa600a65e0f056d781b2572921730fca4c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1443105260666999, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001976066139534122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.24132371040856623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002737769238603288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1670563070726708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019178617354935641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.027398570547821213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000827158978199139}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.04638771664106042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014224849717481463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03141526988382421, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008646679643430015}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10951134631545736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013991751146943878}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.18961514474322305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021768119549567286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12809102133791261, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013663009524185852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.13436615559009527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00183327155604958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.22552793868897747, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00256618921407864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1556006928965094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017716161324128888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.7665424794124964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08195279766239584}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cadfb9cb15b4615ef6524c5d8c67300893f0f314 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.18040456283641296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023668115610486376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2870648086297508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027667857701435265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.201225172676659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019811525577731063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.044638474881729014, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001221684957935622}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.06999975662618645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016116340703400012}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.048234719869698274, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010310275655512609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.13940596176892595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017847331831893683}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.22920159148286207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002280659464388438}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1566809756684086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001423439082962154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.16770892114701086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022385377077080216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.26750883501904393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002616055257421549}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.18694734716033373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018516981957565392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.803596179721391, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06743147958318904}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2b2f635e47d8df6f78b4a64fedd7df3cdd77ded6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.16137937728841734, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002729028979629109}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2387837109438451, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033294587186186273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1703204755658047, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023437253279040265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04090526043528262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001235838484277452}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.06101732506395295, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00160920272003809}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04242364396606276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010396516497661902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.12349766223247734, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002064619072302476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.188129807231652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026955200484454715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1308133731843072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001724287665682434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.1502826430324195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002566907720178576}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.2228164439816456, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003150083408193586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.158427617468449, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021925425639389923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 3.062227755689224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10516467661969281}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cb2dbdb4b1d8b90ab397659bd66ffe463be45e3c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.04779731174149245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020548178719198367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.07123791494122411, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00272031115150723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.048444404337984054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018220046283176402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.012493623204348362, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009264858341368517}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.01866679464180306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001097487245025396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.012171454699532493, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006624651441077691}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.03785063497182873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016606407010599609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.05742149931165803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022331279191766247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.038101293212841725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014065239228148929}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.04434190566928883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019234474448849314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.06599904255640211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025261730166920423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.044795627250137825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016806124326569428}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.416385920099641, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.040112901840196856}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..03dee0e4fc1475e892a7cc1ab9117009d30d121a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.005887065759378087, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007186584994339659}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.009025529640709563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010109891028141976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006175541531128815, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006856911987577207}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0013791403401351644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00025334887590687766}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0019778725350326183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003002459206151824}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0013784838984500552, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021928864199571185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.004586578522696645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005392063119391484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.00746110009096202, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008469892635174075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.004937632375856857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005388614915141283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.0055058688026879445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006742675837988185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.008392335211224188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000940745680937788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.005738594434124072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006340338087020388}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.5592181183403974e-09, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.9017186369415516e-09}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8f0f98258dc7704a5637ca2de542d3365e2e22 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.14692825480864516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018425931958625118}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2512970070836738, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002648136861575683}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.172029086258773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018269882294786313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.027966624066931015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007198127679157396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.050572163826476904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00137135380680017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03327297097578151, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008291280040639999}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11468189898145331, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012927912945671872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20361426191617446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002161893564032071}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13597766469494962, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013133399966305693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1348221046642956, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016706708007563362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23199038101436165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024449378448124903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15820053127675784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016635461401018714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.414984862410896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05940234677879655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..69dee377c311f95ab19df8b862ce05130f16dfe9 --- /dev/null +++ 
b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.20096070137159358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024183404012715405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.31553844384377067, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002899492197080727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2221915237870407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019942889407287214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05245104495325915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012835508781176563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0826631709797398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017188300283128646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.056766090400891124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010805710237233974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14393356016704015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001799791687854001}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.23074840768589486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022312022411283892}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 
0.15912396957362257, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013601645244304275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18819591770649413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002281534419296507}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2963346702924994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027562668344971878}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20821150035815492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018771242477053024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.836407401710481, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04043755617370114}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..542845a8cbe7f7767094abbd06e98ced83b08977 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.20680907589782577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002488502986523018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.30989444650796616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002753786143548591}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22254451379263313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001915395486050023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05499744970071221, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001313828428933835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08212190745009494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016893036964833342}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.057748452491246806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001071605656207478}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14843146397136125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018718282405222096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.22714968032490238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022095657686829933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15964814622161871, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013347717397906025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.19471899528471098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.002355009451192118}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.292336561157753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026262120649372398}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20960062484582434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018046035863626521}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0672692499706633, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049106354646547744}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..75953c555e253234f8cc0314aeb107eaebe31fe5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17474019948410954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027474429834987804}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25521371457329683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033152648455079072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1820115847047415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002231556986150065}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.045362687902130126, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012564421439368422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06804888392817302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016992089106080323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04689779702656875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001034952296288115}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1265226291981258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002108936479136444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18709591094229805, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002591170589040495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13086000378258256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015951624748139174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1650854923795236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026195222440337307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24111486429370707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003150424029915772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17171382413552935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR 
in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021029154857551075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0771001660724235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10166627860233955}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fa639eeabd590fef7815f991df8a0cc621ada6a1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05686654081896349, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002233440497229162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08353923265042008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029077074042461065}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05660115935642753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019128121580337133}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.012648534117560613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000714480304383431}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02199374005627044, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012054249033507543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rouge2_fmeasure": 0.013741746630537094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006742800088300104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04243723633575883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017020140807939923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0631413785577323, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002247650978929538}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04194122086488744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014016615434499398}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05323619772247561, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020619540523883185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07914684184702428, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027725917227984414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05334942672891278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018012620317959791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.6074700254086013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05423743996990111}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6bfee5adbae278b70ea9ce798da1daa61391902c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.008781294870051274, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009050499890841822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.013677582922976033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013193791852402054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009252067005496314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008857404763954463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002216585419720102, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003047584752236144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0035060227812251585, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004475043348074056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0023512305693387013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002931024418143357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.00679769925058833, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000718459101307871}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.010367182111106583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010118399277303067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007007677157867705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006708462523675694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0082009891177148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008441371445186326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.012916195361676434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012503160451136067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008671543071872297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008282868408056053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.0033020706801269e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.9077024795694913e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..383296217918729d927f06dc4e96ec159664a8ea --- /dev/null +++ 
b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.08966310598899173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016283594159426443}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.14086823338303966, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002174573651185083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1005296177122325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015389563787558595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.009747347404058394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00042465671299048196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.01702445557173677, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008431303828342194}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.011216041205494898, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00047851956788785876}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08266300329867055, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001411373389832569}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.13215740335454515, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019780686878669556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09330112527692144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001337771293407071}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.08265536975136227, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014906774813849123}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.13084094141254124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002017106703749903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.09281985594753377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013961291968416943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.6124556246903006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.038254977576635665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..546becaa3b5f10cb39913ce73dd5177e574a5928 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.12619327348032797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017890629958271637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.20045647375518888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026946167754429164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.14247720198956396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018061896391544346}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.018133516802832303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000645611933600182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.031827347668963016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012365216961313032}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.021015379090322476, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007116134026935783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09050018965284994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011814640560731773}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.14784088101449194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001990541308747493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10275833507189983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011818702954760711}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11860200033066597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001670916163668385}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1888009611952507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002505695512304149}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.13395201808210386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016761350624605734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0361019783677474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06745079097957657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b64deb53337866f2e34fdfb9e98b1050db029f78 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.10326783555477309, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001845941711723819}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.15774264721944228, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028918138893316418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11404319152029233, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019424688335624655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.015686648273678768, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006566075701044747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.02846742132686831, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012449112557856106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.01856227196277119, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000747759698991048}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08074967691674281, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012838590605038145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.1238571271772768, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002138262266070014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08879104640279128, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013350757685037958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09519149494947343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017162671701933705}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1465542733502144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002715845708109963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10533375181568398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018091025815919538}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0062083415777778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04711946186027571}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6be8a6755d053e1635c2a0c17bc39e6b94272d0c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.07857613907066331, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018325767254077753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11231507755062209, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002709125328853898}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0826320712636738, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018381499864311686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.011896740680175497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000601828187026642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.02057079275702888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011498890895523576}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.013461430147774255, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006594173038785609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.06274112802109241, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013639990133955174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0899022034344876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020795974250259583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.065544556988537, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013299047040813018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.07257907609294534, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017086622186232854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.10409821801585775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002535862979074873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.07624199938761006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017069202628844525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0058220356103125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.048299933628742284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0a2b007684af1695ff84b2d9119d239dc70289b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.019168420675696286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010795160831791957}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.02879908909496583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0016458129256000573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.02010596120483132, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010917797901789433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0028556430231600335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000302994114231941}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.005469741300282359, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000651898645432401}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.003266077788148105, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003273226353783014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.015404523683234762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008241756698203975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.023150586278677637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012827505595113496}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.015989909215756735, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008105738923510461}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.01763922638803302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010041223908314812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.026296995107134465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015071738271990678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.01839170907812179, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001003818069703428}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.07242667655921821, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008932967771763006}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..731def2a757680b5a525a956f7d39959ba4e53c0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.0015780841392649505, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00030145052547005737}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.0027069709166605916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0004523943421663273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.001794950456581436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00030728853559277316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.00016207296070587727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 6.383944014856069e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.00022581084966854161, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 7.606359517652543e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0001630519530024726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.716887646946743e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0012686454818619593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0002161999716520848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0023860171187140397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004079367273708954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0014936361251205509, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00023979795330628012}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0013993374329327448, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00026598516241118476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.002450141316005954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00042236431455605384}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0016051321840952642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00027664878910365937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 3.936862342904368e-14, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.4388487630942075e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64ee33ce7af65b3cc9474a0326d71515c6016f56 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.323, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348632}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b545b2ec563e3538c3c2734399c04429d525b204 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.324, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738857}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..acc94feb46ca45458c6d906ebd78134867fc073a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.346, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564438}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.349, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015080663991563098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dc50db46b164553949ab99eef391942c9bf2f939 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.348, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01507060460376841}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01500870618212173}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6acda64acb3643f4e1fce903910f021dcd7c6536 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..736570dd0cb6a71bdffa1b666a9cead85ee83982 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01468399195108797}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.312, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..84da2dfc3b48e9322ad97b8286c732871afedb3a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.34, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6ba90025af48102cf9a624a7dd8527240658557c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d39898b0b253e4ea1b9d24c256f03611adba4d8e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.359, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015177264224798596}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.362, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0152048409129195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dabb9af554237ea95b02d61a51f25c7d05b2999a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.356, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306625}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.348, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015070604603768408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1749029527e5ac595845b5f921e69d71068b43a1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.345, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055238}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.346, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564445}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3e9bf8e7005627af14a37b144f889d0af9fa9f74 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.353, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015120172605483697}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.352, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015110404505648664}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8a4fca12333cb74ab495105ae6e4e345496d933c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01500870618212173}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..692442ae4c35c4292fcdb0cb3a7f94762492cd0e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3531412a9a90490bf20bffc46046b50c37e2c4c6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402707}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7dcc3a2d5768173052de76f95290ec17fd0afbd3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.366, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015240612726405749}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ccf6504b5b737129122593976d1f1a199c3b8ead --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932572}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224473}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c33cf80cc03c86d3688072a33e9e5655732cd0b4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270333}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411242}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..74de284178e74933b9cea53e706469fef8257b5b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732968}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a62b715336cbb2bfe5f1aad1d9f70f14299415 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203933}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.339, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620342}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ff6e4a8547ef8da0d1d4671de6d8b532618e1c8d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203933}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d2415ad87ce449f8bb1a117fbded063f827936c2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732956}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203928}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eba86294765ada420cc0a02cbd6976667cf4ead6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.341, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402714}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.338, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224466}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59611d29c1d120ecbfcdb5b21fee8f7cf4bd05f8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.327, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411239}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732958}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d689f903d80319fdad5e4f5bc8dda97fd1f6751e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121731}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a73e065378f14fd240b10c54cc596f483a2798ec --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0bcd4bd31f2b4d29d259d7482f68a0a74e7a5177 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653593}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f0538535b398a85d36242762c66c3c95cd798b1c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.356, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306623}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b2b1bda955f6916b959c94a697fc1efd944a25e9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356953}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..20d9fcfe2512a24b710cbf20142a482007f8bd41 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541031}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01484221315341124}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e0f4fdc020068cfa639441184cb309b4810fca21 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411245}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.351, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a4bd619ffd00b54f09938e821319c3141fa38e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.309, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01461960097720649}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.313, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e753c7e2dd8d57750e5647d49d5b851850bc9447 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880215}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f75beac0f6d3945ce71add1ba5edaaf9816514e1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229873}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.328, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0ced8d57ef8a9ec3579c5e267f0f8e92c02fb7eb --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.323, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348628}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.3, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..067e8aaed63f822530f29829df8faefd0c242c36 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411244}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.316, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057121}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7e64b99fb8eec4b9405cafb71a6125e76a21f197 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.323, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..29a0c8bb9005da641f08788b8159472ded9fd4e8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1792e9fbcff94cb24a073eb771c47724f19e7c8b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.312, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014658474370509008}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.309, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206493}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..251f46ee249b392a003adc2a3d762822db4fe867 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014566646394664396}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014566646394664396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..99abd1f543ce1ec116bb2cca28c740ec3c6fe30d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.306, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014580006055436972}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.295, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014428554438445526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c49cc3c49a3930938221b6ebe278a4453eb2e013 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792498}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.314, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108796}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb402545673ba8daee25df3a27d3dd9df557cbd8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.348, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01507060460376841}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732965}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0ae3f50898a3d779e0f9b1d7c3a5db0c905564ca --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2371e522615e0d5f17f62c774a2324d5c907e650 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057139}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..947e78b3d984d829c72a7a156fd9732b29bf312d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574888}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b964bb665bcaf75aeecb66aaca1d787475412a3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.308, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014606483127342758}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5a1a10b52fc654841dfa343af857ac873cabe8a8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541038}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eeca51486bef068ad532ef693d20172d537aab2d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795018}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.338, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149659607102245}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..691ae69fa6bb43b6d2c27c12d78c3b6aa949d1fe --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.308, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01460648312734276}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792508}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa0157f4f9256282b06c086f1644a6b77eb10d60 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.324, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738864}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01488827258820392}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5cc268516358b6008c78ebceeb5ab11ca37efa1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01487687202745673}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.321, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d5507792bb7f4bbe1805c632080c4108c203c87 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.311, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01464559638572269}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.316, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..70784c7f8f40168fe9236506341dd382e623735b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.319, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473484}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01479492784334863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..163b8cab1aa6667c23bef37a9e0d6e8de8fdaf02 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363937}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811483}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a9d183690194a19b71e17b915d824c0150becde --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9eda580996cbd374569b7cd7e818133f7b503fc0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996693}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18a4069ad19d3dc83aa4309404b523daa0de8abd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01469663196079251}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014758652303574886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5712320748772d1e2f4a3301e7eceeb0418c6cde --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.306, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014580006055436969}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.305, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014566646394664378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f59abe3bfde3c0b2818fa1a8c5951bc548d7c743 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473477}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01465847437050901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f0042a52e2a1d06221ee2cdacba22a3a79af8c4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.35083333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013782212417178193}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3475, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013751753243291852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..07814f7366ebc0356c522b2f2798718b97d0b5d2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3441666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013720551062295756}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013696658778002515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..21c21e00c100625a85eba77b1b5073dd921acbd6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3275, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251951}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3125, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013386029277441229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5b159af8809bbfeb9b1846e7a3f8109fe42c5b49 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01364760294240639}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d264b3d2ec871cb4241b4a5d1fb35c7fe4679d94 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.30666666666666664, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013316642319070699}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3175, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013443538681348052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b4a81dd4e511a991cbd668cfd00ee0c54df51c0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013376268790982098}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.30416666666666664, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013286140243317441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4e3ddf61a9d811b2321f3e80f4b663cc3f933df6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013622434813136774}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3175, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01344353868134805}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4e2f3805f06ebbbe057cd6e56ec078ca7424e054 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..df24bd76d4c99d65355368dd45d47cfdb580cdd1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31916666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013462309712005136}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..44982cd46a7999e885deabac5ac8897b7170df8a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013570806258433626}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eb18995cb4edcb8d33cff9dba5fd10957e4045af --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3275, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251947}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..855063b63e86dc5680730fee0f40878870849fd7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.30833333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013336721143136464}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013490095282989521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d5b8209980ba24f2391dd7fa0bdc58a88339cd65 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710526}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c405ec346363eda56824d8f1c86b0c3d4ec9a08e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ff47af7651e2a9c416271575f603ab49e03010ed --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01347162092976915}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.305, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013296358936471105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..558c6fa275f466b7d674fd93dc8a0f1f97953496 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821474}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013471620929769149}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43131b680b473f546e8d834d85dba395c2816dac --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300215}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f729d3692931ea6cbed8819e04320d955f4bfb8b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013508372867300217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b3d700efc21624c393b65b1d49ea9d1e589e3f7c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3275, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251956}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01347162092976914}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c91c7ab1659c528d27d01563b19e21797f210193 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529019}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3333333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013613950010225606}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed38d9cf154d071bf13eb4c2453d8660b8b2ee55 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3125, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013386029277441229}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.30916666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013346684134591945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f3a39bbd481c07e176ac6f49296bc2fba07ed4dd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3258333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01353542204341745}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013579531277800925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..353bb1e4263efa48891559c310591c2ed535619d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31583333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013424568830356446}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31166666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013376268790982105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..064e4909fcc5f00ea8f227d470e6d897f3835e5c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31416666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0134053993149841}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3075, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013326707242912057}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..df90e5ddbb55bc173fab78894fa73de3bcd213e3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33916666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013672343491681819}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013613950010225608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..43e3f3301227878eaaadedff3643d7d1b40616bd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8cf5d6d1190d1621adcb5de29931f14ff4332b4d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251953}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.30833333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01333672114313647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4ab8ab827737f323c9de7825d177c6bf4bb562 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618266}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013553211167251951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4473fddcdeb48439ae3f99a389830a3029af505e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32166666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013490095282989521}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406398}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..72f7e87483b83740c79af196b7d4b088e418bc13 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01350837286730022}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01360541734571053}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..725cca840486007755e873470519cd6944b006ad --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012430399829260834}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012430399829260834}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b8fd35f21b3498d0c2948f9cd5e156d2497335 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0124457700280262}, 
{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0124457700280262}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4f5f26642102019ffc0eb90fc6bfd300a9f4ef21 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01263940711192644}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01263940711192644}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c7faa08d214e98f8e70d2c4b291c5d2d31636bc5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012476304127453954}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012476304127453954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..58ad00bb9a26ce9e4a56b00f35de30db1100d598 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25341296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012710896778378606}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25341296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012710896778378606}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..00313cd4be9bc88990717dd5918b824bcd569d81 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004759}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012399451855004759}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..510c34737a1ea97c101c9372d5a699894da946c0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042961}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.01317944244765389}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b787c0cd45a0b6b3a19b06beeb77ece12fc639de --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012928933196496354}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29692832764505117, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01335202597672522}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..99a1130f3ffdf2267a262837cad72ed453deea06 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012942030195136432}, {"task_name": "arc_easy", 
"prompt_name": "i_am_hesitating", "acc_norm": 0.2815699658703072, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013143376735009009}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..312ad11f532fb7d2773b5ed0fd0ab844220c4817 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313969}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012980954547659556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..21f564be8e930321632b88df415cbf01388d2b0e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 
0.2593856655290102, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012808273573927104}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82923d04a7cb2e056bcd8ffe081eb9c9c13e57d0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012794553754288675}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2738907849829352, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013032004972989505}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c86091e915ac7a4a86049b95f46df03cc7ec6f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24488054607508533, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012566273985131358}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012980954547659556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..520fafa5be984729876e6f4419ed7bec937be8d7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926433}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2696245733788396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct 
among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01296804068686916}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9977498da40f0dd61a2937fc09f31008690c66d1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24658703071672355, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01259572626879013}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012902554762313966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1face9e5bce53470bd707277348145ead5bd9aec --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one 
answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926437}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012942030195136428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9b15ba94bcf5a98b4fe0daa25794b8fe02e59288 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012696728980207704}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2790102389078498, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01310678488360134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_5.json 
b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d018fbfdc4ac5020d4bdf702910eb350a00d05b7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01266819862131543}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012862523175351333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7f2720435f6b857b90297173fdf15118af17ff0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23890784982935154, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012461071376316623}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23890784982935154, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.012461071376316623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9530bee6959feeab434da45481683c010509063b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01236822537850714}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01236822537850714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd1901708ad05d6c41a1a5924437276ff998058b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587087}, {"task_name": 
"arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012536554144587087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3303a9864514029fc3444516593351412262b1fe --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012521593295800116}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012521593295800116}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8040f514eea29e736657349fe6cd439a58265ed8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc": 0.2551194539249147, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012739038695202104}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2551194539249147, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012739038695202104}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4f38939bff7a51b3b66e7445e69c5294ee363be0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301842}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0891108ec435bea6953b211cf843c3c79ff79127 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2619453924914676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012849054826858114}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2832764505119454, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013167478735134576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4c5f7313ad0c3b9bc1df63373d822a84d6dd7090 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012942030195136425}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29436860068259385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013318528460539427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7357c244d73dff3ec0e23dab5e76118fa65476cf --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012780770562768405}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b95e6446749961c8c8d09095250a59db8e718201 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012780770562768409}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012993807727545777}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": 
"", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..06830dde0419c79fe1993147dc8579003e505913 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2619453924914676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012849054826858115}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2721843003412969, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013006600406423704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ebd7ac874a5d313514f5c37e19cc72b794a25238 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313967}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097217}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5064268915beba037e9c5fcf17e1d5dfaee28493 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24494949494949494, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00882458861121907}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24494949494949494, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00882458861121907}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b33acab70ca9c89433c7f0f9f34d436a749870e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23063973063973064, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008643708884504997}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23063973063973064, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": 
"d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008643708884504997}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ea403f81e3b0b88bccd8b5a19dbdca3907454bd5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.242003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008788455043255566}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.242003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008788455043255566}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12ea4b3bbde8c3318b082e004bbc3b30b1afa4f8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2478956228956229, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for 
letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008860162361464025}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2478956228956229, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008860162361464025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e0d8a03c2efc9983ab7c407ac720cc3193341a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884498458193489}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00884498458193489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5e239a97f80fe1a2a7f485c11dd6796db27c7101 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_heres_a_problem_5.json @@ -0,0 
+1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24957912457912457, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008880241465504344}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24957912457912457, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008880241465504344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..047eec0806cb192b10a2f7458ad4977084f2888f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.359006734006734, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009843424713072174}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3244949494949495, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009606970654515783}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eee0d4c3ff19a0ef31c5a9dbb8e930125af99dc2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3560606060606061, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009825454608416303}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.31607744107744107, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009540440071928285}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a40f2e4c0a8edb01248252c105b68532dc97347 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3480639730639731, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009774627600259012}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3143939393939394, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009526702423162909}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d8f10f508e66c68af113363860fcbe35257138b1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3404882154882155, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009723676813825867}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3106060606060606, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009495260551195607}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4d99303e5d7298ce58740a5167951a31616cffb6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3312289562289562, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": 
true, "comment": "", "acc_stderr": 0.00965764131135091}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30176767676767674, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009418994158522525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..483837258a78437182d6afd83c3fef0402c3653b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.32365319865319864, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009600478182273787}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3047138047138047, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009444871667360211}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6b69b20cad83a7e7512e0a649e61c94e48a3e7ca --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "multiple_choice", "acc": 0.2946127946127946, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009354224395837094}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2798821548821549, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009212077524656529}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4863433a0580d360e9b07ac5ac982300ce7dbfa --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3303872053872054, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00965143021642819}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3341750841750842, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009679106032919058}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e31ee6fd497508fa7e780ee603cd3fb6c0c37d37 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.35353535353535354, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00980972894815149}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3581649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009838331651451844}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..23d71a69f513c049419d2cad586b4bc4d145ed72 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.34553872053872053, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0097579487306703}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 
0.3531144781144781, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009807078935467613}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9c0b2a1898fb6373032cf53fa7e574afa807a2f5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3333333333333333, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00967301666813338}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3425925925925926, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009738105469984187}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2ca91dab90fe285066d18a775bd3927adea50e34 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.31776094276094274, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009554033064443064}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3383838383838384, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009709034670525096}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..843c994512a39933ed3754637a85f0f3bbf9b42e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008757032594354022}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008757032594354022}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bbd92b97351408161d04e59f988bbc76d642a44b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23526936026936027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008703724269718638}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23526936026936027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008703724269718638}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0edb7922c11a2fc5638a3fd403486dcff3aee908 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24074074074074073, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008772796145221903}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24074074074074073, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} 
|||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008772796145221903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d2688d7a963277ac8fdd4dea36c0c96d4f225875 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2474747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008855114414834709}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2474747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855114414834709}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e81ca904b9f9a43c88f669f95379bcc018f0bbb --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24579124579124578, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.008834809366391489}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24579124579124578, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008834809366391489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ab8601d2979afb3816fa1728474753a1936f484a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008914948991495718}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008914948991495718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..030687e59814124c27c393fc6197c42495dde143 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.36069023569023567, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009853512108416743}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3186026936026936, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009560775507673364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cc4c0a7e9ff0640f28466bff24407fad61202103 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3480639730639731, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009774627600259014}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3127104377104377, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009512819491443735}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cae347b3c94c0bc3480e35f5b5ca854d9f81e096 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.34553872053872053, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00975794873067031}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30765993265993263, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009470292575831181}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..23d63071fb5ef626e21be2204ed6c76b1f74ab02 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.33880471380471383, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009711980224301649}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30892255892255893, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009481048387761353}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d111ad48649bb4e793826ae79b624a642655e642 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3341750841750842, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00967910603291906}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29503367003367004, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009358110551087425}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..34d7ae11d9f4cd10a9b7f67de362302f4dc91ab7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3261784511784512, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009619849417035172}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3021885521885522, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009422719042483192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c78779bedd599c5015d1c8ef336375378018df62 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.538, "fixed_answer_choice_list": 
["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00910382483037647}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6383333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008773841218429196}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5f21b153dc88f287f377ac491d6f9eb3ca13e8b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5356666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009106972161130879}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6116666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008899620943397685}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9a18c2a88e971f397dcb227dee56345983e568 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5443333333333333, "fixed_answer_choice_list": ["No", "Yes"], 
"dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009094270381387362}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6156666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008882569490543052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9711c3fe285c1b2dce23eff2ac4ee35c23995275 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5566666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00907140524362105}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.617, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00887674483503323}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..100a48715a20c2db0e01acc9deac051225634895 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5656666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", 
"dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009051147480837464}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6216666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855801251873009}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f9ea680b786ff0c2848f850710dad1016bc615c2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5716666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00903595664371605}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6206666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008860362324722518}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d461f5e0cc758e638581e0684e1914ce7b381e77 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6233333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", 
"dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884811049411477}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.43366666666666664, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009049526374650795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f4fdd4c37a788428f81ca35c1b8224c3791748 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5bfe18a2dc1b1fbc6f97bb4c8c026f448fc7023b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5396666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00910145395014027}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.53, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009113781890088811}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d2a95fa4c716eba76b38d16dcda0e2f32f49251f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.539, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009102414587191052}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5166666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009125157363376123}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0cbf8324840df8cab08c8201769c940893316db0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.527, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009116909528258622}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5076666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129157751283581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59ada3a724956d04272732f98ff44703b6356322 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5133333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009126984242044514}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.49533333333333335, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129833442820515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..22d6a434cc93db9c0afd92f676ca93b55b8e7d59 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.623, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884965755342756}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5946666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008965091467970754}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..119ab29eef27ca616ccc19ce3045bcb8e61e7f1f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5566666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009071405243621038}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5456666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009092070195065412}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..08fe2902685456fea88e2edb6416dcc0138c6323 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5536666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009077486613450291}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5426666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009096928229880426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fb08aac6dae77de915b4153c0cd8a3b27abb2397 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5583333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009067881941319675}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5496666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009085074954912703}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..90119ea415e50ff7b1ac882f4ba4594e8e6c7425 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.57, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00904031207504128}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.556, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009072785596468857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..50013ba9fbb4be202a9a016aaee56776cde8097e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.567, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009047888598785733}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.556, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009072785596468855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eab775bca794c1950199d209a14c5ae095437a33 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5896666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008982215188519143}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.402, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008953140207390567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..16f52be7ef5883d04e684121a47e9c21f84101dd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5423333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009097447488896774}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.541, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099483512819305}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7aaafcdd226ba52a74d70d208e94b461a86f84fc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5706666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009038582451449426}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.561, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009062029213030572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8190f19aff804c6f87eaeed1d8f09f5570aa49a8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5633333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009056690207178128}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5506666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009083233528874798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6565658d8a9d7d13a4685b0c4be6b008f412a819 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5543333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009076164124491365}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5446666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009093726495969151}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7823d2faa2011ad493d53e4f1f90ae40366312b4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.561, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009062029213030572}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5476666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009088646624339615}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9cd6e356f3e368a48484af87f67b3775c56c3da1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5293333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009114505467759737}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cbd3c6296a8b68ae504ed5fc92d0cce8e87d3a22 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5b70b2c99300d215d774552e211a66ddd81bd015 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.48233333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009124530050684579}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.496, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129938951699208}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9639c0e78c840fc9704f5bc7fee1269716199d9d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.4676666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0091111208252746}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5126666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009127300863830172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b87c736b8e5213fd568c8e3f431ea329cb668ab2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.481, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00912363671545717}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.513, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009127144583936549}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..746cae69e520d59ced7d78f6fb5dafb3454505ce --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.47733333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009120844478925309}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.505, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129774600800656}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..05ceb2bd2d87483228a09b075a08bcc12ff61f06 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1754385964912281, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a4d0e2262c7fe584fa7dbefe2557f087f2baf98b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d831d3c8baeacc105651c3dd4af37d3bfc10fbfe --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.26666666666666666, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..df54ea0bd4fd4549fa9110150e9fe9f694cda6a8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.24857881136950902, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ab5b831730e47d082ca4fe1072aff296e7a797e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.22990271377368152, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a38f497545f919822d3c7f260786a55c39dcfa59 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.30357142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06199938655510755}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.22028985507246376, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2e5d1eec913465b4486092d2f5ba64985713dbbc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3cdbb1ca41b7a2b9ca0b76533a8dffe6dde37a98 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7525c9b89e5a0384131311e0e988e28f28caff22 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.32236227824463115, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..58cc02f66289b62e49876ffabe8d2c826431425f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3757011576560449, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c65667a3c04c211c6b9415b0ee6a569a85028a76 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3196798493408663, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..795eea8cec8ed670b0748ca505ab7cb4b88c99ee --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3416488477072939, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bfa6b8db6524bffb561be4ced8d612fd1bc66143 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3398692810457516, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..56cbaa269d6f1d9fb2869b719b794ea745e6a8f9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..01fd412bc6be4ac0634f5078c6194ad0a03a7c89 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b50a6b51e26b85b0b83fc82834a15c4c6cb52f79 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.31761006289308175, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a8816607b3b09dba195e4a525779fcabceea6f96 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.37694592988710635, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..08bee71dbb08b85ab7920d91936153bf2fe687cd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3548587781510513, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea8e156d49568828e89d9ceedd6a8fc041d51f2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.14285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0471841613625583}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.11887125220458554, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6a9b2878919f55b26b9a2e09b074198b391da4c6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.27314814814814814, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9642b70c8de617a8fe8c85c64db52436f2d2a5b4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2567567567567568, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3461a92207c39637960913a6ec006eaa13e32def --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.24074074074074078, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4b8c3451e11815cd01ff9903e0e000e1a61bbdbc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.28992628992628994, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..349caebe1c7dfe6d493a7d9d4b34d201faddc3ea --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.23318250377073904, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3022feacbfc9fd7e1b4d22b5af80427917e7812 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.27465986394557823, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..88bc1c3e2d0e6d1a0d1e37d782298f6e21f46c57 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbd3706bbb3ac25aa9e3c8e08bbb354c4a406360 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3006032601719933, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad238a75a3f86c6b95fb4330710b4814be103f4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3143399810066477, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d94426c6a5ad05331b68235b9fd7f82726eb3fdf --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.31636363636363635, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a5805a35ada6324f7e2d9726b570198fe091f9a0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3073128622518189, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6838a4075f7ceb7dda0a24dbca77bdbc9f06d1c2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..312e8deb5f74100acaedff9a88abdcd393c260ed --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", 
"dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2b2ca1346f8981726e068b7aa62227a57ef59e5d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc03a7e1c9dbb65c68fdbc1392aebaefe28d3cc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else 
%} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04923659639173309}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e5c5ba32301a5ab512070033276e4f62d02eb796 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc559c5299369f148dd228254de40f1e6938296 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.55, 
"fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..480e61cf7f58a888f77472fe01804c592101b9af --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04923659639173309}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_1.json new file mode 
100644 index 0000000000000000000000000000000000000000..bc80c3f4ed8477e44ffefe3ab842b6da06ebb342 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..014ba17c49e6d547458b2adf7ed801f28f4ceead --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..62838759c16d19cee6bd45d52a8babf168253d0c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05021167315686779}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..342e2dd63f8994c64c4752129839e5dfd713c4be --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..91da6016130380103a922c1520a1eee87b61447f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9347479696a1262240ad75ba48cb4c29b89aa9aa --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.61, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001975}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4e66a88f6ad880f19521568766a5e03e0b1abbef --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a224c033663c0fad9b6a854e1ba7a87c283da37 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05021167315686779}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..59993486ae9d294122cb81878bbbc004ffe579fc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a5e4ddd58ac86b3bd4009a3555a0cacff3e09abc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..643d16c0de1b550610552d28b26a35890a327f7e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d177fda744a473132165d2b3bb76a95af1dee35c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.62, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04878317312145633}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d43c80756fe9447263ab1da139bbf12fc3e4abc3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..68c50298122e21b3437629d9eb30436e666bb8b0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3ede432244d53bae6f18adcc35ebdf507d087167 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9b61d816cc93384abc5abecc4d76caeb00e43a67 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e9f7bf5bf1906a7906cfc0dfbd079a20b47f704b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..03d41b90f5b98d0ba763c995769a40eb8844c7c0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.61, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001975}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..def20f07c5e9c976501a2c2cac25c763eb37e8ad --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0b60fb40760f19e5311c6daf38726258a6298ec2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ba68753b1bec5d6398d0e4736b9a581e121c8c38 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3028636b44a1ee7660f40bd78c5b54c4b04c6cba --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1a1d6e20d5c11311abc57fb7608156ba9943c5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..25b85acf25721e2b1824040aa7387b8d7ee26a01 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 3.498486545620417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028599955391489566}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.17483459362877837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001705017286322628}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.33493639354854016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024663398758924945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.22495159490910463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019053847340024682}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.05964515275724504, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009755619175694137}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.1142906263856143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001814336091779065}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.0768086035487303, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012111613779042028}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.15005963762480484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013171712778281752}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.2919690872282272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021251924476385774}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.19419308478997838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015170582220134538}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.14893285461116787, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014850614932496661}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.2860286356508758, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022430688988452152}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.1917621810854489, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016819035833037649}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a2f32cd618260065f9da15bffef70bfe932ef44 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 5.552260199704303, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.057401553819660216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3259195028808269, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001929323968999145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.572157323196931, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026426040314961857}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.40600971218219595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019372178068667871}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.13492928471559051, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011996866679482643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.24424415700293647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002141030333559227}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.1695476123074274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014000703195505763}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22280765716264103, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012944708889095894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.39909766220453646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023618899336078415}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.27934677067641567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014140720869577354}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.26925744025419523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016959347663601083}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4741109359252275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025232454404190492}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3357354638134433, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017725485372938805}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..55f21c4c8a7ae2e84b705454a4d3e5d8a3e7d310 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.320043236974254, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0703674127483568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.3311240341080181, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017853276326339404}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5884284298124635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025141637691686656}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.41458290792353053, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017754408662774433}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14343528305064893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012055773188276925}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2640886894644106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022528239016002664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18139573177296225, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014279319814940361}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22789410567174723, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001239923319192286}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.4134133885978249, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002390777095443192}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28718354333461665, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013718939798951697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2757280227521462, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016014662456065143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4919426398639349, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024913442742156007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3456449838143285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001679248842503913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ec3bdec9085417854fd459a785cdba913d1ffec8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.622404462604137, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0600956886865709}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.32978578163106964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001728792250109152}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.58993990670276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024371452756877373}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4139372581461456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017028503243186983}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14600046958938082, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011984602947182365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2703010278791222, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022446737679045724}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18499822465619842, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014157929728201822}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22926603301976495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012193510568444229}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.4182116954475941, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002363070228365806}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28955171897620546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013460315368352376}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.2760388688353806, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001581361548630609}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.49527056164943506, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024657921238186126}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.34678316488645144, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016548796761139582}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e3bb50f7906a526380329fdfd51de7a8b96c313 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.726984471838634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.0796088492574431}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.32648518891836104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017271914729613752}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.586257235035879, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024423096252639558}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4106941328054044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017229230075679713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1453440606347176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012436842104140615}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.26971984634892004, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002313107941237128}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18449634822804548, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001477498111348285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22765173104030403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 
0.001238329714231677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.41649689481485414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002350472856644654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28803275605668655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001367927628256527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.274783630094611, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015950198452968627}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.4945541982831927, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002451202764511733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3458744644841339, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016762712143040775}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0a806e533248637ddfa0b4d45fd295064105fcc4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 6.5756845375883355, "fixed_answer_choice_list": 
null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08897150277070985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.32116519599671667, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017400612729082726}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.5784532325776863, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024510408288361423}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4044662434309372, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017458799577526741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.14182020295749317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012237648320547914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.26385422416008314, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022588904922048233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.180205137590262, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014510894649712618}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.22455155493389567, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012390887238885092}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.411817587585727, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023153768885770987}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2844408397217688, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001371012610736257}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.27161546746024895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016061149766465918}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.490609400586177, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024821818387890946}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3423573003846329, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017004223296000738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.json 
b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..97994b0cf69f5ac0734463193d8a4e52a20f4852 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 1.1558513353609208, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04074903284344169}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.11516247787644035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001524841446986784}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.22747710284617287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024176480638780427}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.1493845282584616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017612814690175535}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.023214286463636956, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006999863181351649}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.044410334346547925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012895264732825}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.02963754911980315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008583825054508186}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.09351085926051252, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011626517440736233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.1875911020135083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001982311694030516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.12196365835192624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001376970711141067}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.09651005871721619, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001243153944300717}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.19223482953366502, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002042794088449503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.12556472090225665, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014533252243555674}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3d10ee5e38bfb0da219ee7fe9249be2598320e4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 5.632510593428182, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06797359125001885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.31416640095368, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001676578347273029}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5652729230635114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024927456901652216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3950199600478533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016709891545967043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.12825863146984678, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011505311700009703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2387490220507428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002140418284139839}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.16266540713305758, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013365594771830832}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.20916346047637208, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011837672533766093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3840042932400857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002349227665339613}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26463608485476275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013170335870528055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.25910749063743543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015532175334562622}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.466741053856193, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024637010906840514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32581485051561293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016289808132726072}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..658ab8c267fa271078d6555b7b28f9509260b46d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.392142200831627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10315877163155605}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3180209569566378, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016737440721134676}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5778453320981264, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00246975929714272}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4015506196777932, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016768573292908987}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1368729144877671, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011691195535521908}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.257752868294542, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022701334543955894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.174575698484694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014096927083128974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21739223338213798, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011987485605256046}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4023377833196794, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002376723574094262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2761053811033139, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001353268307874384}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26596256244660715, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015443527415276346}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4842569652904595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002471682356356687}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.33602429370388687, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016395104225531447}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9af12b501d1c64e8d1087a35381c05b7496f117e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.63349544243743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05281657466901086}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.31357525152538196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001623161380165631}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5751723085823159, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002495778990632278}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3974550338517387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016482540144611944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"create_text_for_me", "rouge2_precision": 0.1371291641511403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011818735703140412}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.260356557510865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023088013029268985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17547831360286914, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014288067294337741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.216247455101675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011928912670333205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.4032048757502036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023797635332692417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2754925401248682, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013545543541219737}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26435133382219206, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015244603278307882}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"create_text_for_me", "rougeLsum_recall": 0.48612921079402044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025405746329231096}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.33527731886149553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001640163433964876}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad61546b5831840a633feb456efc59be07ceeea --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.709765572579541, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06261403260610318}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30664438468283994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016595572641215095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5650188060391115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024946022217113522}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.38937605226208516, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017014154240876697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13579088054388685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012067203935791926}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.25817000306049603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023009577668858558}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1739020471143616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001452734859135155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21289213132253543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012160818190789682}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3987146480229239, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002331544671296655}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.27171532019877925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013724928070502846}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26005394613535154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001558252869531713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4795521973701686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024908487929554543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3302419538023624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016726242236806516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..57effd0a777eefe4e5d5776ca16b378f97c2df2b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.575403777061748, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09823540673154174}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.30336615193836675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001651812386890844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5612874601960536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002474965171767822}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3858772280160323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016981938784725468}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1336565960148854, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011883817497889528}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.25509786184825395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022469620904438923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.17145057785135936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014287502471356415}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21071104756711098, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012104628563537965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.396382791753183, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002322655929385875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26945646442741866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013738953482368282}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2575546053753626, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015516497103719832}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.47687647145147444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024725571404500667}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32763137597619785, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016696428766529263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a762673edd5bf9d83b1d3a630ed9a695204de7fd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.0006666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", 
"prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00047132592062028273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 3.968253968253968e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 2.8611082123824066e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 7.474747474747475e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 5.378367894229668e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"generate_gramatically_correct_text", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.0006666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00047132592062028273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 3.968253968253968e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 2.8611082123824066e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 7.474747474747475e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 5.378367894229668e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.0006666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00047132592062028273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 3.968253968253968e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 2.8611082123824066e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 7.474747474747475e-05, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 5.378367894229668e-05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b34e47ccffce89b45821129974d087b667d9a3ff --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 4.659796079611401, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1368931758688134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rouge1_precision": 0.17644909352160107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038149287890057383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.21230940827718442, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00454974484640084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.17808584653315795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036047878484278886}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.06341138057720765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017418196750443745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.08183158513848933, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if 
value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021584079306890677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.06654255399675307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016555414433965217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.1238963811146196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029398927249347016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.14507709522146778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031870533189973928}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.12177858627685004, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025003205364259196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.1513576373029321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": 
"Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003401957689671663}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.17927286088246633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038765489000090123}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.1509777229111984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00309077855012835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5881acc3a648cd7f6bf6329812b0afe12f7037bf --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 8.349660138830153, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- 
endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1098318499392546}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.24305067824681437, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036097560022010406}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.32723288456222843, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048182868947828535}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.2653185459339885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003718960309470489}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.09330398651801074, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016570405561187557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.13658467403886668, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically 
correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025405278204551044}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.10730240032895362, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018845525107885157}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.16814754590628309, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027282671712716595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.22485624238227028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034527275025707706}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.18130203037790546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025961043557325117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", 
"rougeLsum_precision": 0.2041811673945807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031846062911353203}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.27268640990499865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004106397617602531}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.22128162666490145, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031723992786646523}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..61285f0c164575fcab26a0429725c6a278a9cb89 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.100604672627616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11628331252327472}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.2744732005419624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034020795182056205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3792037077764579, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00465258042529242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.3057154871720569, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035668673332201113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.11192035685967208, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016787135403730127}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.16449012589807802, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025920539576727714}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.12907057263921567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019101957633704694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.19158493144962355, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025745366569725494}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2642632272554717, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034193238088634525}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.21199121837713777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% 
endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025587269926346133}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.23056997456483158, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030071550428457374}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3171677356585434, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00402997199179447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2557399896778223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030869457481539284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b99c0535d14ef1cd86e827c9ae7812eaed940eff --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.460262600466185, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06690235758953499}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.29909310953486523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003198137138338973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.4171947659672954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004380236874977217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.33535988312538023, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00333242597817113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.12487041668274895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016702178366713861}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.18358213761587733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025750805740997595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.14413942196532828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018940385351166674}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.2095831569760216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002448524558694762}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2928182365204402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033071812049612808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.23407187738369561, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024485182965088507}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.2514610077689454, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002860099081408974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3501029527885422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038717879646786944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2812675916988234, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029431631516787177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b744d80ad5488f8bef740834e1473df61de5b75c --- /dev/null +++ 
b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.58290843334344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09940670882099904}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.3125044962383802, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003072058347749594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.43832279193619333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004161083245905112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.35170201207032026, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031635095609000353}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.13168237302502842, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature 
in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016416394703174114}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.19471607478958905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025429321562742252}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.15257686050529207, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018607982697786416}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.2193444318712155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023735293014745776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.308468298974907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003191423466850776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.24599327220157718, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023449351106501785}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.26273018537823156, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027544779870844888}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3678867286427804, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036888444777911644}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2949470778370446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027957905211735317}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json 
b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1851ecebf8b2b6007c2b910df1244e7d9daf7398 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.5451634050753023, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03476122561593657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.046590041682019065, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018113877733116784}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.05509845941495712, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017154015403554534}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.04310290552413315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013354170612674893}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.0061050451406738674, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004913565515769247}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.009015504658999519, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006494884515366231}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0063826724183375155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004428905323251431}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.04442283664669869, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016857127584432392}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.053364978373077035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016220526842467798}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.041469788976882346, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012468639228984321}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.04212798093112617, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00163957458202675}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.04928251726552432, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014753429112820023}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.03867299387707548, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011657405780310776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..452558b02a568c5aebc89ac60a39d41d65ce056f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.396112958955289, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12774001717020914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5339579313432855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031227264254700595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.42523020862127436, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002981115684302531}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4479642771715969, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023079960050490524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.248732624279182, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002509203740513705}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.1952491238459977, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020864674211335237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2056915755809246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019163170514045066}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3850344789575973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00278033592883375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3033612056952134, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002388739694886294}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3204968358680575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001985520070286022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4342921156865786, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003003575710929277}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.34421205733760346, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026942002219059206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.36313254630951347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022360796971689707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..00c08c6d5098d7d25d6a078ba9079d5c39353837 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 12.976405649449786, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20113725542092992}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.558994227405181, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032295866736836163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.44495262240037453, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029490508361979437}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.47007019927530463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022920087841303556}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.27217728286113113, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026898706377774815}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21411207509510502, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002230097528851014}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.22591032128288588, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020560357969047136}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.40975809640656674, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00294221380121017}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3236212819558878, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024673142471829217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3425232001087342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020806737697165846}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46033224287141866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031762512099836643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3647714098177991, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027210820647532467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3859835303051329, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023029475516390383}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38b5fa47666d8704315f5f0e481595fec7232c88 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 13.841780368990428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16004525418920557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5672114023663154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031836037058964135}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4506315681499688, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028976257859202057}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4775062117887383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022520658159749783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2831489435622744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026849363746315716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2224359793863677, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022585552678834257}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23547797340215765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002080191282609909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.42052513955243626, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029794181672521524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3316609702212396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002482260811969057}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3522099816274127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021360587201332483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4731322251491655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031830112351273762}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3751305124701987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027537781991579034}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.39783266320253785, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023487903214587246}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b1f40712ca3cdce340fc4ba1581c23c6a0068b8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.347043577937871, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13183200117809515}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5657141271378652, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032494806378829547}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4543139798474618, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002865992018425755}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4791803139648653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022359999842246512}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2840374640470508, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028091760925762137}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.22548495306647312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002307333846171272}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23765394178309218, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021422406093072697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4208287732411973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003031588432467615}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.33645734099493246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025176571940632108}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3551586826634256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021591742079548407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.47480321305856177, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003220379872286072}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.38184109614706496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002799589796872638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.40243539061119715, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002367520270615011}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..979e9db412b63ebd99c78d447191147932289f86 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.305213942543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1483607819215848}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5597772799865962, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031779357829373007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.45931387100423793, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002804556836326018}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4804680058354203, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021741397703150594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.27941286210601324, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002685923767684149}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2261024234770814, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002220825905817483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2366049201616526, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020482405764989274}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4153388685887487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002922026669760074}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.33991925444128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024671212347911007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3555529772285316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020868456031009723}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.470187398074112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031325933820836164}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3864435015330604, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027449279832689228}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.403868227449958, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023025847428200684}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dde3566c7b175e25f4e122543ca0e49dbc3bad41 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 5.202591268830205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.054922717477209206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.2917166159957922, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002219184062207322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5216732757880385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003067680737337117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.366483471533585, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002386281579917487}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.12771487727214673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001331726066831988}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2362911624934372, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rouge2_recall_stderr": 0.0023048686809464653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.16205539672246214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015773137142455621}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2181802591620801, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014357601973536051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3992804600446685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002414052410394236}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.2763256240716396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016051756708536963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.24125596921065487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018822466477212397}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.43323632221522257, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026995804538005315}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.30348032953012855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020484158677831262}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a87fa0276c3a10931bff2006accdf20ed168f678 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.510370404578311, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07017652112732815}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.37679150052732646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021690658868186385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5994842792127021, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00255563453868782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.45209118860105924, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020375402593377735}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16696448229806954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014179634395889713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27258591423013007, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002211872159153144}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20190477038644114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0015696316292779203}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2606246077766141, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00149866573783182}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4234162076398136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024102400017687316}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.31497173633802084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015604606735793588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.311329828827097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019564900632477846}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4967568002020189, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002564011157133322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3738662228300479, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019441791456726448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e619521ac517ea5ee5ca4863ecc226c21fe83ad7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_2.json 
@@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.694055179072492, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07704898802651756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.36979779913491, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020832359668947016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5914089255959427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025404798791399787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.44487573960989946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001977816470537129}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16613522297283992, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001397693878296302}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27310008704696526, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022423121530451896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20141804290762905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015588769877815645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25859076070964365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeL_precision_stderr": 0.0014689767050739151}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.42183833274839183, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002417511355707716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.313184127205453, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015466479745984572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3076440801284151, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019053103567047945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4935606984408204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025806815753750124}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3704488980543351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019175726004970364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e38bde9145da10444781d70e653e5838f491e166 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.6514653763342375, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.06251939465471733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.36420647303972614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020915518131283885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5822439340272905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002574821849509054}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4380843233846489, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019929244812592456}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16409417489732006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001445153868542824}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.27002111151104974, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023305847933292184}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19894653788829594, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016174170448261663}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25631927067186866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015047882308515217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4172204517599229, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeL_recall_stderr": 0.002430745636023191}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3101723423743742, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00157850445644739}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3033028389713666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019232661628729548}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.48618027753856774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00262308382372153}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3650855458428154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019418658394657236}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57762201c4231a33e986a29b427229ebd26444d2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.608484443770737, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09177253914044482}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.36000067737683655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.002059158783528559}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5751430196884769, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025553447543118276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.433038880083634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001976429894642913}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.16179960223783285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001465512561186318}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.26598167030517494, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023601948549819467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19618559845779804, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016525158117788473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.25407783683621693, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015117553852618231}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4129708872099754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024312767233502217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3073988661453901, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeL_fmeasure_stderr": 0.0016004229019968986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3020096150583601, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00192602224235909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4835110037230676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026281093270445572}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3634810794306642, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019587560708400013}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..70dcc27bfecff408b7ff96de392e26c09e22c5a8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.335359360908907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0880259547967717}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.3544672605326427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020400084083317866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5675350811323949, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 
0.0025580342690609322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4267199017854688, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019587924811470827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15730682660007175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014601294548965934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.25829361144949015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00230882843770772}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.19065995281551984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001637747580731308}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.24923159048984417, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015024102603759335}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.40544016089116924, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002402265187758597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.30167860222723625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015893036860766903}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.2968667042793682, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_precision_stderr": 0.0019183519029005254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.47565074316708567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025968713342409276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.35740425194488684, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001944256853237914}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..815640f033f2b5cf0091b2e16a8c9085aee02225 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.09962536145007753, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001740659618979375}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2495957789912658, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004084832622173279}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1404701926071358, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023545854909488343}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.015032914044550117, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to 
the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000742400789969203}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03926943996615448, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001959252805398743}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.021410321262745457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010461597649610937}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08055461541255139, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001249774505808259}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.20337457259832462, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003060417391791486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11379782676552355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001698506173428402}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08079598752449389, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014172566441327129}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.20443087549335434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034517764908789015}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11421613258385542, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001935031419720415}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.8720915413287463, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06977921896956246}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c99734dfd24190ea376202a5699e707915146ad9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11166497670488186, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001646093728324672}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2745822551275572, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037626527798719244}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1568991797418592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002216712754065909}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.013225636476818368, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006474952763639561}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03329408008418082, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017080640073784744}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0187210253001416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009208759050088938}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07882774952520541, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010561315770534385}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.1958952699537899, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025491385981828486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11103657989201658, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014320021304524589}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0896933686749845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012859991795038399}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.2224374288611004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030478633438990885}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.126287265188546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00174254258134218}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7071340922221964, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06567476459760405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0b73a826f3d051d8d5f221213e07e5f329605748 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.124060562800013, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018136218300640054}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.3073301314994108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042524341650730015}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.17473465066618604, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00245708961708968}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.02227233591062227, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009171326429615212}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.05765394909576481, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002379587733761179}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.03174503623834507, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012950949882147274}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.09291237590423006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001297408097673992}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.23207281546402703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031750505176512617}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.13112441731732927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017687033339916917}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09759594898910186, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014440054482869897}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.24407902417613644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003545475320336588}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.13779052503552564, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001976539742066777}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.2055699356745069, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.05452685939114017}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bed16b2bf634dabdf5b37b298931cf30c7531596 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.12858513884438236, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002120442853077002}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.3058467203905133, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0046548186919951684}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.17754481617140364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027411678148972474}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.02559840737779659, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009669040664741145}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.06396559494568956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024066314342877596}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.03592964212545872, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001337319847086293}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.09637012660269542, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015081518881888312}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.23129631390107847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034855936093635756}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.13344572346235084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001985028910537152}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.10149076059236951, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017109531080025527}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.24358222998073842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003957993939468868}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.14060704133134494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002274916550088745}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.3690976879754526, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07157486729365321}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bb78ef3bd998ad2e5eb7f21a5bc4d5518bc11b37 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.03769871977195407, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002336453416963925}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.07311282171723119, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004454298183792852}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.04580987420726127, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002686434828650492}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.007625557776701037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007866165856765972}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.01686591602432881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015442682087128132}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.009837365340636202, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0008814198827318251}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.02814473364569592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018141347746336734}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.05436185626976168, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033426299994124293}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03390847923190842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019941253411488963}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.029868454600151712, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019063509322727447}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.05818680411207516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003622236236410205}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.03619213971219476, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002146702755941942}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7131409325442915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16045020297035031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..83d8d369d78ccfbfbad75ac5baabe1d68d7a1563 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012123554660875486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 6.387886674158631e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 4.527370030666761e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.0001231472929586137, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 8.72630464321695e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": 
"", "rougeL_precision_stderr": 0.0012123554660875486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 6.387886674158631e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 4.527370030666761e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.0001231472929586137, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 8.72630464321695e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012123554660875486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 6.387886674158631e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 4.527370030666761e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.0001231472929586137, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 8.72630464321695e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4005ae031f4c75d1edf9136494e72993f11f0dd9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.15468558801825094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027906195490837874}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3291046037990046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004388092575953328}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20129747803416173, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027922294615110735}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0400294063524096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001724105483571795}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08497447416593046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029266165872910183}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05123793702073981, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018214840639693597}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.12465526978296572, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002299651183476469}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.26725068507569655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035760579918268356}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16254032673358534, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 
0.002238751342864787}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.12064651566409819, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002398515107357595}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.25801606284267453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038925350228375816}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15699398066183545, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002413809068769271}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.969669748694084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11920564417930218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5f61ed4ee0d0179659634efeb6da6d2a84f03df9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14151483177458335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018666664224144868}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.350507712921647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004347527687226855}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19916839431146724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_fmeasure_stderr": 0.0025060831527181906}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03563953027984095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011469030277248067}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09161704225190265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003025312844144268}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.050711870640867504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016273936819634284}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11241775738220008, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014552290073169463}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.28016025506550185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035605767069095784}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1584759141966615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019769396812308743}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11164690738304506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001554937658585751}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.27937348602219997, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003826151047457801}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15757176490392727, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_fmeasure_stderr": 0.0021232145053781655}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.9843126589812214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07803286264393723}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f069b4c325523510610da7fcdfc70dcd10c30842 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14505365849514526, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017717206403909573}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3590688289676267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004194349096882626}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20413697486271104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002376457670474134}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03641466698412616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011418577722541036}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09475841284636158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003043438432012329}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05192590462289213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016190417513229905}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.1165115860707238, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014128544079030998}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.29053341231755914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035601421682268683}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16427512906528735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019258406494987107}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11344382044719216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014987095085429382}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2835275825737783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003772811693926147}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16004314278537282, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002051543734547068}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.9960960514942592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07344519144130342}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..26fd5266e433664502ec1baa46d42105970132e5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14713365585983412, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020824125761163305}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.34916556365895823, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004704775790168902}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.2022841747965752, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026384886657312224}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03592889088406201, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011712459443672357}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09050705948348292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030325027339432697}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0504745368841509, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00163370097686754}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11559690666144418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001671096800205579}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2754667184343775, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003934807057494367}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15906261276123296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021562437238946204}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11557269164532327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001725315585914204}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2766392842578445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004134898630416766}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15926265578186155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022530285725081956}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.9836229338335558, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09808438341855263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..573a8177cdc342c83c4c267739fbec747a18fc8e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.04765862333877057, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003034035842078532}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.08668561522891197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004843273307719953}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.05505531576181226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030265646206736638}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.010462634958970572, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009728648293141028}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.021800656752831386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018115640007053421}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.013418856497160158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011417937757972204}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.037309800551862775, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024441508269820006}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.06722349707153805, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037810947177969894}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04273395830712555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023651800321026328}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.038418088250945504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024990780458393853}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.06943623456802712, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003937161605823187}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.04406961555377166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024510630458409125}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.078602667040749, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18381444266538938}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2d06718f84753df6554ba06b41f298742e0770c8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0025698165750884954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007513867047594584}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0020265023378529864, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005860337220124529}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0022213194298558382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006417999440798693}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.00011170849128673766, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.932541467693499e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 8.475431338916355e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 6.0983480512422104e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 9.617082045566528e-05, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.874253211859863e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.001814865674466118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005255278900735306}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0014201789051984832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00040633641993546393}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0015697328565003042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004515600693647235}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.00203326778659742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005789262577812531}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0016561300609609598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00047805234847829935}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0017855275703933101, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005066133842215296}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.8932273089650456e-42, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.552979377433657e-36}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d7e225d3df51c88d9dc6532b4bfa514e2fae044e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1691487054500622, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024250397934530475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3329424577074328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004657379753879945}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2135984788737725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026714992621488937}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03660197341061794, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013876615412254558}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07882441796533844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029121244310315966}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04790575968435739, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017255007150536288}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.12480506419752084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018165017981600132}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.24849031836840368, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", 
"prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036491823702079194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15810158263668517, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020195259778767218}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.12993544368406393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019145615600976785}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.26077607018509813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004054503415661984}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16520124922802173, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022185800515451995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.103288067343989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12415880355418442}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0804e634e630f5b349a4b4929caa8cc38ae9eeef --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1369839651831878, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019614322408207006}, {"task_name": "gem_xsum", 
"prompt_name": "article_DOC_summary", "rouge1_recall": 0.3369949215642884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004522579510481593}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19244278875333556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002639391462026543}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.031094803643512716, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011365997631264868}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07935713496492708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002884276945599807}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04413377405232099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015985481401232032}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10657563716596334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014875062360428326}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26467978376956175, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036163241442920165}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1500620628581487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020207447957827454}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1078060619083588, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016045831813046104}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2680535986457037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038953175735036904}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15187454343375792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021906040942089064}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7714825425476433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10905675609895228}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b8d583c4cd93d679810c2f2185f63df6c53cbd8b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1416725332039807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019003755973271465}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3459660596871756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004306340084752496}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19861514735566918, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002536869900552261}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.032589396300244954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001135411316234186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08273970858446308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029435229941216076}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.046170973346933354, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016039666517335285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11131065526290164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014510877883445626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2741645486545069, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003497544935564585}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1563808799657367, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019620345863827896}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11082674371408649, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015845483453646742}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2737340240682109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003854876130312793}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15581654576519338, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021586761800846295}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.8719539285791582, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12363679141275902}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a394dbb3288441114cb98a378d70c167cf0731ac --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13956778448576387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021737270808076243}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.329102550985314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004775853265191935}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19191359742761793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00277051626651925}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.034143138087541, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012197975952686348}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08270285997486158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029002084266991856}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04715930396420784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001624147693037141}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11089339313344634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016751091885532099}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26375167528125915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038713870137101424}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1528761624914166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002169630539318251}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11161318605392968, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018444411300991288}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.26547328847718554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004239107141632789}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15380687866772882, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002388364659924707}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.0369871247788307, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11889285675918113}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1b6a3d0fd7c2eb0ac5d3143585ec4bdb930b621a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.045437643598581885, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002731741164998445}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0834633850546414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004709515538170699}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.053658040802239085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029774150017601803}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.009714659363066126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009061043015811388}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.020204993259341004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017173474943290197}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.012335864733508397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010450675751901856}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03525079234225019, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021438053902596605}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06521180915166398, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003703751644624955}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.041486821630655535, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022764116642440576}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03647650806323336, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002248078408967042}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.06711329846733026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003863144394106958}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04288192765446807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023977202282267356}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.9532704572175751, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1334974714932564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b1e24c984da7fdd27877a2c4944d353f0d8d3439 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.003263719197769084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008887617448902996}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0028345456514645845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008116977743173961}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002997767517780879, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008334506629000642}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0004590858641913026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00021377485443811557}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00041953329689178745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00021411305439143847}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0004337191943913522, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021214389566854186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0023145025817495005, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006312757980824667}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0019717134578344517, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005600175515270516}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.002106529594577242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005845294157065285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0026347083023874887, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007273791404483311}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.00225801889401816, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000653191190142516}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0024033045589119204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006766571604709825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.120169018915429e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.777103521919484e-32}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cffdad729f2cdd60683ba168452f8b9bdfb26c91 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.15951620968916577, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002506196733818455}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3380965213762154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 
0.004239971963662302}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.2065385624658632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002495181752673378}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03267382741321086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015503006643301331}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06995212128557793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002538369711070906}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.041608603555378855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015173762355031157}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1158742599512755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019653714280750737}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2450321311305678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032031104916126017}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1492749985320237, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018430993892411397}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.12563518916564906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020584621317715304}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.26886059673342566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037413258402390244}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16300942364929533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002098349740204003}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.575373380176573, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0743995294897499}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a5be87fc542ebd620c0d013f24be5cbb61f88a8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14477662439175204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018795188514419876}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3557260266989316, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004456910041921236}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20332653496456884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002535349618927259}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0332459231860444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010850655973791109}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", 
"rouge2_recall": 0.08572120065468666, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002895224082192127}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04729852367510724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015420776527216781}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10920770628719932, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014291037457289909}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.27011776305596435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035598997301664764}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15359088976552104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019473489961876593}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11478861556780529, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015689281670261022}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2847152787941595, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0039331867569525865}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16157946667427311, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002148423192266251}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.8684098635173505, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "bleu_stderr": 0.06338992961884797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0a60acd914d1dcbdcdd9f5a1b61d5c9837692f87 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.15082144789712365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018424832486209342}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3651698865935423, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004302647367894026}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.21082155347220088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024569832954891837}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03513129846275268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011451237082588754}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08918749690012981, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029682699425890064}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04973150539674991, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016131300737703802}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11407141265154397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013994053119914564}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2776724573795989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003411155443688218}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15960845606925958, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018776627339659353}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11828100442855599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015337129412005281}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.28863758587492094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037557189858194494}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16564364312727078, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002071661209778157}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9593269560207813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10392450231035287}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fbbe9d2f3dd0ee24e3698ff1505a5fb03f0b9c6b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1496531280766477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002170811665161093}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3453250604427468, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047401463995383085}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.204398760358889, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00276366410673389}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03495998710112079, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011734323689905663}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0853256030786561, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00288948347840057}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04868798327580798, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016054863551996723}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11215486999034506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001620815089476965}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26057916777844126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037363611572360737}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15352602365435797, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: 
{{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021112441534149396}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.1182804937364553, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018148963331361236}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.27561134068552073, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041549575197084255}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16206552227978172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023555539018775963}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9760625459105288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10026358022338139}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..48c0faa1b0c201212f6462788f1cead567c95f35 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.04792518752207993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002929855540885626}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.08695740365895663, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004905955060606447}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.05625513613265757, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030874961718411237}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.01000387771406212, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009949695824421575}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.020457759459906272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001799928617720977}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.012561323055777309, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011221605993979012}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.03654303187169094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023333023205796763}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.06544765513897487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037490143344574268}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.042368809782355486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023587611382963657}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.038524011442170915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024337355610465307}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.07009896492118943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_recall_stderr": 0.004064534959138374}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.045014902928945284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002517391175655878}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.9987646889959115, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1912402463323597}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..79aed4aabce37dba35117361961bf0657e4e761a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0025400865271768946, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008365321705968738}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0020522199947177865, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006198953753349982}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.002203951765543492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006780578514746854}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0003593890386343216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002587217800602705}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0002044705818290724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001357861630560493}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.00025865120204742847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00017713704426631428}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.001746686503389582, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005159950965839201}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0013959158984171599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00038826984967891984}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.001513916054366738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00042616321835822437}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0021182020561365147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000660664849160224}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0017475858050803443, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005209596862265322}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0018556868062698107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005514893920061358}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.3330277651810904e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.795417937992427e-33}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7cf0216fb91f324b6e7429b5ae3a97dbfc916cac --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1656184917860495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002465318130849214}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3451271695394911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004485695093538501}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.21463729419325275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002671429409363269}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.037345942132846435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014462619260980324}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08165828005081678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028400731706742888}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04915332059409096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017122102302659385}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 
0.12269872075535913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018643694649009327}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.25782978020360103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034687018620499803}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.159314857725847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020081165474175313}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.12736067944965968, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001970254989936668}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.26933208119177504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003908747660119136}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1659189540151564, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022284342453661927}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.064984135458196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11276853324640117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.json 
b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6d7f664d766408743a563b4c56279f095c63a68 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1260899791574131, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001905161571513694}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.30488764933726087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004406152742466401}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.17624612929896505, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025718000739001897}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.02734160155747233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010866592288641356}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06868815407030224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027368160750971225}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.038642466121148654, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015244525321813124}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10218411344219641, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014948765149499935}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.24887472602828783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035843610201584087}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14308653397520427, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002030967495914565}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.09807478367610013, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015430046746426423}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.23944847065255154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003722025149898182}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.13746530371970767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021096421026204895}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.558306696837244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09981504570416164}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6c66d9a9203c6465756db7e5f98c4da6d18f18d7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.13253960760217687, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018631836737003167}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3178043987357393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004209555582114566}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18476307451897173, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024826342866030933}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.029929646843642313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011076561079256734}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07427687891626805, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027863929510018536}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04211259064384283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015475034261702306}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10895910761984066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001506797437434206}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2627359450729775, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035428896385583442}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1521149622660914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", 
"prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020254138607920464}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10274775436240437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015507057264651516}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2489222546895533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00370928368654382}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14364115654972823, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021055633428788308}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.7326078380754275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11082657511901861}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5daed71b5e0c7836ab090d7e4b280f628760ae6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.12889199801472273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002243455154850897}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2927286723681437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: 
{{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004638755474625006}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.1752477037036964, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002823759912213769}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.029515166535948576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012016618969078606}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06911038852478063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002746574682775168}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0405702032936913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016059537675197902}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10464484098581038, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001773802303200482}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.24008506670200253, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038295614084253612}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14276741278375094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022683257050729088}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.09960601532293903, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018313506067734666}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.22860772378561103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0039645397857025364}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.13584003612122558, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002348292605866413}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.7305491538270004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08408232834719435}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1613524d752c1a4dc3a5a7ca63d72e06ef84be10 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.04123247889915294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027362423249375554}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.07120572993230662, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004274377455153015}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.047528176132410734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_fmeasure_stderr": 0.002823749720620929}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.009497011127313206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010718639651327534}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.016786374564059875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015228476295663846}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.010936982876670076, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010227818352712905}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.03244728409674943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021591440394866995}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.05663645241508702, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034535293095162792}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.03751611271132232, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022506689999874908}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.031933246077056165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002143169453256064}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.055391705623806235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 
0.003398556473468678}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.03670784975038728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022077354874054825}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.6780236081531866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08801012234752068}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..97d7cb3cdd008a7039fa1fec24f789511f16b6f4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.0025728987993138934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011767976878626356}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0003240120371493934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00013796006385495373}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0005723341987999615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00024516446983701974}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0008576329331046312, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008576329331046333}, {"task_name": "gem_xsum", 
"prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 6.125949522175937e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 6.125949522176059e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.00011435105774728416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00011435105774728473}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.0025728987993138934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011767976878626356}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0003240120371493934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00013796006385495373}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0005723341987999615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00024516446983701974}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0025728987993138934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011767976878626356}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0003240120371493934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00013796006385495373}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0005723341987999615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00024516446983701974}, {"task_name": "gem_xsum", 
"prompt_name": "summarize_this_DOC_summary", "bleu": 4.142453217449745e-247, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..99a7d222d8cacdd7b46025abceee9b6376d53905 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 10.02681772761729, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.4785770088213236}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.21326442624004158, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005569959071404712}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.5796067902091783, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.008493584491820132}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.26893866283043205, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006128132638012782}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.1569774762608679, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.004918509910414435}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.4493611433994356, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008641465811853976}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.20120770859481335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.005573115844107413}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.20594687601926343, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005476023109804243}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.5638381191556961, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.008539182187801923}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.26037256266092274, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006063020148608195}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.20814801304375385, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, 
sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.005524999926052949}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.5660905777243646, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.008533556999477658}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.26266697026347924, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006101223479297265}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf241f8155df619ff60e5c37805d505754b4114 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 12.405128080356619, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.6282911341436567}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.4297880540568289, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008220917619398086}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6505221227794611, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected 
solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007032948083628127}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.43052575441974283, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007565036202436803}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.3277678584275725, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00787877500339806}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5012746456886409, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008142123538414653}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.33159324549421026, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00747564127341564}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.4165981868791939, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00815637476891333}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6336843792535592, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007239228173826263}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.41915608188207465, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0076028128010187926}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.41983019583164943, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008186753775907333}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6355954746439701, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0072092086784890964}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.42133006102653786, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0076023574833303156}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c0de69d62db3249bc5ba0fc559d53f4e06033d13 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 15.396111872252607, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.7207759822340656}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.4876095593388225, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008404900319386375}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6784375831550562, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006899269582410456}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.49207161703177676, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0079131367283513}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.3809761461318492, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008300803335151334}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5339790203304655, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008121721991565935}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.38914197156300356, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.008043239661107048}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.4745592667499523, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, 
sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.008400824405327217}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6625149456976438, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0071233628299963376}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.48065244860067763, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007989865365422826}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.4769027851776395, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008404721247676569}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.665131716020771, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0070890587100882405}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.4826221999255758, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.00797725209707895}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_3.json new file mode 
100644 index 0000000000000000000000000000000000000000..07c2d12eaf6e3822a67f1013fc6d1136c07c162e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 16.218180922067372, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8509332297059158}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.4890706684702515, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008396913867721763}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6997303374835331, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.00655926076210075}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.4977472383213469, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007835731534849398}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.38688066002197424, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008249491136803927}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5562519826404686, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007946807268147116}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 
0.39761438363429064, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007954981983937091}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.4760638248450606, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.008403042671103629}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.683370150325831, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006819890967440377}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.4863555951176898, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.00792372037118076}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.47941150704079855, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008400997890393643}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.687165768578369, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0067675214257696}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.48895320036596807, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: 
{{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007901377124692735}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7589756eef8a9d74b4ac62ba5c132c491aa8658a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 14.198451804549311, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.6684426734932597}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.4481132422374573, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008389023019191415}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7084211018687647, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006378983318263235}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.4624679947761788, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007768937193183521}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.3533165568819186, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct 
solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008050961633003766}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5593924878365768, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007881040514155611}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.36764860730658894, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007715456641810255}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.43585149585760313, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.008369484613704422}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.691252031424185, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006649043538442779}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.45164355068805134, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.00783665587254422}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.4394149652836155, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008372880319767448}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rougeLsum_recall": 0.6958738596781262, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0065890391738934635}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.4544143905392024, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007811758155493913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..344238ebc85b288ab9678680f0f4f5f3115df1a0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 13.506508915229169, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.5116866328228713}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.40539001717811785, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.008246350607890437}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.720893483949784, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006190339720420079}, {"task_name": "piqa", "prompt_name": 
"Correct the solution", "rouge1_fmeasure": 0.4316365583256063, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007604543745568956}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.323283729079598, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00784836767981782}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5717527841713916, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007807360491891714}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.3454708512028626, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007528073940807884}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.3959242347319179, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00822162851212014}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7025263199962368, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006478632818938049}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.4222245633792425, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a 
correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007657295802053638}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.3987034301296399, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.008213614641467533}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7089771703621439, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006396913155506568}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.42507655541519007, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007630445420039102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..da071ddc09872983f1bd50253082da293394a00b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.48639825897714906, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011661506839823789}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.48639825897714906, "fixed_answer_choice_list": ["Solution 
1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011661506839823789}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9bba1605e1d88f9f1f79a8e903f8dbe10ef2c5bc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5032644178454843, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166557553076037}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5032644178454843, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166557553076037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..482a7d36e19d7fdf5cbe44bc18e50113e5c56b4d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5103373231773667, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663330673075898}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5103373231773667, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663330673075898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a50d5dbb49772efc0aa6a071de983130f2897f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5048966267682263, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007814}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5048966267682263, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007814}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e689f4fd3d9ba9f8706704a650a5a41465796d4d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5032644178454843, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665575530760367}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5032644178454843, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665575530760367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7c128361dda58aa7852476c5e2f477ab0d02399f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5038084874863983, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 
2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0116654857447468}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5038084874863983, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0116654857447468}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b7252f8a2dee8b32ea14a63ef079fdfdd4060475 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.17916585022658107, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007303477837541812}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.021356332010434582, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007105713714095752}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.2286180773139028, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004259684658467588}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.036054964953355036, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008984262282200936}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003342865314527443, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00017849853167599175}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.04156016756116742, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.002190482133962692}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.00581609356366873, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00028059938593242515}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.019312824968880614, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005557873015899917}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.21416319877579273, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.00397919084314672}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.032951139104863636, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007476528694649519}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.017643439425447376, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005894989170566875}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.19731432042647498, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0038630321229470993}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02990474554699068, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007607771602225413}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b4515aca9649ce2551f01148165c2c61e72ec52 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.14849173039066274, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.006780886130825962}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.020251968434917507, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.001004184999269241}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.201417622459911, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004027970741634812}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.032381757450450424, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.000864749614587369}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003495343473522734, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00039644904227136035}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03504591595228045, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020381015294354335}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005210530351647924, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00032295371471857974}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018798741988949746, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.000866081508450852}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19048063687209987, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037923827162263503}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03028754869084763, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007880477659311635}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01682276801642798, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0008795517871256883}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.172585092925651, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035987297019058986}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.026758462015252938, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007341044406760539}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..734363b10b97d8642f86769d5836e9e68d248d06 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.14199881699588157, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007298077065204388}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019700320148592156, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007325263284771759}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.1998389873544708, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003997380487910199}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03229518293521006, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008347091468006159}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002931431196844649, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0001919227650150536}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.034428579543291746, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019427883698678984}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004949579293108938, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002653281442859}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018031465684386884, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0006335497251678613}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.187052546508197, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037036180117878304}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029752024739423115, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007341759429352048}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01617912400918, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0006560029568093861}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1700270228297869, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035543068929493704}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.026370393722191878, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006951231645923941}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_3.json 
b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4d067d8404bfd3fd4c5cfbe2ec3d86e40b5b02e9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.14611074118885167, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010329545037543015}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019397395062929524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007719276203796916}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19955139137105213, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004054485142263899}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.032166278597507376, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008641364081647094}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0031310622057378625, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00043845430276555624}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03466455497519352, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020194141383628244}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004937686316660083, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0003186532275811235}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01770820830988742, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0007194515278158072}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.1838484940943189, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0036582750075176943}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029306952043178024, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000751437752875103}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.016012342883103627, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0007103602078950152}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17027891330022135, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003611508658664297}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.026383518066224777, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007264066302234951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4934d0d501a3d4941572bdfd28ab249c363d02 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.13831086006846338, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007223374215514496}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01879738324600387, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005908622270302197}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19954407744415306, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004053615634720824}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03155775272111882, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007831075219253004}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.002591258862523811, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014237661089962314}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03292368767698468, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00192514110533362}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004548001185708453, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002389268902228686}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01714064421552565, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005309706560737331}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.18441955079753033, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003735346161231745}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.02877406291863521, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006862843463300422}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015437163218078281, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005145259041792498}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17088506075667292, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036630999910594376}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025839149783962134, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006410049906716466}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d4581d1e24c93e4c5f975a319c53b37ffc648f85 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.11807145345667959, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007767667145105737}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018866851198029937, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0006293413700291389}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19598796545291594, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0039141300191215}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.031432905163962875, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007860008461527916}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0027315307202555304, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00015835092179141186}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.033330339409121036, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019094967708652559}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004669858309181997, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00024073389992590414}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.017052586854830805, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0005424487932078866}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.1799650841386416, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003577924759854656}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.028453364453624484, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006751447210313165}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015736631180765175, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0005595356316243608}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16949142493214228, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003578759284382382}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02611703752890656, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006550883631257474}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..885d372423ad08e12a71d196768fac09cb6ea0ae --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ba6fab06af90e1dd0fbdec2cc412db84661695ee --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49347116430903154, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664829595210969}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49347116430903154, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664829595210969}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..821688f1b373fbb91ffb7bd59e4be6da71f716d7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4885745375408052, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662778026451676}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4885745375408052, 
"fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662778026451676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..33dcf3e48e06c1ca9bdde151558e19447f49cf74 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5065288356909684, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166482959521097}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5065288356909684, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166482959521097}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4d68a9461463ec82a8c43b7a97001be06b83cc34 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5081610446137106, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664270112244237}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5081610446137106, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664270112244237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b349daec5e489feb2ccf292b476b03059606186b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49020674646354734, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663586263283223}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49020674646354734, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663586263283223}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_0.json 
b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b2e34c92e6278df45cca2ddff73cd6c7bc2ecc6a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5663764961915125, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562571737707337}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5669205658324266, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560864423151372}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c47091dbff2542d334eed8ee1ea328585b67b74f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.573993471164309, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011537375448519443}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5745375408052231, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011535468840824528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..263cccb2aaf8be8d88c8888e4819edda90f0ae51 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5696409140369967, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011552114834700507}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5723612622415669, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01154300962328283}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..719edbf4833cab0cbff4603ed6cc6a1384e6a707 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5554951033732318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011593746871584154}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5565832426550599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011590883373666854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a7bc85d2d8c49fa2fca68ad7601ae896ad5a31e7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5544069640914037, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01159655408098765}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5533188248095756, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01159930504274508}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6086b1fd7de5547aa028597121d1f27f2b4a1a4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5554951033732318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011593746871584154}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5571273122959739, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0115894305035091}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c268f6f3980f57fb102877c6b74541eef713a635 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.639, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015195720118175125}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.562, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01569721001969469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..610252cbe19fb1177cbc40f8101eb69a315b889f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.679, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014770821817934645}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.665, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932579}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 
100644 index 0000000000000000000000000000000000000000..807cc9e75a7693150301d60ae24fcde34bb22a96 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.702, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014470846741134715}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.691, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e41efb127453d64f5a142b666776d66d85ffe6ce --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.717, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014251810906481744}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.707, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014399942998441273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b17db773fdeb51bff585e954e0e41df07312303 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", 
"prompt_name": "Direct Question (Closed Book)", "acc": 0.716, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014267009061031306}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.698, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014526080235459543}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1a711b37800f351494e9de0b1bfc7be62ca27375 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.716, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014267009061031307}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.703, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014456832294801105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..76880fb532fd008434120f3dad8ff87817b9a140 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.867, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: 
\n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.010743669132397335}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.791, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012864077288499351}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3943c1a3546594550e0be9d308e66f2a72ae73b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.892, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00982000165134571}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.876, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01042749887234397}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..18b82425835f3c66eb9222fd7495e98280d1c3fd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.9, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": 
true, "comment": "", "acc_stderr": 0.009491579957525044}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.893, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009779910359847165}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9dec14dbef5e64d6f6426f383d14af5caa11401 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.909, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00909954953840023}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.903, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009363689373248111}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3bde6ae5e31a03e4956442e86fa9de15a47a9d53 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.912, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00896305396259208}, {"task_name": "sciq", "prompt_name": 
"Direct Question", "acc_norm": 0.907, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009188875634996662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1a6216aa325c7f24400713cf0c61e268ee9d7d42 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.918, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008680515615523715}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.912, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00896305396259208}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38e47655c1d93de7a2295be59b16f78cd046cdce --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 
0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015819299929208316}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.453, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015749255189977596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eac18cb53864bbcfb99f1bea1492eb85501c6788 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.506, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015818160898606715}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.475, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 
2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015799513429996016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd9ae3c134375da5703870502d30113d82a1b42e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.539, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015771104201283186}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.509, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015816736995005392}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": 
false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e8daf0fac69e73304629e74e5d399b1b5294daed --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015663503610155283}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.521, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015805341148131296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e13e5f3f0a8d100f86b66702b252a11e19a74686 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.565, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 
3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.0156850572527172}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.554, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015726771166750354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..30c67f89ad2b921164a3686628a3eb1e1a112759 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.577, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015630589090476345}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.545, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], 
[0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01575510149834709}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..182282c21322c1df079b03794039a0f84354de3d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.625, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015316971293620996}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.531, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015788865959539006}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b67bfc337abf7e4fc9d71100c29be1dd44c58caa --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015615500115072957}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.408, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015549205052920676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 
index 0000000000000000000000000000000000000000..48db54ec327be855cf582d0e215609a1fed64033 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.477, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0158025542467261}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.452, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01574623586588068}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd0dd481109137fe28b79e6515a7312e9fd44bf8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.546, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 
0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01575221038877184}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.532, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01578686875935901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3bf9539707e2b97a4851998dc594aea946c82e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.574, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01564508768811381}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", 
"acc_norm": 0.566, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015680876566375058}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5df96d0da2cf488ed8e8bda81b4cd7028b292a17 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.622, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01534116525402665}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.595, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided 
answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015531136990453049}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6864a3778fd8f094217e251ac85d4a1ac66a0453 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.601, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015493193313162906}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.525, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01579951342999602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at 
end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b1b3f34198b93827a91b3d2b15af1bf2b4738ced --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.507, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01581774956184357}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.474, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01579789775804276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f08a55c3bedba225125a5ec0e2bb5980f78994e7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.559, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 
0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015708779894242676}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.507, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015817749561843567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a80cec3c48ec32088e0da84aa91ce1734bf0ae40 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.607, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015452824654081496}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.57, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01566350361015528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0dfb7be3f371b30a1d9390c8f2446e13abd28c7b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.642, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01516792886540756}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.608, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", 
"acc_norm_stderr": 0.015445859463771297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..da3bc1f734d89057d80bccb1c198151f4202b1d4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.643, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015158521721486776}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.604, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015473313265859406}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..f6b1f8e2d50cf306638674ee2d6e31ac658db7fd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697235}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5024051309460181, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562298481438055}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..057cfe4aee2d3b8faba4e8d19fb1cbf5db1b35d7 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585206}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5077498663816141, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561043278863545}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e1cf2c72ecadb625104627b0f1ff06382df475d9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.47728487439871725, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011550494192008947}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4917156600748263, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560845076525713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b40871f996c3ab46c750b66ba10514059841e958 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.47247461250668094, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154489847386458}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47888829502939606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011552120807053817}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3700688688f0bfd59fc4cddc7807e593ef042f53 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697237}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551049647290312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0503f9edcc3799a7180c3f3cfef701d2d9c1ab49 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.47033671833244256, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011542066509767012}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4767504008551577, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011549925483927456}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67da1f7051d4409b61fbfa9627dec1e454cbe86c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4820951362907536, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555016408505474}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5221806520577231, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551049647290302}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..63f429bd41468b993e8b1ff959712c233c984b8f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.48583645109567075, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story 
:\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557792331301673}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4965259219668626, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562153149168298}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd7041921c1f680daa3c243533a6685ea33543cf --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.48583645109567075, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557792331301671}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.48957776590058794, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559920087347776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bffc0d3eaaabe5254472d318f75e17838f4e9db1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4751469802244789, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154813982307477}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01155104964729031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a81ba891570bd5e7433db5f3ca3fdaaed94feb7a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47247461250668094, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544898473864586}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.48690539818278994, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", 
"subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558466383367183}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4167f4e392010100bfb014158b6d278f2caffea5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298173}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4836985569214324, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011556285484521572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..68dc76393cea7bf1c94c27e59a6c8b8ac3b5f10b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ddd553fb6301773442917498301a0114551e6be2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4399056ce567d52d890acd7d8850d051eb5ad12a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d076b92991f161413f60212ac43027426e744564 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a8e0ce79794e7699c1627086eff03928f0039e44 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..301ec598e9aecc725587b4e46041c7e14b28087b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef17ad282ea0c565c72c1d02a59e6594a94996d0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4820951362907536, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555016408505476}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5109567076429716, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559655791130729}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa96b164cb989c6b8d734638f3a9eaec8a6ef39 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4820951362907536, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555016408505476}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5114911811865313, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559378273599126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1a0a2a1ef6d891a5d962c570472a971a23aec479 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4853019775521112, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557435464292916}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.48957776590058794, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559920087347776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9fcdf2721e8a5b2b442be05c95266293d77669c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4794227685729556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011552636515221862}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4863709246392304, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558135970599896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4051a47647abbadc1cf9250972c1b47b4ee59eb3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4831640833778728, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555875693960771}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4906467129877071, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560409019420362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..be0ba9c911fd31a36f0a4d7848cb1551a8a6d6c2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.47888829502939606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011552120807053812}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.48690539818278994, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558466383367183}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac1ed113ef2b57231fdf3c29d1003eda9fc0f28a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011527657726586461}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5104222340994121, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559920087347771}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..576e53f9a89d32136c8ab208de7547b27c74fc1d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.48850881881346875, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559378273599123}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5002672367717798, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562430600098487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d5ba062ed6e1a2d8fec480383ae419773f5e4f3d --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47728487439871725, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011550494192008947}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4911811865312667, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560633656952968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..34ab8f22c193de7edbc1a77760170279e5069644 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011539022035111228}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.484233030464992, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01155668204219638}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c76ec11119570b4903c3edbea64571f8f88ec1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4692677712453234, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011540570846495544}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.48743987172634956, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01155878357073797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..174df4d6391acd792c90e788be1145a2e1fcdbae --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4740780331373597, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011546883081384901}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4826296098343132, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011555452669106634}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b983b9b412cba16ce021fba85231698a2401adf5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.4404332129963899, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029882123363118723}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4729241877256318, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a7661f58b92370bed382072d21aa3ab430752bc8 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b68b7d779aa08eec3090d45dbf676bed79a2c1da --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ff9ee77fb0c2e13aa35669cda981ea1ad389ae21 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.555956678700361, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029907396333795987}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5451263537906137, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029973636495415252}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c5b9ead045dad6bf34448d1b84938ab5cdd986f3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5631768953068592, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02985524739031494}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..42f6cff037d8041a813f4ef936b10283625e02c9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5631768953068592, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029855247390314945}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5415162454873647, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029992535385373314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e62efc51a216ac8b71824849514080818444fd23 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e94764d0fd3d8b34b32bb347e2bf372e1aa68c5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b8c1e995b15b2c273773fc11b3875e34b138c32b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..da36597ab1d05fac2a0633029f6de396e5f83003 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..93ac15f954dbf6456969b5e8dacd241db61cccbc --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.47653429602888087, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.47653429602888087, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..060a6499d180b81e11fb9c05f6dbbe71d5897c4e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4584837545126354, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02999253538537331}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0300727231673172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67dfba0b99722d124ce66bd3b517ee8eeda170d9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5451263537906137, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029973636495415252}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b63606709aed22bbf3a13a423637b3944ebea082 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c1df9758e0a715f8a1c520acd83444139688453 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2ad18866f40b80617a70c0c4d8dbac1e89016ee5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..851f9060c258dfe3fb8903386a5a90487650cbf1 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d372cd173e4f6b2aa702de549ee2744c433749c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1233211b1275e2629758221b0f2d1bc32aa40299 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300727231673172}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e67922046dcc7a873b0ffdc58a7f2bd16cfb306 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..84c3609359e4a129daed7c86b6867450fa3d7473 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e238caf8aabc3bd9a1ec997e88191b0ccc413f47 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd6e531953080c51a402f8ed485783cb4153126 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331327}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6602799ce422ed9ea2c696cf0c51d21064fa38ea --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.4729241877256318, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d76cc60231ddc1e1991866788a40f620b6468624 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..801ad281d28c9ad2eb74eb9133cec943290069a2 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4aff98b216e12d9ae4cb906e71aa6f06fec4e49f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3c3684d7538fb0071982d518712be1d6aea0c299 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9951b0bb1740202d89da3322e74a5dcd5519c08 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.47653429602888087, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03009469812323996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a57fb66dee0357eb7e947845ab2377c940dffda4 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.4584837545126354, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fa66d50d98fdbfeec97173e00933e64e32c71db0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049516}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4d7124ba3ed1fe752eea10c97c8e5effb2d1de0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915853}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616445}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c6b3173ff35adaf91ce646f972798378d8c553d3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330349}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050905521228577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7e64b1de4b92ab22caa7bb8b3a127180b733c9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else 
%} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01403923921648463}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..382c2ec1f317c43dc0a83898125e27cc5ac876a5 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014043619596174964}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d99fd2ea7f7722a6c7965976cda1d5bcfee6c40 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5193370165745856, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": 
"winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014041972733712976}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec592650199b52cdcde0d644ea532ee4a779ae70 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.494869771112865, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051956064076896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..51850ec64c6869636f6cbb8740f54f14e5502658 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4925019731649566, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01405090552122858}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01404771839399767}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5f62e83dfdb3cb1a149e116e00d607e5cd5a5475 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5074980268350434, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014050905521228577}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014048278820405624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e6f5084efcb268c4ad297ad6393728035ee95c1f --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5067087608524072, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051220692330346}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..29b26de03f5ffbe858435b8603a294e8ee412e6c --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5059194948697711, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4a052354d9f9735939b11851b3978710337882a --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5043409629044988, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.0140519560640769}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f8e5f97b4dbc36a21157d2d0a403e8ad2cc03e5b --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4696132596685083, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014026510839428732}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.47908445146014206, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014040185494212945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b16ad4954fba4ff1751b74d44ed856eda452660 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4909234411996843, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050170094497704}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4909234411996843, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050170094497704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53f100210844b9c6e969ee41910e1117fcb743fd --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.48303078137332284, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014044390401612967}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.46882399368587213, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014025142640639516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5ce6596b7dd57e84cc8e01e1492d06346f43a0e9 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.494869771112865, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b5b77661f6f821680c43d212e204854ebe3d7eb6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824194}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48224151539068666, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404361959617496}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e6c2ca1fb31263e23dc01865cab95f5081895d81 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049512}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..51a2a84787666cd474b2fd6bb246aecda40e2f61 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824194}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140492945362904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..168d4ae45b2c90237f122a4f4f30b27b994ed253 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", 
"subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529022}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee40c93c6e869c55944f384c9ea762ba76c4fc6 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4909234411996843, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050170094497707}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a358f9705878dd91eb4a10d7979c62b07f0677c3 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", 
"acc": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..31320a50bcb739aeeeec9032e409ce40b6c2b132 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405237625922564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cb2e76a80049c2dd5ea62c86e8f3a5986226d265 --- /dev/null +++ 
b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529022}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_0.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cefdadbf2aaab6e4e7d5171121b9eb229b55b22e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824194}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_1.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9810857fc3ea982b6d4747876d904063ddbbbde --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.47908445146014206, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014040185494212952}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.47434885556432516, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014033980956108557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_2.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..04f56192fbcd1aa421c47c8a1bc179c8e71a4927 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824192}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_3.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b671da6745b8db1f2a7d58add894f2d047bf4f9e --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440415}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_4.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ebdd5bdef76c12805e1a2c0baafe44ab8208e0 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014043619596174962}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_5.json b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ddbd2d0dce460a99421bf3d2715ec75bb104b726 --- /dev/null +++ b/4b284b42bc4/eval/agg.4b284b42bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405213114691586}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcbb257d9faff93fd5503c79d8882ab8dc10e010 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbad27aeea7f799d268028a9e1906e53d951dd29f0a56cd7358518aaf3f6e541 +size 4105040 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3094acf141469d4161bb04ce726695b03585abe --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056879060e5191dbac4f67289006f0e93fd8352eab9f1bdc24a301e59b4d4aaf +size 5065978 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56db5b0f15fa65e03105e53888208be49499e07b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d5ccd6bf55a429384e806cda3b5ff2e073ef2a8b25d48d6a4dd99f38ab4f41 +size 5967583 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..086ab8aefd7b5409b0e4f6cc0231644a68c0a2bc --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75cb87c42c37db9e3d02dd3624b34e555c44f0080423683b7d9e14cb8b99c32 +size 6901852 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fe3d58d28d761e8ae52a1ec011d3825bc4b9d97 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef67a1224d05e96726427e1b390dee4591fbb682e97c7395140d08de29199d6 +size 7805496 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94f8315000f734b89d02856f5e8dd6de2c308a45 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e987d88289987124da882ce225471c5b1c915a724e3d8ac0959769def839dc0d +size 8712674 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f906a3dff3c9220454da00964e2f12644c21d4b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82f28fedf2e51b9dbdf53dbed8483ccbb43415378be82a6a414a625d6df5930 +size 2968121 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84194f0f17fecb7156ec20bb23cac37464d32f34 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1916a166b0ec997e1b356ee7f8af2d4d5fdb63e29fcc20ba73fddf87416e16a +size 3650124 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14182dbeaadc185c5f48409088dc0600461f26b9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6dd5a1c67c690b1373354445e6ae3d9f0e2b6b7932ab92539021e457d66654 +size 4355491 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac5d336873af20fdf798aa96ddfe57a86df6860e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8c3c3f817ba5356d307be57b12492c9de9e99589dd7328d5f2b638bd277ce5e +size 5135745 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d28ef4ff5a23520e2371e4119390e5eea2f9abf3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96b4fbfe26ee067a54ff85f44d051b0b4a631d5db8c7b57d262c974b1215c8dd +size 
5907820 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..affca64ef138b349e03a53b9c11eb15ad982d2e2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd3f0ff6445e30be16085e50ebb45733b95b997e447582c187c89a2d2d2d3ad4 +size 6657412 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4a3ced856bb636307e0d6f3f5a9ebc63b920f4c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f07ed0b67f9fc4d7f5dcc1fb23d4f0e7a9f37ed3aa848f10678a2a8c152799 +size 4682982 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..862105b420dd16767103bc27a68e187a87eaff68 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fff1c00e527e8d82c8a1086b9a7ae9cd99f791fc28864504db6d1f55d2abc21 +size 4790571 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0ab9894f35bcfe368b2fc95f9475a1e1752e86d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61dbb0024af6748ea6784abfe4a02e2088768e95366aadb67f716a2f943a58c4 +size 5143562 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..431a4b77e59a1c5fcaa83a1a104e65df040d2437 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4352a5e32fbd0e8b58900e98a7f9f389265778a91753a387a3827e4a40fd0a7d +size 5713949 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15dd84e36bc6f0bc0033ee98d4b3c5d7b67cd4ca --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3e36785a87d559a7784a060a11a1d54f8fa298a5c1d17f4f862a7030764b056 +size 6362202 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa976b277372803430a04c91eaa2f9c9be52b757 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8185ba7b7a0de12ea868e995d275762325fe638c57e4507600756d90a9441785 +size 7006166 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6a9bec80110fcbd83aa1ce8c6a2b7b8716ec8e7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bd74f25d91550786d24149e73d070b86f394b736904291e43be3ec9ab72ae53 +size 4543612 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bb3d272af1a463bd775e39ab022f1d6fba5b0bf9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b8bd4d0ab73cde56649c02b1ea51b145d543eede42c0eddef50d63f3675218 +size 4843844 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fac05fba3b54cf7e12af660073ab636324beff7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d9698f623fa13bd400696335361e587aa5d48684b13b1ae5a6415409a4544c0 +size 5785710 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5525fdc181ee4f0c6d2bb457748dd30c2c1e49d6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff110aa9c4acc03df6df608b9304decfd0d3f6ebaff004714467663274aa4831 +size 6626133 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2d2167839e787696cdc38608aa4b05c5e300cb1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6ff1b119a3a6d23767d4a0d4841a2ba144b938f3e3124d25277de4b12bfc49d +size 7463451 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..285f52aec72cbdf874dcaf4607e7a75ad561d405 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e21bb0af349289dcf787ca46416daf1e23ef5cc42f13d221250f31e9440f62 +size 8336999 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7512423ad2a4b0077f5732d2fa063f614d5234f6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc2568f1cc710ee5987d04e5a3ca94a094b152c64cf044725e0a7f433edac71a +size 5911977 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a02606e012c2c9f4254bc4358577c643dbb635f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b81b2bd625f3f64b8a853ad7ce3cc6b76316410b1cdc772d312084c8d08df194 +size 6514120 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63a1e65853027de32ece02dbb5a24703394c0cd4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ce1edd615d3c82fc8a1bd3213641ed2608940517ce6e419a0de3429f928a3d4 +size 7667013 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5aa09c51ce572f5f5399f3e72a399f56db4321af --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c97a56651a733c32eaba05f3aebf9826fd427df64dd80473cbf742bb430d63b5 +size 9034769 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a33fe933b0021003a5f2958617305b677ae11ec1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58d6db350cdc72bd5149c84806b390688813b9004f18bfed7c17526e8b75ae4b +size 10379273 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa8f4999a77da01d97fd341681738dde36405222 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7071ea6da3868c1fd6bb111a4f42e389bb6204887df95770c79d23e5e42df6c0 +size 11719831 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8aa5bda2c272380a989651bdb3b684e32ea2609 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff2a0f4649c26a4eb32ff18b74dd0cce825bee09570c5c9015fc06033f9793e +size 7906103 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b493996dd8b67f0b67614ce4697ad7c24c011413 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d49a6eb250d27a7b32aa07ab401cfce9a41ada136e13fd295db3d7759eeb20 +size 13551361 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..346f87d7176388ca4127b04ca091cc9b1b6a72d6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35735f83965792820572751143d0bd59bf42ddc4269a7cc1d4fb6dfaa0088e85 +size 19235059 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3c72b3fa936f9867e6c18efc3911e89d980cc08 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:869bd151bd7c754e46c5d24dc11cf068e37350d9b8d79a295b02c4030c4e8b80 +size 24709744 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17925db5e06049a5709f7c55ba798d892b707e8c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fc4c0f7f4868f53b96fa833a1b628430e364cf7484b76ad022ba9d433953c59 +size 29901439 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..acb1794664a975631862b4e235f9422d197d0f1a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a869c310b130de65cbe95c27ed446eb7f990dd84fea59fad1333d9b0d0cdc523 +size 35293790 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..72729cda278d26cf5f8012ac2a3bd812edb82a53 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c1976bc5f20c5714df38a265ebaedf48a99af66709472e3705370fdaed0116f +size 7858690 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e360c8bfa9af065d6c8cf934b4d3c2bfbb91560 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b341da24f69538934dd3b82951526b7e98f3996a73ace5b50b73e328f45611 +size 13566197 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b6a0fcfe17077764620dabc19286058c912909b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82bdaa2da0ef05f67e4dcbfe549a3f3403bb343e4f4f7157b07ef874cbdb4070 +size 19287642 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8967f104bd3af383e660059571fc66e2c7aa560c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65be8cef466ef196fda9c5bed4f5811ecf7dde2d93f4115ba47e19d72550b850 +size 24788406 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..205afc871ed6cd589925b016e6ea0a219fa60e2f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da1d831235bc713743a6d89666e5e4b40ec1018a7e2a8120281a33d659c27bb +size 30012868 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c26d34e38964d2f45f68497979d745d4238cde6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2032565df8b2ff475f4bf320c0dae6a26b827c58fe515315a9b72adf764420b +size 35438451 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f7fc3222d72a454297f6b85a0fc5d789873a956 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48a522cdc2229d8a7013fadd70a1cb1c2f3b145773838f4c76751661811d698 +size 7965093 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a254e0bbdd4a7a6a1e680bc828b4fb4ee1f69722 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee5c2e70450126705f9bc7d5cb6774e513f0d614435d1306ec776cab9ed1ddc +size 13687241 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd784a9a9e9429ab7aca54ed1b4de67037ff5a6a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99e6487ac38813a6c5184a520267e69c4bf0e39490bd0b2fa467a82a0c029cdf +size 19399935 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c5e01606c45ead6fb79a6c425452e5f1e1a4839 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:901f79f6e01c3220819a8a82e3e6420508676bd44ae386e5f084e543ab2b7bd7 +size 24883622 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bc5fd3fb97497476943fa538d4a982052060e68 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:598d5cf3176776f972ccedad11e1c84ad9b1f87dc1e116e3ab17797d48d1cde0 +size 30127264 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41dc257a01c3f7e67468ee4471059eb52c743206 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0366440b04b78628f99cdf677dbf9d6b01df13d9dca08b2ac7f97aaac59dd21 +size 35580469 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba4547a73f218a6f03b02f4ce082eea8b9bf74a6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a1bef5974bb698cf5c9cb95b227b64965104dc423de569a9f63375f9b42c50 +size 7701515 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08f72abf5a50ebb966554a12f7e94ad247ffe3d7 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f41868d8799d2e8a0dbcdd2c4d2b9fb3656a2aa45dee2492f990a3d964bbd1f +size 13294053 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92660e8d4c0a467c29be251e00126b06c668c398 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d6fd50ce0f579ef0aaa0e3a1364e143c02a1df89f508feb2d15f24febe5b5a6 +size 18887563 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b0aae0e2adb3e3db3227ff22e2cff5c4a3b58aa --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1db46984c6f4d12043547eb84455c2e1ae5f106ba81ac6d4bb5c261d7c4f72 +size 24309956 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f87d448778b5997928a641c02c7605063a5b3a60 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68f36410cb73768a84688c88ebd8bf0fa96946c990865d6a5a432d8147a1cfd +size 29467168 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..312d94f5087b7bd2eb584db85bb4867b7a3003b0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad4dda7c5e8eee548b6de5ffd1c76f9abdf15d6f4e58ee69057d8b8a1ac3566b +size 34799106 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c202f9e3e7b32e8aed8ee29e124ca68fd3b1d512 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:866f3cf241e3b664cffadb93762ecac79d2a86c6e4bd2e8bc3849e63d41b3297 +size 8202032 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b95373d5599a1f0b902c7a6e8558d2489165a769 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09c94db8dcf12320f7f60815ef480d8069ffbe44d7326f8b4f3ce2f16296e289 +size 14055047 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac879b3b7376fc616ad7b5a239d73391ffa3076e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9794aa5473af7aded04898c82eba3adbbdd731bd79fabf1d4db82517a12e69ba +size 19881354 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a04d509b28f8a1a73aa418cfd637d17dda5c575d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d6065bf69997a28a9a1db9d2e88a73551a4dbed77586abdca344543db3a7ef4 +size 25470787 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2d395521eb8724b4586464bcb9022cbbafca34b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e12a7e3ca4723bdfc0fb9fccb9b2c75aca466786532d4a0dcf577038baa422 +size 30860418 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08f37661f710966f303242aa67b67c05c9402084 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b43f7ffa17f0d83a418adc473cb210485988f6dc293753e1ed2abb828f5cdfa +size 36473616 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5073f204826e4787b60d22618a2b45c7b19c451 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd5a8c124f9315e3f184087a8887570acc8b643014efc42ab7979b5313a7523 +size 993597 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a2b59e9eba5c36fef47cfb81108a425c60698c8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eadf86bce34f4e77a9e3cf6934f862620b6300417fe2fe0a7b28236c4dca835 +size 1452464 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd4b47c19dba7d9088dd0282d0bac86f9f7dead7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:142e3c444aba355cddbad5f4a6c89633d234c5cc71dac05c3e5e6a91b5cce37e 
+size 1910907 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89e19659382b910e7fcbb93d39a4343babab6704 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d44cea42e1d014eda223c27a69da9b92c68a3222eaa3595e9f34c0dfcfc4191 +size 2368272 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48f8ad8ea809a73b6f259f9d092f6f38c5bad776 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:565f8eb9f83731125e670173662261995a80f74e7cd42133d065b9c224a857b9 +size 2822667 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c46c8d3ccb1808cf8dfa8f0874da15e883fba243 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6a8dd98a44d36124a4f0b4dcf3943a42d25a76fd018c4df09036264e4a19be +size 3279046 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d547a4de266bd3cf593ae5e569aaa2c566c9ecda --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690fbff917364e834bd75674605aa260eadd58013b949c4593a3d5d2ab3599d8 +size 1203113 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a88c994d35ef8c169ac981574db63adbc06ddb39 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9dc5bf75783e5b87a228d9e892c95a051956d17204f55823a84d300290ffb28 +size 2304045 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e147d4008cf1bcd71c532d19c5fdb6169bf61154 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b104348654601961426cd435d011b31bcdc3dfbbcc42bfd30be2d2120b7bcee +size 2852446 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce88909acb842862a7e32da1c66ea896ca4dd690 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541b193783b573f69a5ca5abc201d3585238e40c56dc6f263539b4fc6ca47056 +size 3397895 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c6801463798d823223fdad0fde1583b6f4c2bd7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37745b7cd228fb04b62efbb76962ea8ea9fdf547251eb1e52b6a96aa92c5ab4a +size 3945208 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e574bf1a5f499ca225ad5719e1d2ab1e3226d0d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa0aa616e1a3e517de13311ff4cc8acdec2c935d6b27d23f4cb092c6962c76f +size 1008000 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64aac9809e3038be864ce288d3f1c25074316632 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a4acc0f7f8c825055750a84ab8af22fc12b95af70bf12683a736d2b0f1012f +size 1949605 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e46a16b9966aca74c12b7843bfad4612cc16706 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08341dd963486d2c8f0edc7700224f13be670d7ca3cd435c351fee8c612c6898 +size 2418933 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8208da84b5a820bca7cc7377f2d3456127f17b8c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc5f3e1c865c26190c0165bfb442e3253cf4c5cf777f3df0d657444d0395214 +size 2885589 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ccdcf65ab57312588f9cb295ed9bd562e4d6abf --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f97ab8fe402707be2204db24aed69f6e73535f4a6f45927ef643fdaa4ba084 +size 3354060 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6713e3df585869ffcc4cd879bfd626b12e3d726 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ec96bf41dee09bf5bbdcd1e1a1daa9f449faa14ace9434deb160b4c9c21644 +size 1160152 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c18cb394df74883e052b9b7b01d6128bea238d2a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d699d68f9a0745a9831c14a4335ec7e90849b0e48b07f15e99eb0a424be8ec +size 1668485 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95c526aac377da24362e35660ace8dc620e297c8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec3cd0b8db3c3b19acb71771d2e84f4296074b4c7ba6b14d2d6aa9b4856fc63b +size 2176591 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9574eb552eed930d555a94a7f063fd551443b699 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4849edd0d416bcd54bf3ecd5fb2d73d4a543357bc43c08d9ad653bb53a04e92 +size 2683413 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7449c6474b95ade2803b5e7626541977f70cc56 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f3181c097d9eda1db88a30686a4ae47fa126fe176e6e5186b8c2582dd12612 +size 3187416 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ef5db5eb9a5f126b11f4ff0ed3ff7eb90c9847b --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d21be747888f672d585b5e5b9234f2b9b0f04fb8efa25ea7fcba269767ffcdb +size 3693429 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fca0d00bb640538500a448019d0cdef8d3bfb4ce --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00ec732ba967d7fc8ba940a104e86681225d43e6e0c5172194a5608dc33cfcc8 +size 1027068 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1419fb5fdd48969425600895f956ee30f44c4332 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88bbdf0acbc300a728bcd0d7cab64c76b3d61901b575aa3c7da18e3187a4038f +size 1980571 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aab590ac11fa310b1df7f223e0bccb28b2561d0a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:613ee9de1d2524f6ceb8000401ef7fbdcfc79cefaa90c14e087ebd579cc1bd9e +size 2456007 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a89a2a9c9637f45adfa7adbcc4bbf8bbd5aa1d10 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7e3532b7161d0efe137b9eae8f43b1c83d1f92ea1392171e1eec2ad9b2b6d8 +size 2928627 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..701fb5a64e38aabc77c272eb00261a54bf644b27 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f1f1870d58c78e36049377866c5984d01387a6322036d98aa2679af531e0c3 +size 3403196 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..a74715f822d47e3e5957a69b413eea77106ac3ea --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56fe99097e48204e9d9b4b82dea5586e8be3df4f39285aee96d8c3fb6842af31 +size 993937 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e2ba9d116483f16916135619e59fbfe18c78c78 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92a4b71cfd230e9fbc8fc38633821862bb07639ff97373c3d28873293185c497 +size 1447654 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49995d5d83dd3cdda1d20a88871bea820db8d2ad --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:664326aad58f86e0bcf5143f674b7da5dd6b7d8716f5c27d1f6981da25c972f8 +size 1900910 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..85d02535182ec07c54645f5de2da3a717fa1b93b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6942af47d8a294a0e9090b344746fc74f92dddece54db718aa6775a5026aa54 +size 2350908 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..feb88f565f0df5fc31c8b26b1444af54ba43e724 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe67849e69d8c99809fc1e9942423ee6ff8b8e82c6dbbc4494a22738c3f99c72 +size 2800563 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a39a6378e0265304fa8ad1ac849ba8bb394df48 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcdf0802ea1b4b9c3fd70278004f581596688eb29f897bfc4e269aac6797ba9b +size 3251264 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e0b84f31a5eb3eab3280ac4bd32f009bdbfad10 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8655412b9a42f8604bc975b2360d64f3b46a6129e38e9e556f686cdd8572250f +size 1203513 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b772b6982eeb0245fe93853f5ae3343ff56597a5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356ebc691140d0e5a38da6ae528a3821f3f7a9e216a7d30e9b543add49ae49d1 +size 2294128 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95ff8f07450c5b6ff16f26d166b8724cbe556e43 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef942a76f3534007906a7e21937038db2216f4569da78365f4da68b8999e5651 +size 2835012 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a34a19dbbd463520c0ef526e0fcb68d91d51fb01 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bcbcfef6bb6a2b6dfbda1ee163dde24f3afd5cf14640a5462fbc890869f2eec +size 3375833 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0a8154ef7150072a19e404506e49279ad800398 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14ba8ec4490fa9b21a4ec3025199beea85fc0d0beb2cfd2a8f2439b5ba92f74c +size 3917481 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8079cb6e53af8de311ae0a816d471c2bd469cad --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b14392ef3d301ad45f611a36ebbf7798f9ad60f177ff71a819e43f2bb775220 +size 1008396 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..632ff6794ff5ec7e9e02fac7e35268918b4e92f8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c262d9a031b0ad7775620ed0f5f9b8621a30004c63d8441253f9cb7dc6624725 +size 1939796 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14f564646ae79d2b685fb7fed493f5afd4f162ba --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5849283e9431520ffb7cb6c4801ba81f2a73ecfe1b52fdd1061e7d2b6ae25ba9 +size 2401696 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..624a0e3f478cea827b0f8ad6bf71d2692d1a4214 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07cdd12eb5aaf8eae8aeadcf8fe0aaebb5ae0ac2702ef72f9201271769a7a208 +size 2863618 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86622bd8d45f66b894a30b5c3b82f5258371fc8f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5556e4c159ba0d76c811049858b14f9a6240559cdc33cab078e30e81a2e0ef14 +size 3326466 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..25e40e5cf38e6aaa67ee7a223018b68b49e62427 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5a03e6f5022706b7a25cbdbd2167b42439e5b78db59ad164218af0f2b463ed +size 1160568 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9dd32a78f981e35b6db0a8231f2a3cca4342ad5d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8d4c3cf6c461a1b52f366240cef30ac02cac11d06b9c6b59f94e9b916a8ad3 +size 1663695 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3803eec157aa41bca445506e42cf4e22c994df87 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9492f2fa4ffb78f177f17bb21338df1b52c26ba8516abfcd29e9c890b8f628b4 +size 2166210 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ffbc5632998be1bab96dd3f624912bd27b319a0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940dbbc3b8715b14ae52d3c08468624d4fe6abac1d898992d8fb65ff8fe0398a +size 2665429 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f086ea5249cd176b1fbce86d3f2764eac506b4aa --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944e9b252305b173f2154b3dbbfb7a922aa7d495dd254d7d86e734921e5b6a52 +size 3164495 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0656908221eba674f28fcaa98659d8d7e140a6a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dbfbe0fa8d0ec63e042050339a02b526fd0ff3c4c30b9ef284f06599e6d9c7d +size 3664694 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aeacb2c2b0b65c49f9f887063304896fb9b8efd1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92a91ba443273655c0246f1a52188784b21acc811c374eb65be70793aeda9c8 +size 1027489 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a3b43ae556a9b74348e0f1d695efeb55679e062 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74df0c62784bc25a8efb0cde7f98ab1df84c7e39c6e06e659bb9abf1f34d7399 +size 1970772 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed618c76d2e7a266568ad553fda90b6755fec69e --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7acdc17e630be4b5d4a966c2cadc9f8fc0f28b5718d1058333e180b32ec95792 +size 2438768 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8b28b3c3c3ff1fcc84e17365a6dc9caa0e22180 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8bba28862fd72d4b882b12453274960f40d2869878f16372f0cd4e760107a4 +size 2906678 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b58eb1a8661bda4cc200054f997dad864d04705 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3f53a3887bf0a02ec9996adf3de4560b07a1008d2a52fc43bdde2b1d82c49c +size 3375582 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ef97cc7628eca05c9ae45656399ea0ce4d952a6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f600cf9fe723c0912743ff3fa3526e92237e2f0921b83c5eb4433b554400bc0e +size 1169516 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b13e5239e139417d0fecb704108bf212e5d4a647 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ee2f95c088605aa170dae4e7855239dac54f6a919eb8a545413a5fb0b9cd80 +size 1699081 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06d55fd08e94e49bddd35a5b4658d83031aa38d5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f916597226317be5ed1a85809e88d3fdf8da48e932dcec0f6db4a6a5e04cbdee +size 2217845 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..384f0477fb24b5052f103dd71ebdf505936f3e9f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2439498223b0d9cf311f10f610127a479fd8b19ba32490afe5b73892acf373 +size 2731216 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a539688ea3155a2a49511bada357c8c3e79e74b1 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b2eb300cb646fb256a63aedb8e7c02a6e94207f0056da83e437dbc543744a5 +size 3248845 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..135c90d126cffe193dbced99604f3d5182f70783 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e722e970b7c8e443c57a9395fd7c1d3673cf7a869b7549f6f52acb87cf877b +size 3777379 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3be85479c724781c8f05f31dc540f17e9d771e2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a814698952934bfa578ae6a6d77d4cf356be1e93e7bb815ef9bea43ceb2313b8 +size 1421172 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe3c0c77f0fd332fb2b9aebbc2f2949d79b9d340 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8436189e45f167b68f379e1b616cda810a1807bc1f8dbcf03908bfc6beea0535 +size 2061859 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0782b40455a82610d96af7d006ab1c317bd2b807 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f8d85bee06a41883025bf9aaa7f276aa7b68ac20a1ffe07154087aaf59419da +size 2689249 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ea169cb4e0231fe062f3667ab85ff94e6efb879 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d674501dc92e8dd76b0f29eb6994fe19af0a9572fef4ec0cf47bd7e5e6941212 +size 3311948 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a43d0de192075a3a6298901ee8c1496be64a475c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d5775900a8cc67dc9083fcc3fe95f659e67f8e56a4862bb30c5f8e994e70e9e +size 3938692 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19931eea82b0f5f8951a502e4ae5898dc9b16a6e --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f565e365ca14aa36f82ffa82a6be6e7cbe7f29229d34685547d8388134ba51a +size 4576501 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea322f5b86730dbd923d7305107df0ec3d1237c8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f8b3f6a6d9188e0b5d6fa414d21fdfd6e3bd3451a1c26d6e292f498ad716bd6 +size 1186755 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da540c9c5b771e5b7995c976b47f959f97362a50 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b2e421dabdeadc34d1d9e16a66893452c22c12049661fd49ec4a4451ac4c8a +size 2264496 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2df1c621dbb20d1c114a608dd9c3e2deb004f51 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f89907696b1e74d923b335b0973a13a47aa96d23d39f90f4e72364f87d15e7 +size 2792090 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a0e010bcc43ce1e876006a797054c341cba6e14 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dba3a7680af01603d67c547a4e94ac3ae9f085e4faadfbe8e2e0bba40bd1aede +size 3324274 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad1c300b569652908b725cffa8ebe515580a5732 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0004bc10a9c45a944bb0f6de184db5a641a36f7fd1937b814aee9974d12f83 +size 3867323 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b87a7aec9ac1ef4cc47c90e06ad95e9f7e0f2876 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3dcd584fa40439e4f102879ef4a7d60de074ee9c03bfcc742b28793cbb61754 +size 1369745 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf7efd6bd3ed2506edacfe7e3c2eae21fd783bcb --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d338d1b1dcd532d38baeef9e90a995e30a680efa33b068abcc6386b40cfcef8 +size 1959099 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..180cb3a479a9889336cc7fbe30c572109ef5f5c3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e465205aeccd306d36a0660c47e81bb04367c46721c5d06c98ddbaea06178a +size 2537322 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61e010843751c0d366b2f3bedbc50c32375b9ce3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69154cba777e02f7113c70de7664b9b410a000c4bc74db0c27174cd6ec381e8a +size 3110265 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..790ef92914778b77631cea7062cfc8cb2b4d5b2f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:effbeaae911053b5272c833979d9026870bb1f0967273f21ebd26e26cc78653a +size 3687454 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84246082c89df86c47c9d37c5091a145a58c94ef --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b58b20e3b97a1f36f69f21056d9c43480762da25c8a498fff0f439e7ee975 +size 4275621 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b78b968a8df8b30720f47e2711a46ab6bc600a1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:374a75b710b7ad3cf8ea2e64a1b48c775342be665af294b050bb0d1ff673cfe2 +size 1209971 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39933c2636ac39ed7b9cdb1c5a6051e44e115f74 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f7d1d6d6548fc5354424fbd538b705ba9ce9f7bf9a237569f921608baf3021 +size 1760743 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4a60188b061dfe62b21578493896df4eb99bdea --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24ecf1bee99a23512ceb74b2a813dc20b179405b522023a28f8228d981771d71 +size 2301645 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f57c7a717a63dfbcfd1107c2830ff5f4387898c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b51ee677ffcb09b204595a6f7ce5dad5e097dbb3f50a5836fa5654377ff59890 +size 2836526 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2361d3c78f724b53af8a5db5aa889d895e249da --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6bf145a083a64dc2b9ebfa3e144acad956e20e04b1642abdda4c176cedfd27b +size 3375939 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5894bbe1495905567ae52d06e033583894fbdd20 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d17b4c9b71a1066ab9627453404b3d51cd03ee0ab9bd6a3536e97a5684a85768 +size 3926203 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e091efc79def7fb17e2b32dc7942347ab818d07e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e118c727efdb3efdedfde515b331f91d2f4286739c2ed74a19bfdf133a51b50 +size 1216777 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..972fe71f34703958a953efee8d14c4613e35b012 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:82335ece559cc4f831f6e56340b5c95e1075d57ea05bca331eed57579a3015fc +size 1670632 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..628bc3788ab5fc1000c8ba7c357f03682cbaaba2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d56f8665159b1551cd57b532eca9888850e8e3288502a5891e2a60c4f7180ee2 +size 2120084 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d7af1f45d95c38871442aa9da33ecc9a68fa805 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27db8b9fd43ff909816736dbbecb77b4f10b482a42f3d2bf0721f72bceb8a103 +size 2577615 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b2e7b6366d9cf40e8d50224f524abed0b966fc9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61edd68a0bd439392eb3d4bdc8064cb82926fd73add557863521192f194514ab +size 3027517 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6d8af2561bfd9843c6cb4df5d5e7674e9c108a3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9836ccfcd30921129c11189ee45bacbcf9e99db9a14a56117cf277dfa9233a7c +size 3479076 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0af2cd3b0d09dd5d959ac1b3f1af5aaad351b09 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e37d34bcd61b2ecaa170dd41690f158a4a43485f2a964b9545e8750d0cf15101 +size 1458157 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf26f343896b22ff27cacb4d8fceaf556d9c8cb2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1255d778e708262cfdf805c7ca6b786f2d38baabf77afcda2ca52bbe280418e +size 1961189 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..918d6b3702a87b314b31b5418ec0215b05c7f437 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33986b11022902b18fa73757b6a1c66670b5c125498f0464ac582e9f5442bcb9 +size 2457818 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ec9a49bac9f589606b8934c22dcfd0b2b8fb6d0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f10fcb6be6fe0fc4f6852a80fecdc90d13a6e5810ed96b5fd9d3c348f031f97 +size 2963824 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2dd33510e4c98103e859c177779be29c8df7561e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9962d1fd020993ffd75beb0f7ffee514bc6c01cef387ca0218dac924294b0fd9 +size 3461735 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dce5d4f56699db86fd0b2b8522191049272f03f7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd450a3605e3348a007f355b9fa3c5c792c644144ba1632625cbf9f839d69a39 +size 3960936 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7309f6f6839bf0f3113f7fc771f30ca74983561e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b92149d674882e86560797499a6b83511fd105eaaf4d4b7ee1e59bb84a03fa +size 1505759 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..471d0219b2c4c89c826cb33fd32e1e58a8195f83 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f64f5cf6abbddf69343cd076b03701630f777c2dbaddc224aa48b8d9e1b7a39 +size 2032865 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aaf6850dca5123d8296ad4a27a1548f8fc567694 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e732615f3f0531ee83eec3efdd7d277ba9a7a1f34d415a195a02cd77e23e6a3 +size 2554100 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..d0e9c2285c825c5d3b453a867c204613c5b3cb8c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c824f63f3984777c6927f0b59a8943890fc9db6d090c4e8a72b64fcdcd47ff7 +size 3084312 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df5c0ab5accb97b0fe7da76070d372e1cce401f4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e799fb67df22cefd3834493e707b54ac1c6a1ca7aecca77213d2ef7ce22ad152 +size 3606583 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8802eeb10099686ac39b11e9c9ad9a0fa34ae35 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b019f82cba60f6ccc746bdea54e24b7889861b888b9b853d8297490150ebaf3 +size 4130450 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab91d634dff071a79eed69f96fa55ff2be752c60 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ddfa520104af31874e492d4e30ffda047da8082072d4a5a1cda83e411f35e1 +size 1202714 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..558452a6af403c843e03cad7dfc3280fc5bf248f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2d74191409713e5339baa59cf3b6149752771119339ecb60fd5f7a55f15026d +size 1638992 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75e951c95efef0f3bcf2760983a85d7ed0957c5a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff12386b9f3d2352777ea7a84e3580755cbee94385c654d89479fc3cf0dfc070 +size 2070864 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f38338fe43af4794a59368855ddc4102624e04ed --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:b586fbd6ebd76c4536679753a85c93f8019c2f58056319f9c99679722439f20e +size 2510815 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf0a60e6dc9b8f1b9e73c9abb9d93b92b3a70108 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858c479a635f74893def55d796153376cb8e6f5788df8aa5cbc5da45faf7a493 +size 2943137 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62ad7f7b00d7a7aa5ceeafb3dcb3e7416e77db22 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c3ac7f6fce937fddf593e294b55b011350a3c6797c99d969893caeade14744 +size 3377116 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c8e962f4adf54add57e19ec833bdee833f5a6a5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:144e5795c733ba5c368e13c64e16fab067ab786c215be73c7929333823ae82e5 +size 1187177 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0822a50d6ba0e5a70da76542f59ff27facbced1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2579707fcf1a3acf3202ff5ec59a1499469981ddd79f0cbcdcced5aa098e0152 +size 1557890 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef1c07f870dc8c5fb2e3a5449dd37188b9c9f2ce --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56f95c1b17133f9e59ee215edd542885b2acc4b04cefd590b619fbbb77c94b7b +size 1922014 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b772bb90142854669190a441f751751955bf7a8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0f65022f2ce7898b6bf7168c0a05b22bfebcc67d2db3e4c2c24c74565ec408 +size 2295674 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa71069f7ebdf59e2570f8e4e44ed9127b1c28bf --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25e3f8d80b2804d3d11bf5767cded7a2a1afb601adef9bc771ba7988bbdadd09 +size 2661204 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f30dd5d0393f856db1be8c030c29a9e73aeb2cf --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c607913d33d5db05b54ef0c645be2d252786fe078284f294869a015d019791 +size 3027841 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9dc2e48764b772f33af8fe95fc709b951f05f2a6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:192ef0e33915952ebc2bd60203bd22f98ef67f35d679786d3c12b3d704d4b8b2 +size 2351241 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fb7b257ba06ece9591f43d8480b92614d94e2f9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7409027e3ca651327b45d88639f6bbb0e07dfd9e15979a23570420cf2418564 +size 3174164 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13f6dd46852c8cf16a3f7232e6fbee328306a2e6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c4a3d31d4972e4d92206805ab28eeb0ace52fa3c559f7f5c1343bee1785f70 +size 4008946 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0443c054268987500414d1bdfd80174faa84e2e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d0a7878d7d3bba5acbb0b2b73048bb4b2cfd95c4730f4ba209438d19cc54330 +size 4831612 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70c36e55a7525eb4b9b449ab7f1b664c52941cce --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f854cd27c0a7e45140cb3ed98c08d40a1c48fd5bb9de08c7d454693860a5674 +size 5662356 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2fb6ed7d7a8255edaaf77555cd9e783f9b546049 --- 
/dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:210f9bbd2666776849072209234b346ee8c4242efbc05d4d694dba04acd472f8 +size 6494729 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e9228247b9ebb08352b931336740452e7932f02 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca077ccac5f869efdda103aaad5da7b9c7858ad9a3fa7bb9e3ce8d2a3a2bc9c +size 2746007 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fdd4834b8dfa1b487be2bba4afba826a52940ae2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c65b9b50f8d05a5735f2744d75b4acace911f4c4c80d98cadc879e2150b8647 +size 3649414 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40fffe06f94d6c2dad03d6f47bd360a8ab255f1c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb140dcccf8fa872120632b70d59741792251175df7ced297695b89d80e6a135 +size 4567020 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02c04c29d41c23249d7b86086c24198849b16296 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aec971899050b37d086122d242eedb7569cfe83517702fe5043a0baf4ad1c17 +size 5470539 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b5e5e504b81eaece5c4786a538ff56ea4906a8a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01dd727a41811cd2fae6f870499d079b210c0fdfda1827d0280235acd8539bb6 +size 6383176 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c8bd9d044fbb8c43822199c79403967fe7db792 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7d54a1ca1aaf06460b0db2e91ed06713dead1dd4a980c754c480fac43fb867 +size 7295541 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..0f43656c6ee57fee2b17d1046a4b88c9f2ec82f9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae8c852644d95fabd24f6f14a23b0dce73d3a798963cd47738e2625eca799b31 +size 2843009 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcb82936e0c6260850e8d3566094ca637c6a9c1a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87ba9f89343a15e3846cc73b2adaea14449579267fe2a0925489cc78199a67b5 +size 3795876 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..453bed161a821601f301132971315dfaf52ff53e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f9d5aba859ee227818f0dade8a7d2b42d72be8181ae47d27292010a7548537f +size 4763381 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f751114c44a94f46f51f2495bcfb455cee5b415c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b34b8c3d3f6a7949637b17be006a65c6217a9ac0fa81935fc01b6bfe52176b7 +size 5716267 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d525709634eaa62da8403d46b863ce905e37c14 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb679351a89c4046e124f280e3209948db67fc7f0c77d9922f1651951349993 +size 6678195 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4decc51b2f82e2cf6f8d96ed4b36f252ec69195c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:234291582fb2a691593613725a901f295a85567076c736cfd149535c59025331 +size 7640294 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5fcde41a6059bdb02c0c40581c8411178677377 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd7aca12449ce935878af3637f088e82ce44e2def9762d3c926085c4192fb4d +size 2322732 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b7008659d3a11dc7bcc685f7cc9e0f54d619727 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29666b47e001033d0c7c72fe61db3c0b7dc7a2a8090244b6340325423984d66 +size 3110021 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a2fcfecae8ba756bef40d378c6cd44c9bcddfc2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4205efffa9cee7b98ad81da6267ea5247bc8f3cbe9d3f3c0e60253360a15c63d +size 3909160 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..839252fcc86e6d004db48a6cbab2dfb16cb9157a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6971589d1db2f6fa5f9c3598cf2b969428aba0e8daef8b7a71e40748f3fdf0fb +size 4696184 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53bcbdf6c5fd9144b9c6c123b9775e06005909f2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad2f5b98c22a7819362ac8f83ada5b9964ddd6633a8a2ca8f6ceb87d5bfa7a2 +size 5491290 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f49731c34dcdd2436b8b7b638a86448ce76b1ab --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477f9daace1f27377c5bbde3a0f4479d6d776a61efac0753cff8249179fb1d73 +size 6288023 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6aa01ea282446b931ddd0ba352ee7705f145a4af --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c5b06fbdf362ee4bfb1d0ce1faae47ccd2e9ce3f9369544e618022a4320014 +size 2197132 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38fc267690c9edda931b2baf53ce26d3b1b0eb88 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0e6702282e1cbb7fec9e25d8d22aedf50ceed6cd2f512c5248b10f1bc6ef0b2d +size 2832038 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..312ce77505c9cb64b471901aa596b84729fd59d2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de20d98a1f83b46a7dab6846b6f8d96b573296a1f992af2ce79ae762dc12453f +size 3481072 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bad229d395798a4bcba3786514cda993c023cfbd --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef88d932802c13c0ae67866c1ef7424a1434bf6b928638a9bb3fb46891aa920f +size 4116003 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a583d5584e3585f6b40052a6450db70ffca93285 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:498d3b25b75799643f6790d8e6cfb65bdb3e1c5debb8c64c9633103b33181ec7 +size 4759949 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f259841b7a48721faaf8910ff16fab281550bc23 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1cf871f4f53e543067b0a511306f701e5ba4076e975e32a0340f5a24a7ada12 +size 5403915 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b58f0e7a3826f9b3031e0e4197d656120df59c37 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeed6fcb591e56d5e79d3e8a94c9f0c0f35113423a61d0fadbf266a820b19ca8 +size 3641587 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d71296755d1964f22d2f2f0f978cf6b0fc2daca5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:420f1d29defa0b64175f11c80f539e64d76c37d2dd8f2ff6f79df3a0ade4d774 +size 5656565 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08704fda152a323a71d9e498eab0fbbfb58b9dc3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6799883f13c42ed7ab4a3101f1c5b8b6a55a373ab5956a608f5c9d684a39367b +size 7693637 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..366916ede214306718cbbf22dbb3effc74566c35 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce97259a7253968d56ec67eb6b11816552498bd65ce1c69c1d6c77cf451586f +size 9727796 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90ca2440df066a82f986348ac0c720cb0debb03c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e4b43c97cbba1c7a74db45691335dfb657d76a0897e6f835e7e181a1b5cf4bd +size 11768512 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b97840aa88e08fea0f1fdd2e4af524cfa498e84 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f8eb3cc905fcb3ef524bb152629f0a2716ecb7f814519d08d25b7ed49dae51 +size 13790099 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..791acb01685400bf6bb59c8dc90fa749d783ed0b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ac4c7dc95f9d4bd1340637604383161d804c978b661b2accbf2b5e88a7c4adc +size 3984657 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4907fa0f6fad03f585a5184f73500f6d6354a72 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e632237bb0a865bc37e5603264f03718b2e31e6cdd9270a4cecbaa6bf71d9c4e +size 6168138 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fbb9ee444b781fc9c2d83d0be2ccfb93a63cc30 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89e84de93fac779b413ba0d1031d0328b403c16ebdde6932394cf3ae42b00db +size 8372467 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68966e5f6d8e7698d92e316cdf89558141b8526d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e19a32ce8f47e3cace6adecc50e2e79659ac6e9c4f8ae90e142ca9025992ab24 +size 10573837 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_4.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ed621faced3045f3763cd06c3f8917a5a0d9de3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15d631d972e8c7d7dd6c6438cf6747fb8ba47643927f6681abb45cd828b7cfce +size 12781832 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..047c3713d1635f1ea1e8f281ebfa124111a6ffa0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed9abc9a06c5d5a045808a2371e9fe355abcdecb4f3b8d1446cd99ea9ddde0b +size 14970776 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3fb026797309ba319f56caf62cd13a47bc50cff --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe4d395773ff7fac6048052f8eb36a6d5fe579d2567979a9d3712d47cd83dabf +size 4041658 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79b18214016afceb44accfdeddb57d734f8741f1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb42d4dd9c9559ba35d62f35181ea061881429a4517e56026ad0d6678b8389d7 +size 6260996 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62875467a39261d1081dc675389aef9db93901df --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf119cecb149936cd7b8ff57943b43f6bb4b005499fe2e5252c3d5fd5ceb8aec +size 8501261 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8af9f0ec8ecd084fb5ffb1e0c8f02192efa0a46c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17dfa3706c1eed8f51eef1b2f550b593b5673bf011d9341698338d9ab5c1985e +size 10738591 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af4ab65cc68c0ece5afdfc76045f8adc2bbe6f7e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ed1433a6c392cf0886007434b5414ed4276e23fdc26caff8956b8293426d36 +size 12982421 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0941e790c4400aa2902ca894a0f36124b1174fef 
--- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95def126902ee0f5569a90cc2952ea7045639779728402eb33e395afd842d9de +size 15207251 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3dee1b6db905462e36a545eb962310b4917ae033 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4946eb47733896ac4f3ece16a555dc034401946134da41ff70cf7b2532372fd8 +size 3664156 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73ef8c2d1ab7457882a4d3d904d4d3497665e2a4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df83a6e60b054ad9ba7b8835455d53ca5d43d161548fae33f6ce41e513c9238e +size 5688133 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74328071207fddb9814e46531c26534da049dd08 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:462062070d545d890bb65ffde690ec4955eee3c90e5696766bafffddc58c9607 +size 7733078 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb7542caae64a978152d34a442ab92e84d01f42c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b582c2cef87421f4dff0f6c4b424e115a7e36b991b2f5f4b1f53323dda539b3 +size 9775480 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8249d37389e983d7425715101ea1c0d2b5853ebd --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cabc5eb2e920d84a07917cc17b035d70945ab92aebb564a4fd84d371e655ce44 +size 11824390 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7da4d5dc5ee660e6952a7098a0e4bb25dcd7970 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138e8c6a8114f641fb850f27ffba8ba510af2bdd35c4d3fe21e2381b3cb06ad0 +size 13854211 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f65156d174ac7bea73a21b4ecc2f605b265695a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c751374139a5e7f2ebf06ac1722987c5ec0bb0aa2ccdefea568f2d39af8db013 +size 3860813 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78c90386a93f47fd32f0947cfc079304512fd82c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f838bbab8f363a5a82665cea93b8596d6cace1c925cdad3c3143595642a553c +size 8140831 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e60325f77ab3ec7b091c7d142ecebafdce3ee36e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575921c3f451d9a1ce3031eb7000e87970a7722d31f314cbd9ffed9e4599a38d +size 10288719 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e9819790ac91c51ad46139cb03469c142a1a1c2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d4e34ba4c0d830421ae27367f35d0c724d342a6ef91477a811b3ddd04c8188 +size 12443378 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6fbe0f75ae6799bb2aebd927fc7a7f6e4e53dce8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bc2b1d4bca2e4ba5f0b21e4718cb928ddf0f2f0e5e852265f20573ca0da8cb +size 14578894 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4a6777c0f84850ae5ba485cf47478aaa32d3ba4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e554a0d40167b28fd36d827c5723eac3441ed5aebc0a998c01787989908ec4e +size 55153 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1728e31ffc2c029f2141cec865b7fba9dd854972 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3672e56b6e91da7cf5a0eb77f9d0676c2833e9da6f1d3833b767f00e80078f9f +size 77973 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37c05584ab855aa97f4155b7e64573ae28bbbac8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fae273d23eff351016f273ace6e04bb00ebd78eb24e327ba5b089da28ea693 +size 99588 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45bb2ece41325f270d3b089e3e816e63f6241a56 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abbe4645376521ca2902023f4fc169a597e3081bc5cde9d8a5c789363cac6697 +size 120739 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0d82182c9f93b2c95629f6a0fcd4adab9f7f7d3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a6fe1d874d481f960fdb174c5f1337a6b0e8e9d3ade46e65eb5bf369089c8c +size 142858 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..505cdd5c9281fc7c181f882dd1ebba2132534add --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:130d53b2fca44b74220f9804e5d790f5a74decc610ab5478dede43a705a59264 +size 163696 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56f8e33de4534ef7b2e0f1ed4d9fc717df1790bc --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3920f00ed152d48f456d1bf5604e3c47be3b8b8522f412873254f7c6e1cac13d +size 120824 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_3.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c1d227b3da83ccd1c717de66b6c18ab49f63357 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08d5aea87fbb125f93e5171676cb916527b8a766810f55ba72943ab37214a7ac +size 147065 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e6b5d8aa888c1d2077513f196bbc48977b23d62 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de5131e299f05aa1c599224111737eec64ac73dadd034d2d666dc20ffc01eaa6 +size 174263 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d17602da7e21fdcb0775ee5784d539056e0fec4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c87e4893d729617bc48ee26fd137a827c2d5d1765ed47f743b93dc08f924a0f +size 200175 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b95f3b7e84d0eb288e493fb98d2d851a049d45b0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:692b43582c593b9d8655f63b1ccae5b65270c30cb16af57dd9fc97914ee6fbcd +size 56306 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..653b357c20d5679d85b5bd4b88c6903b008d63e1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18d55684f769a9258c902bd759a1add8766b20d2768a43f04cbb47e58bfca082 +size 102067 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5e14457f8d654b6b2d116c2031df112ad084388 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54aa3f898f4b65d7fac99caeccde98cd7a2cbd8b6382b89dd141ce18f7d085d9 +size 123880 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..17bdd29787d119774075903ef38c22c1cd858120 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74fd387435716c6033776c9b7174fb5ee3a99676f4eecba167ed1e1ecf2b5fd8 +size 146663 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b4e7632abfe7cc7893dc139618e45ebe603457a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c806a1b18253abb90054bf26feb5a3939c5e78aef932716c63d8c476111f822 +size 168150 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8c075f9ffe62635fb38e381396a7bbf192c506c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:197810c113fd957a5667c059b1675650621ff9adead106c967efcc3ffe1c2b8d +size 63901 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75ba2414191e7380b8f671993eaef72da412a90f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633679c30d02d9b3aca62ba9750fdc3644813f152881720fbccd664aefc5dabf +size 89652 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad2986f83e2f8f8de11d7d4a3af7e3fad7db73df --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9970234dab055e5b1579dfd5457655c964a8bbfb3d34dcbf948d674061aae432 +size 114131 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db6444d8b22181c2246506d188d0bae6bb1be737 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e7c20d7a836c11b308dbc5d0e82d3f38935ef06ee2d520d67cbe3b3e64408b +size 138152 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d65aeaf06bb974a00fb252651d34f79c85e0b5e4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dc82131b19f583276940ee7aff1f9094cdfaf4edd8d06a82734467c226cb51b +size 163148 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe50b0d1089d25c9a360fad702222fa2096e0760 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a647a53ee7373d8185cd3de4677167fca35c3172b36d81896ac2087cec92742f +size 186816 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aba98bd5444d546d6c04d20d267b1cff19da739a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04114e54a98fc536e89ab3028e9cf11450253e64c30bad032becdeeb5eb3fc6d +size 57326 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b13ea572022b99d621e0221e52103cf40f5eab76 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6de90cfa16df53051319dc18c97944fde257fe9f7a666859059e9abc53e656d +size 103749 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f03ea64e5521388483239bb547483106ef740826 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de6fd6aa1126d83758713e63fd76caad6b5d351985420c17187cdc76c7742aa +size 125894 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44b0f1b89762a0914d82893ef0d551105433928c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3662b154cd65d2a9c27234a61804cdb7c2f10345057347ced451e50b825a9b64 +size 149017 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c66f3d166fd26dabb79c2b05ccfa744f593d993 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05eee888128374dfb94b6b4ef07952dc7a790fbd9df5c25ee4feb86509938cea +size 170842 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f85ace50757cfe61ebe83db7fc35fe5f28691067 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2448943cbfd4d357c37f0099a76f458e7183f9de59336c92b9ba848a359ebcc4 +size 92252 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7ce010f703be826aae9a9a29b9d2a439af01439 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a58c53bfb70c8adf05b50f567595479852b041e48911a2c3841e3e6279f63f96 +size 111573 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a74aaa9d87a98ed737bffe271dfa519c07198401 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e6ad3d92c122776034bcbc4d47001a28f194d6c8db5fc4f4d79e4184f38f7f +size 132075 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efade6361d510219629f708de3a50c3670aff64e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:316b507bc2fbd0154a0d3363747284d04541ed5fa86d7b3f3790bd0e1b872a02 +size 152209 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e90effc4da1f7888f6adc5e3140465a44666fdb0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aba75f01b62fbf091735f75ddcf9f6f0539af7d623d40ab3f060aa58ce34ea3 +size 172034 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92bdd5907d7a056b7f198e75e5a05810c7fa9d57 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16370057d7928033809a95f4c5abccfcbf7c5dd874e68b3356c572ec921ff56b +size 191956 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec630a7f923d77bc80c065643b962010852c3248 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fd0c85aacd55672a062203b1ce886db4e6fbfa61627aa9a292196e93b5bcd8 +size 87866 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..1260091475972475f1affdf8567be54403b19bc2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c983e509d75807ed25a4e3942a45aabc2a13b2cc22d89c7db9dfef9c566339d9 +size 105056 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db5f03bb467abb8cd005be5ad2d94a78612feb74 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a94acb6c29af416907a2f1ba2d18c69cf5255fb40caf3d1bfe3c29fec4aa2b9 +size 123432 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbe7a806c46e6d4cea3f0b31c68d744cffd54c71 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1de193812c53caf29ea9110d45659468983a3a8c05afc62a22834baccb67c22 +size 141431 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad0b910b7aa38bbaff416d2fa96a852fc82edac9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1fe19c3d2392886c99df85a6e7897592cae2bc82d0c5fb98836c5a0a90d2fd +size 159077 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..894ab16f8ea139a5faa6629171788737d1c49065 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abc7a291b496e1a5000351ac0c91b23c72baa72f99e29e3a3453826f4861adca +size 176895 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3ac30e5ad8429a41341beedb3ccc66c04450872 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe11bc7fb4b3645343db31fdc68ac8bd9b55b847418b46d60b1f288668854de6 +size 85167 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be4a4ceda5688f93a147db61022da29be21b08f7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb59951aae6b83d3481f237b4708d9970ff8d675c735ed5073145996a37bdae9 +size 101245 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9dfd19a23e80cfe279d28047ea0d4d9fd88a2fd --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:26d749ba624959edf0eb8d3d484702d08ca870f0ab021098171793c0ef2a93c7 +size 118479 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5772f593cac4cb3d59268abb57edef5e54d678d7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0fd808466a974f9259b90962e7d5a9107ad1c5edf3f411f516afd9dae4e5f0e +size 135396 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19c7398e934dd48350383c9719fd27a9d75a9259 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430728bddf8052759461e15ffbc9e26806332f22ca1943a2bee7972c4f479c60 +size 151935 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f91aa0a64c62762106f8f194cb21e1efee0a1e1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b096fb8dc4dda664c97b4a41be25dd6c0a588423c50eab25f340f58e2b0a69 +size 168750 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c0741b226710199f1a05ebeda9cbea54f3196ce --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6721845792fd69f7e96b7be9a384a261ca5d978f5d505515e70aaa6421ead05e +size 96758 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d5112d8e8de3cfe9b78b42ad7da2c9b9da88eb20 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e741e66b9b909389cf2377e20697b355d72ef365d5a4a1aec421e84c636ed056 +size 118281 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c84c49d1747bd723d2c02047d1ed5ae9d8c3b5b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b41849e93bea83e857bc27f2aa84b48a7ec38a3518ed782a23144b41d748df5 +size 140931 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fd368822153922ae0749d74dc3e58f2221f8479 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c25840c05357d582f21addedf55bbe9a0cb7ae64cc0e61df3428732d9dc476 +size 163230 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0b05b9c165a3647834ba58cbe9e0d08fcf36d61 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff67f73a696f1eea13f146f0f61ff423dcd3699414d6cb0bb40b3b772657b4eb +size 185178 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bf77e2a78ab392e7251328a45804f843230bf48 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d59417725b0e5faab3c84d484e722b6737600c540a78e2c6bd2894f2fde659 +size 207298 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6eca25096bb5cff8d2b01cb883bc18fef2583154 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c91656d8306a810a72ee6c9595f49b85a0aea8446108aa8242b6d263b131f0a9 +size 95785 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23522b5989ba3c9b4142db0dc4cf8119c9a825a4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da03b58c2f3ca051b9d031d3f0bcfaac7a1bac7848ffbadef54fe468bc09fbf +size 115666 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c136fde2cbf39002c98ac140d6da2fd0f47baf8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3735f406d2b058767c5db86c272cd323ef952a074ba7600a08ce09f937a6c518 +size 136694 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2edf363b49972ad30089c2aa6e4f45c0dc82c2d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb56ee113b2221e5182a995773c7660a89802bd4320eccfa50df7cec676a0815 +size 157408 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5292a6628e77017a568c819c7b656d6589fe30f8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450fe0c4a9d63a21ac51546edf473d15f0a69a53101036c7beba8c6a4e51b5fe 
+size 177737 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b878b1ea2bc5874c43bd91131aa0e091551c2142 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e50fd0b4cea5326ec24dcef6c6446b6f6055f55808fee852f1c3408be911de72 +size 198345 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9bc0f09634ab947e4ab7ec15f4e8f3fc9832610a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf188366a41a7b3c4844a6614ce2407f2b1f6b6cd714a902e0deb5c0b695655 +size 3565840 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc3215f5e157315bd944803eebac65e2352075a5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a54cce1ffa2cb6356f82aef91a21766648d2ced307c51d1d651647147f5f2ee +size 4462899 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6418fbe31ebd17a0519c80cf81d09be1b3d0938b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4044074d6b1e913341ef7e452e77338a04983269585d19022dd8caea64175179 +size 5413029 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e842cf2f8ffc25c820fc3a19b755e270e032897e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e786e3d65f7b442e1b9fd38af2a6893b55cadadb50b66351e3d53ca68cd7f363 +size 6357770 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94a64e12d9225eaa6a9ae50c185cd15c9e60a99e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c67ba90f39ecfb3fa73db19295f2f674886b32cfd529266ab47d71027a239d6f +size 7295055 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1e9da09a10f6822a8c407f96bdb079eb25b4fba --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3c55b4f74cce5a47c2304ad7979bd9ed5a2da9bd0a9bad0420ee437d9ef492ce +size 8241419 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6adbb8bbad421509f73d6aae23da030d9e850892 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c6f1ed9855ed133269c8891b35605239924026f93807ca4cb24483ea2ce29a +size 3449558 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ddcb939d2de656545cbd74ad7a9ba5c624ffa5e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a69d4bd070b967553c9d5a5c5661e580c387451a67d469e8443938530d6b0e94 +size 4353976 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23bf5563510d7507c4364accee1e54b93ef296b4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc6bcfb54f047a84a636c6d4ece60af91d2a554d3380df95cbc18b18cdd2ce1d +size 5268753 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b9c47a66790d431f2378bdfde548b2fdc14aef51 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01efa8aa7c327bfe448eafb4c247ca8f1ae60a1094b60b2c88c167acfccbbf80 +size 6180636 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..614ec4bbc71e89ee60e51bafd6661b1f56ef519e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab62960ab0c33e41a9cafeaa6190697e452fffd6cc022f2f6badd09b40ddada +size 7089304 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbcfe51d0e4333a62907f8877653c7fc73bf26e8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f377da3ddd80d157984d56c3ff412e251bed75e8c9a6931abdad33e4e1a2edce +size 8005890 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef0a6fa1741c1cb991ea00d81107d6a7a65bd380 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7c76b220eb8e70658218bf49e18be93d0c0b598c86532868e2b7fc7a4e03c89 +size 3596275 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..313fabcdd4e71558ee343a31aad53b41087d62e3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b2048dfd40a15c74554c503c8dafdd00dafb7c653b18160ed5647d75d25382 +size 4897598 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00c9fa15e41a3e9eb4d7796a55aec941ce515965 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:855340064d211067462236d0b2d3ffabc2f14a1a4d7c87108ddc36280f95d30a +size 6076971 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..935f6db5d3578cced7081bee2eff3b7a7759202b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0579e15c8b90a8e04dc8137a6e9e82990991f6b7595ed931549def1dc7abe498 +size 7178206 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce6f234dbc426d2b25c3858f6d9d747a1e17ec6f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46eb9c5ce2829ed84bd16b30d12a72b764709547f568cb3eed95d912102a4960 +size 8262728 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23e241418ef652865880084d6238ec45f087647c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e92adbc66dcce4150a20f409ce5d63db09659f88201ad69b260487d9ee82a1ed +size 9337123 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ceb692a30f06a6e818565b6c7008043d173c35db --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047bb4bb5c0f7c110adbd7df8921662a5888c4066e75121f15b617d59b21fb2b +size 4470431 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a7540c5f1aa124c104d3bb0b38fdc469b4abdf7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cae3f41947474336cbf773976934f08c955c651f98d0d9b55e8e31cff7e1bb2 +size 5032559 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91a8d11eb60387b3a15d47315fc896ad5f593a55 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7db604f148e8e193e1db44a3fbb54defc0e41376f1a948149d049b46a32fbc +size 6116037 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3c5b4b956a587c0abf3f3f9afb6117ecce69820 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ce941efcec6642903bef131331636bbd8b88eba7c6eb8b601323f30102f3ca6 +size 7196637 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..208d8e89fe21fdc8eb3c8163abd6427ca49c5263 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b633f7e851f28769579fcbf86266068eb712712b7f713dc952612455e937d8 +size 8280383 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3519ecaf902b53bd3e022c8bd36dd5b291f40b30 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5718d7e647150b42083b05d5aac3d728ab5136e2b71ff8abd502723db325fea3 +size 9376942 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a35ecc44f39dc0c0f3a84c52715396cc94057646 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b3c2859a8e5ff77858b89b070fb436c44192fb45d10c51448d8cad46eeac7f +size 3185986 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..775ede722c5767d5e64c4ba23a152545d1a5a6b5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c51c39136dce9c160ab6c47fc16a8a1ccd6a5e911c4b74ad2866adec15027f4 +size 3908881 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b9a0ba7fc2d7e2f5e90746937aa5cf286a6d569 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea96d193d107891c46b6a41631dac0bdd1778cbd153cd8d2b2e0da569fcbff31 +size 4699899 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..55d004fea85fa3ac1e5e2033adf177d63e780236 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d2c02a195c093e518728902d2127357220d52d68394209251be2079848ba71 +size 5489911 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf058e24008296c4bfa9b5524e67cd1a367f2d34 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd8a9b6cc619f06460ef383c044aa34454cadf61d2fbbad2f4cf967d63ab02c6 +size 6276330 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79843320c1296269d8fb35f86b49d063dd89a188 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586bc60341588d0abc1f1fb16d77aa4ff6602f3f9e2802505b9c0e30efaa8269 +size 7073116 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcd6b61cbd6fcd980432c354f9f5b321eaa77c09 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cdcd5917b7b30a6161694e38f8421e9e37b96fbac6a8419bbab8184e2223113 +size 2888946 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..8f4c6e88f55704c8d80a61cf7fea262c8fb5dd43 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8403c1693b3c7212ea587271b1f162be388f76bfc2db646ffa0a2ebd2339dce9 +size 5204289 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b9d323ad2028dfd807c7e69fc646488dd01880aa --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11e504ab6c3070fa908a5de72bde1ea8f2e1f115c216fff9e415d99299e85d1 +size 7496295 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..075cff53428e8b8d6452fd1cef9a4790c15a30d8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1331d1480cc71bdf91d8a609f467f67de3ee0587ce59807ee1abee210d1bca +size 9780652 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6902c914d00d901c561ec3eb51a3755f2e357b38 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f331ab228eaa7b22672b388bdb8c43307d4c337e168428340a6374adb4a8882b +size 11822884 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c995f3b341bae189743f73dc2e3f1154664c9ff0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8d846b22618ae850eed6a5b7248859e194e19430642b3a4391aa33d11bbb63 +size 14076800 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bddb331b6f8a597d657f2fc8b0329e186ac2be7b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b78c0fb0592f6764550366e735d8ab74a3b23ad87f5ed58b0e0331a8df94226 +size 2775109 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af3e5cb5001ec900819a9efc26beeed05ab643f3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f8cceab05a1b06ee61c3d629a8d91060ade4ffbba575ff985fbbfa39cab839aa +size 5057369 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d36c128b9c3c7878cf59d4e8dfb7ee411606bed1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c0c6dba7bd211578aeedb12ea2efd071fac07b471dec185452267943a8eb16e +size 7316790 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29706806d830a10ad61c77c634cea1ea171827cc --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d75c4e0c7feb10743eebe5c21d53c66471a4fe007f32b0a455fa839f054035b +size 9571450 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..007e3efa5fe05694ea644cddb3062e5046e5851c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90730616a20cc31a442be469231eeca6454541aa245a7d1f0d45abd2d200616 +size 11582858 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47dc120635dfe082bb3b487320b5f78faf550f96 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb2f8f7d00565fe2b5a0f7f8f443302eaeb16ae99f6dc61a5c8f952fe60167a8 +size 13794719 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7a2fb38f9f736719314c407e9047522567f1475 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a39fe7ee8b5e4e9cbc7f72021b5c548187225c31649a32b490178013c41414c +size 2787962 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e52a4d5f66d219806a88567c9c2fa9889fdec40 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37614203bbf0678a8f16d76997fdd17fea83ed484c011f08b67d37dc162215eb +size 5104201 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d252ed0d27222565ffc8f1ce123731c91d1c7c83 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bec8728e46728578cc0f83ee0a91f50a891c14efad7554a0c5a2cfb6d8f48a0c +size 7378151 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41ca952c74f47aad69a3805cbe17690e6b25669c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e33198fced5d05d5fb064507b085e267b12b8bfc2fdb8f5e37d01d61769f909a +size 9646488 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5cdd2f77162e67c56aa390f183e32163b0ec6554 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa12867f1abab2a3a79e3e8733f44fe156ad18d48ef36f303a5736c91a9ae679 +size 11672073 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce08bf6d06fc66956d573dbcb70db1b504c9f7a8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f59224efb1219b4dc615b2dd0325373d6e99e3418934f6f9633f08caec997b +size 13897510 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3e258d63763bcf2500782fc533f95210994de11 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4340e31e94146c0833cfcaa1a16d28bbef6bd2fcf82c03702ae31a6a71972590 +size 2804359 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d441be53decb00233de4987351868e8155e6fac --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c912c45866319980efe0713a43f403d931612e79d6042a5b2f93cdff80f1d5dd +size 5095516 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6deefb7a8f414a4b295acd054c27d3e65be8381d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e157557355b09c66a2492a6f28f9fe7f1baf3d9b81a59313c613bb0cbeb9d5 +size 7364103 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5434318e32958f7fa182b8407437a9cb293d60a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1c50bed425cf55de5e6d1e5a5c6c4f280e77f353068d38a69177090f90589384 +size 9630704 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60c280fb3f7408e27938c8bf7d364956a92f9e23 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8852a9ce1266458ea2da96aa044610137f7068350b1b0f93220a75035e305b5a +size 11657509 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f54968e4691adc04edb42c50822c5e1b665844eb --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b724fa3b36d582a18be0e428acfa2f30d826c588f5629d429bc016dc73100d35 +size 13882277 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d79b0a629b234c460b3a4fe08d16274ef5bc0925 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d825cdfa53fd5bdc55944e7699b1825d2ff25d35e88dd62d5cd13e744976dd +size 2846586 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1547bbfb99a86688fc9fa5e294b392cda844baf4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:031ae4d2d5227f048086f3b18cfce37caf7613d731fc37760a027b2d665f4fc5 +size 5162056 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..407a6db7a8dd6382c714b374d4e4aa20736cbd55 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57ea1c17fab83ccc71ed4379429419a813fb9efc960b377be92e0d0fbadbb18b +size 7449143 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e77e5f0f4e49c809577d2ff2e250be96a6ed238e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ec8cf2631176f1ae6f8c3a810b14337864cf087221daa475d8a95910c46fc2 +size 9729298 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..55a3306526f24df21e9ecffc18bf6d33633ba81a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3e3c13308f527b052ce9495e2b81770629091990d43a231fb623ad2ce76c77 +size 11770130 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..449379c8abaf3b3dcfa1545cf648431b8568a812 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e2a5606c35e3f77168e9937118dbefb7ffc5cc2d95c25ab5c4759a6c842af4 +size 14018649 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a699f966a0ea87adf1bc144d3f6dcb39bda2981 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6f8a5d1bb131f7fd26deee3f2c5a5091d3e0267263265e4db7c385c42ab1388 +size 2576617 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f1884c638dfe829d736d82005ad030680b2060c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fb45e707da399b8437fcad6b5536f3cfa1bb70ed67513233233b3042ba80b2d +size 3144993 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12dff71253c9078d388e213ef3f365b633070330 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c88dcdbd1843fef822f4fa739d8e92cd2a540225e94905ad9f423d0ddab338 +size 3714194 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61b885f8b2ed0de4a6956b54003f3ead33c020ce --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c524bc873a77499c3a1485d7b6db13a32b086d222c9bf796f941ceb73ef4a21 +size 4397526 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..acd3c1e90a8691e0961768a94ff6d1d65fceb8a2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a867e8862c4ccbd01b3359c60bb427ad93fdcf390356682cad1eb3be960bb274 +size 5176137 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_5.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dfd457902dc92dc253b672f35877c30f7c0200b2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca3d8e831b2bb234b92aa4dbf4802793ec1d6ead11fbb6c7dadc15bae326da4e +size 5914540 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b7a13a49d96c75c5e8f0fc24651526687e41d53 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ccb20c5d5c3953196b18f2bf4cf56034daa1983635cd0a2724c286218a5429f +size 2115935 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ed94a8a8be97bba1d516654b0ec4ff314f12f56 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a685727ef53fce47e82448a0f83f30b0b610ccdb458deb8caa9668409aacc5ae +size 2900909 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da7f26a367475db7a2ca296fe42fcbf6b2664a05 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6c3023d212ca201a3b84c23a7d0328af8032cf7726037d57c1a2a7056f4d7e6 +size 3683911 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d546418dea66f528486a359eeeba8540b59e4de --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76426755f9705bf9b7d59cd9c8bfe371cf206317247c588aefd537de964df7dc +size 4458372 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78dc81b1a8d42d229d40c286329f4cde9f0523b8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d779f9cc1318bb2498dff612d1d00f00691cc4bfdb9665f0008c823acb04377 +size 5254516 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26f8c0160c119cd79fdd75cc2658c6b28ee0d213 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c3d826d79c8802c8b94899a96adf21da14a64429addf8472b8f8865c29a4f7 +size 6040677 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..478960093dc855da2146e97fa5a3c8bb0434e98c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8079f635694d51abd5e48631e4709a22ba4138ed25ca99e449c07bf9045d5106 +size 3036985 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4adba36dbaa839be680e317ef7558b990d998d7a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:697d7e5edb3f77afffe237116da76bbb4e39111293e2aa834de9b536bcdf5a29 +size 3267752 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b533cd4142c5aba9b76af897b7ccaaf6e6d33f3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393bd21089e7bfd87633b7ca4d96d92c70fa1312cbfbff3cb0dc500892352705 +size 3508368 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddaf2368bca00168517360fa4c24b8a9f46918b1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5291d9c1ed3b1a3d19700cf84b0885d2075ab54edc17aa7762fc724f7dca6955 +size 3789857 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc3c76b48f63ff001e567c648d1936f3562d749b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a8a7ab209fb7824b047b6bcc794fa2fbeb9f7c7e4fdf22706ca52c0a97e050 +size 4071500 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6046888da1ee34fccf65b9c0be1552cd8a9dc049 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e864f002dfed16a0c5736aae454570b8805b934a7edc74f3e773fcde8e4dd713 +size 4347123 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db38eeae18cb274507f443e92f90086d85bf6131 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fe6d1ec7b37bae5b3662ab78623e555447052a034a47a8948c3965986ff33ce +size 1864129 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bf5124743db4d772a51418ac89a8336365f34ad --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3837c9d65a67622cba6912e68b485726769dc8918d4daef9ecb3f830cf6a3d25 +size 2557203 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c47eca5c467de63ba887e0799363644f29405814 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72db8a022c9a078ece4cb244abef9cfa919b866c4c18248e027b30edb81aff15 +size 3248305 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c20acb7b0a99d4c5766e4a70dac31757408073a2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:159cfe26e6398eed055168a2e5dbd7ece34c21945174887fa26a25360d9f9dc4 +size 3930866 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..315a84e5d616d7fdfd7090aeae0d59e11ce79a19 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abdf361636c5336b0d045b9ec39b32d1f26239957f182642512f2e126837385 +size 4635110 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..342711a815095c36dfadb27500d86a5c2e253753 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b35e434a52fb3a7c9ede4dce91ab17e20a56865817bf786f6283d2a69ed6f5 +size 5329371 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..549b8d0a363e8ca6cd5d90c6f8257350fa9df3c5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edbed4d54cab6dd27a38664c8dc4499d59683b0b0300a9323a4dffcec745e41e +size 2256900 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_1.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56eb16c8f0a63a2a3aa6bf7b42b6c62802062903 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168ed03f8277e2fb6abc1fdd507a7c7f47e989c6af39a909ff4638029a855ddf +size 3017517 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e544917f623c10c8c491a4502166da1cf4d52834 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64535366c18da0c561069fd85858025f112160fc5f4e57aac01f1c5d87a8429 +size 3773962 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b7270ed07cb50adab259783672abc3473154f6c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1055c1dfe6eb13f9ac2cd8b4c8284eb37efe8174d8ab6b354e0cc70e45c96133 +size 4518659 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0680a20c61c1bec57be302f56bec71450f752874 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b740cd1552f8e155e733415c30d6dc19ec02ff1a29f61ec3ecead1b0890fe921 +size 5293378 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1898649c6015dde835377057f8d025bdff41fb60 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33041c7df0714908508ee6c53334c76e29666be7e94f8b5557e93270a588d0d1 +size 6054944 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b55787b2fa3aba18429244146cdd031cc713400 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9169b3e138cba6fbc9eec30e625db1578a1e07eb6f2d8a45cbce3b9a4b4a87d8 +size 640035 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13ad13a1f111d73eefdd787826b87e0cd197fb8e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:38ca6b981cc320b3f15e601f68ef98e16813d45d8271d832a2cfc21c3ace4fbf +size 755093 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3dc9e45354c680bc86c86a1f9748f7e54240f6c9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c552a007c3db84b6628fc86851e93ec5fa484bba6fbdb7fe8c40b2aa8abb0a6f +size 871321 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..194d8272839643fb01140fceae0f51fcaa6a4cb3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a652dd3a211bc168acba24aaafa9e0e1a53f9d9642c275ac503285d226450f7 +size 985775 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..35a563f1075c9201ff895ba383aaaba34ec0b1bf --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ee422a67e2fe3bc9e3ffdc544fd0fae98bee82af1a6f4fe398338eb2f23a39 +size 1098473 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de50f0271eabca0e7411340aba71389caa001a8d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd010985ead08d1140776262e1ba3a0a67227dbb4edfe9e1b837e8116493cebe +size 1213642 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49fd2e32a6e6ab80ad0e8019a7d410d2a02e1cb7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5930bd42f22ebc6d9d9f91a70493f1a19de2a07421a42d19ca461c7c8b95f49d +size 1182546 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4eabc01e3ad1ee16ca36fd3a4bcc61f9a5ad9aa --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa9619d229a735a78e79466ba9aa35d02b9945db202a5b5655d3ddb755dbd25 +size 1779346 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af6cce5e1757906f36a95eb26d03e45694d30f5b --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4472215b2b2ea0363ad17c9d2101ab4d1b6f9602f60607569c26c6238534d6be +size 2388526 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0255932ed3b772106d7ecfb858e7992baa98e85b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1ef4a544d318e93ef42d3ade7127b4bc29901db6d040b7af912534ecc29c470 +size 2973492 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63da69a0b165369fc3e6fabb85da5a541da6950c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d60bbcf5d1cb49df24ce295457865962cc5b22a6f90959bb6c1377046e491f1b +size 3555875 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86757ce64ca997e9e47cd867176c850c499218fe --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae4ff578e40d84c4024d32d32f04cd857a5b5f75efa0aaabd5d0a5c96ef62c3 +size 4144665 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22b5f8594f332c35c48bd6a12f01fade9c086f9b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc413344b6dda9dd6c3391f5d3314ce268ebb8081dfac4764121f7fff8c40ba +size 1328849 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ffcd857bd21266927cfe39c1fd42507cf594d5d6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6df653d00aea3a985db2ad22c3adc16524acc180729a561e802fa4cae2855b76 +size 1524365 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3526e63f783fe40aaa388bdaeb972231cf308e31 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de3518928f2b75911e950e1800fea0601ca1c91ff5334150718b2d8c4d2e416 +size 1720525 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..dafa4be403c888784d31507a29af2739de91753d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d902a6d777d989bf41f35066825c2d1a2b65e495577e8e9f49f6dd953500ae7 +size 1915627 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c273cabe0316c14547329ab63c1c9b473ebf8de8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f8866fdbe8a4ffeda94a2ea75d641de4894a0a7b37de8200f8f4af64523f36f +size 2107350 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81742ac6f3e0265df737108beb4b2c9e56a1f15d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c223c76621726b41553a459c18a9051d7b77bacdecb91ad6d64deee39f2bbe53 +size 2300829 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37be5a17b6d0dda5aade70e688bdf9179a3d0a9d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d4a3a0a4a75063d07bf508cc66802dcfd7ccff086f0c8ce28a1049e20c936c +size 1935244 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..562552a9e16b980be0eb601c5005a40884ae5b93 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d35c5b161420c3971c5bbfaa5c1b9cce5e82e7fc24924939322d8c1c74327c8 +size 2636314 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5cc495be38456c05d7be5a132d523b1b6c9c9c78 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab676c09117d7a115e1e91b7fec045a1447fe34acac0d9afc0a93c84baf63c1 +size 3349446 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e506ad6cc9757c2883b831b59c66f70e664651c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2e686c933a23a7d7e46fe46e5b57183cae519667cd3e5a52e76326decdde4c42 +size 4039240 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..317ed29a5b603ad491e77025cae5fa7591fc9062 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939d904859b2fd55db1bb9dedc821374d623c1a8de76b7a88d404be169a96011 +size 4724538 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a72523253bda8ebfe044a622ac628c2f412691d9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbb7014a5924ce5019b4a28c8f471e26e0fb2a9f8e4b74a5b3141d18223bcd7 +size 5415710 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32dc912b6f99effdf4867296e2c54c39aea30573 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f2094877f392c84a1f2d2baf206652cc0e49a01a44b6e85c37c35538a97855 +size 1870155 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1269202f9a825a5d1fd9b958dc4e59e348a65c16 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc0f1fbccbe4b0cbf8e37c341548dc9a206a3fae366173387f8a20cba50a4bd3 +size 2545240 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec78283bf7f1764b8aca07892ced060221154f4c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e36fa5434d2b03f10d1606a8949d883e8bd96fe3d2c455377aee2ff6332dd7db +size 3232464 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e90c82476dcd33b9511c0d5ff746f92f6d52df4d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c135a962464f5679d7b777c1ec30fa261d6b6327169978be8eb4780917006aa9 +size 3896155 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f41de93d1aa34af052537079c0b2ad65e2ad36f8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:bae8f50a3df0386d20383d1a813ab75838a2286a4a62ba2ef4c3e0cb705b426f +size 4555547 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..379070d1bbebf5eb069b852e3826976f65d3caaa --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c0ad62c4622bcd00e29d29c6bd244159ebf4f4ac2bc4543bf4058f507d830e4 +size 5220734 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61896593397626c6a34f333066312ff81b88ebe5 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4d3d5d93ce2ad3f45c3bf88974c399917a8a90babea1d7546789f81d52ebce2 +size 2213969 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..136eefaf02fb3461d7ed5fe208aa6d3e407a052e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ebbd92927e9d10393fd6e0ebf4dd9c400ecbfa9c217dc478fa0d71ebd9e743 +size 2974578 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d28de6709dfb939ddc401b00848e07a1857fe98f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c851fa964df10dc779f8df8e493886578294183ddc53c00f34d283738ac659e +size 3726808 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a6ebc1fa1f2a24dfb04b4a47a2f122aeaab4ceb --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0ad448b25d0e8808fa2250a4b0721b08f8468ef0d16a7835bc3d77caab87eb +size 4478716 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2b7356e2b8c294758c125185880ea3e25c0bc96 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d753d551db9f0f9f397d563f1385df6fe14e408344d8601bfe558bc22a299d42 +size 5230367 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..cbcceba575740ea1a843d6fc9eebe14b447d4577 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c382381340181e22c3c5a64192db123d781a280da0a502aa5353a42f0b1b2e +size 5981010 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87cb28f79e0040f9a7aa0b041ef7f0c0349ea16e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02811303fda5dbab976dcb28e02e312580bfc90eae7471af92e871736f194bce +size 2360936 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be49bde682455593d3907d3f475cbbbc4972fa71 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc5bc45d6db503efe5be0dcc24a05b5b26d587d8f16b865ab247b187484900a +size 3193945 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbdc28b296917f0cc81a93cf39536fd24819ee1c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e973b394697620c83710743734205e03f12f15fbd992a1aa39681ca503afe1ea +size 4016910 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c88c973324adf18daa750b97eb24b6783db1e49 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce01a529b620464739b10203563df9aef5c889745b1c0d72826ab75bd318475 +size 4839601 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2784e3601264a43518849ff437445a4419c6ee66 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741710123acd1d6a25f858f68d3cd7f509f21c842aaf1b651ea23e529fd319a2 +size 5662595 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c3976348638d82b2160042c5cda694b597fd891 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ce87d65539a732b94f02e04ff7cb76e38889e2b25156a1a1d9515a0d7f16b66e +size 6484110 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2494df113e06d8fa5031cf279024bb0ebaf4a6c9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053e6975026f9948f64b2fefd033378132b6bc40d3888dadc9ec61e205f1a95f +size 1879584 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..090cf026a86e51503f08e548405d4f30a05b5d7b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9733747d668105f582e462febb483a70edb58ceed321ddc90d0ed99abcd90ef9 +size 2435453 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddd7aab06047e2e94402ec239af7215071fb1b4d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42c9dd62c378c4d4072ba520bb13a314cbac7bdb4a43b08606e970d01c972c2 +size 2985887 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b918cdff70c1799ad66462c0c37f83ea3db2c94 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f845419a7e4993a4002ca8b90d9604d78d6f940030e78ffae760ff2a4a3554b6 +size 3534488 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe66de48feb81971f389e540ba722e64e5d7cd78 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbdbe417e7a7faef8150a01b7d9e1127ed11b9f90b94c49f0e59f0ed92b6c5b6 +size 4082883 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2dc73e1903b5672497bd555d6f9ffe20ce2da402 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b82ebea572df561efd8aa2efe3c5de70b452865fb9ecffeecf88ec2b0cce76c6 +size 4629971 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..90059ec081914ac06e8d8f2fb9b33408073f5f35 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1fb25186ff4db38222e79fb8fddaa8a37659b80530ea5481a0174968c810d04 +size 2371023 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79df9931a1c1d37e63cf9fce9aadedebd112c216 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4689ca49ae79dcd5d2143f626987862dda916d2294f6779c164e8c351c0db5b2 +size 3210667 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..181692636a747a1f186a993e8c31802262214099 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddda0b568ff1b76ddaca575ea0d9b8a7748f2ea4f8c3f199a7790294e4a5aaef +size 4041356 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bcc66449139eeabdd7a1cda1887f7785fa67013d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da92aebca5d2fa15a1ca6ed4f86ed2a76ab810787cc58ed9a31d66c487e4b50c +size 4871736 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ccc78e3cf00647a643bf2bec8cdcd02a4d71a8ef --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fb41580115621082fb3898de01af0c22187b0f8d7f95490ac6d1e06f001479 +size 5702050 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2084ded8486643cff3a0797234561295145a26f7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d6e48b37e740ffd664985c95695e4f3e798fc29ea75007a9b9e8c21a097823 +size 6531440 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1039a2976e3253129940fccb4d65bf533344d4dd --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e0845cdda83f5c35d201986be8777f7069a8d7a849fa33f8efc2db4d55141079 +size 2343518 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e37ad88d53f6fb00449f6746674f20754b0f7c9e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58149ae7be858fcb7b34670dc20588092065b970f93df4c3af14036cb583f6e9 +size 3157925 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..374c923420e7bde545898b5bb8b90c9e1b4a2d4b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10867d679711f730b0d83dbc4986e63c58c5ff49062572a1f2274fff564a4b13 +size 3962534 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79c1a62787e347b63d79b13924a5160b590b2451 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7868e220f498650677e7cd9573b26971b8db7edf1eabcc4a0c80b2e1576eacf +size 4766659 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..882d5646b7fb05c5987a783c47491120422af5a8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1f2541325724b68614bef59495bfeaa2d938b24336c8185dc8c65b98ff1203 +size 5570713 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49b7af226e1d25729c0c02e9aada32e4d70706b6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ecea9a085cbfff7dfb56a5707e8c693dcf8cf97bd594666ca11cf7efc47d46 +size 6373693 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e62739371cad94693e84b38b45650dbfb5388ded --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d264d8b42e67833ab141cb3cd49720045256add9516d51b93adf72a1ee965c22 +size 250701 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c85722e881f9e9e5c2713d507850f9bdd4eb1f40 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e1354286882c19627eb503deec7b7b404e284cfcdf69a7960220210c8ab2566 +size 351377 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01f61ec67659849ca6aa25294cf48d45ac9381b7 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0afcfc3d3fa9edd49776f2012de9c9d060eba1bfa50e68f5b317e2b492a5e187 +size 449657 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c30eaf256c9668345c2fb92b33b7137775b01803 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e52d6bcaa8c076240059c32c43ec025bf12935dcef3e214bc91e4172bb26b25 +size 553021 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..049656c06252174bf215d5b024b2d48c3f022e5f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fc628dadc54e7a93e718a0b19494232ca1d92f76b03c1e2f379018243b773c0 +size 651639 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8505c8f06f3152b9e69fa5c89a9fc7a159b7335 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eed38cac6891fa7dee60fa1f32fd527051bda785ec4276f936be5d3def7b6a21 +size 747553 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b7c30d94a0116482fdb218643f83e7f8e4d6702 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0614ceec6ed9aac5fbcda023c86aef5010914e3ba434bf981a12eb613cd523d +size 292989 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..742e7e0acbdade096f298d5096f644527b1f7e4a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0ff8d7ef659427fe79c70ecc1730882e6c890c98f8ad2b61da2f86ba03e179 +size 535267 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e08d286762b3bca317bf0b3fca2bd7a02ff9b5ab --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddbbefe002e668801873409c54675a7adbb19083bc1981714e72a6eeef1175de +size 660218 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66bd7dcb72f09858c4e4364499b19b4140b25051 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac047edb59989d48c492a220d316bd63a5a11997b67b18a50f3f676ef3f22ab1 +size 780424 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8f6dee6de879754150b98f8058861a77c69455b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aafa3426b896452264a3f01e8001273a0d0dee2b50f9783e8adb02534731ac4e +size 897900 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..677f390c23b2f468bb138f1c09d3a43c3fef8168 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27b77d8bac1ab695ddc05c414dc117382fc01ff5f45d3a07c3edd36e0250e57 +size 258488 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8b3fa6703bd5f043ee135437c43e47a0695e476 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1faea5d8a6b9b2ef43600f7f43c55e3568da252b6027d978eae89551cde989f2 +size 363042 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4930d81e772494346e63fc99c9cafb8149371c87 --- 
/dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:688f7c89adebcad886088a77fa7130b58e84399bb4fc12dbaff1245f412d8157 +size 465254 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a92313c69b982a2a1d9a005cecb965354cca5a51 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc2788edf97ce98ed6e31a623f056a79d668954d84e4c8aeaff22cc201bd681 +size 572511 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3a99460d9188b09902345bda03498e6714b111f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fdbb85ac8b79797742c9e63bdc4aaca3fe72045904c219366b3f5d66d30240c +size 674992 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b573c75475457f00e9c1a1d5b06b1b5baea880b2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30f0fd9ca64e4115bcf26bae03a287006900d7e097510f575fcd60bb87dcf783 +size 774766 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec1cf06485bc6245c92c4e1919dac81788340b08 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59a35149d5fbe225de1e1875455323fae061cb7b290e6d61867b58f80003fded +size 261214 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c29a252b45dcf50d0fefe3c6efbfb9b9a0991d72 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03759aaf4c005184ea448b0b441844e138388e495fb7503ba0291603f1e2184 +size 471870 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_3.jsonl 
b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89792ce297a9d781ad20f0df2a79a2bf928f448c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37ba0c0e9a05c04ca50225e295493c0a74e61c917bee26c0c6bc54caf04bf4e +size 581051 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89fb2de9dbf5485a28c2acf7e6e82c8de3e631d8 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f57c47edaf6d54c1e4963242fe99ede02c51bcdf5f58d755518a2208a78d5b9 +size 685457 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4aa885f257e05aaf918646fe0023a0c05452535c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d10aaa1b13389defba4ea07f89a3eb2d98bd4705b5862e2f24c92a366fe09ce4 +size 787148 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf657df223f52207d66e4134375afa5126115f39 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b330753fa49fae967c6aac38bacdc3d1d7792079821b69e7aec0343746343b +size 262361 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8ca46d957e83e992b53846be6ac204f27cb620c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da23d3c8047020cb22dbf86001988bcc852582ecaf378aca81bbbb19b6de774 +size 474636 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b62dac51024b06ebbcced6da4264b6a05872f9f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59e9419299f12a46c6a4e922c63c377e4ba9a7e3c9a79fa2a125df7681dfc32 +size 584655 diff 
--git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afa93a4ddd06c140af5861fc212d2752b218c9b1 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aad9fa357f8431a3cf02a25bdf5511ea1bafbde4c3f1a332a1cdf26389cc3d26 +size 689879 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9071dcbff0d2620d67c316989950e4dfa553f45 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:949776c5dc627457116edd89145f0e078c3698ccbf746e3487859091d5567fb7 +size 792403 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2752c212ac04d683fd11d01a598a94785344236e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cb14a1baca1ed9e167a274db4dbc55c7ecc7cb57d8e8cc9ebb327d3d2a43107 +size 1039165 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22ff9773030960d3e151601c63b1be5aa90fb99d --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea0d0108325cf53991be7fae0b49d9ca12d1d98bb656504f605b8ec4e96523d9 +size 1300169 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efb4304d8781957023885c3ffcc61002d7691a43 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e428f9096dcaf4ebf74480d08517bc0e653cb53e0967a9646c25bf3b38130a2b +size 1561284 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fb61ffa75cfa8a0441d7c7ec354b0bf08d94f30 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dd713da81e4db669bca241636ed4aa9f8c188f9875179904c8e5923ee7854c1 +size 1822747 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7dcab8e1c17fb6e1ac7e7311c24beee236afabfd --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d0696013d73b994c449bf84ec4672da5f69c73c4770bb55fb92ffbb72824c07 +size 2083012 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..419f1c6abf14b9bb03e4a7960d146915af5c9b50 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e72d26a8cd93240e445bc1e18101b61e5322d793770febca27c1d4e9494c1162 +size 2343731 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b800c78377a6ce5aa55265d6ae4a732a3ac1534 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5298261cd4335a192e4dceadf74c1a1cde3cc94795d2c7b05dc587ca5bdc058b +size 948114 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7014ccf50523313784204d9b1bb3dafef3bf3bd2 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad5df7b5c6ed1d310a38d6f792729bc5f6c1cb8f8a0ea1ada18bfba920120a7 +size 1180864 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca2bd1bc40bd84f89109c226a1b362afb96ca569 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1415e38192b20874ad8356f3a7cea02d43d5c772d91ea67a83c2c0c3572d14b +size 1413379 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ad897f05fec513771ab84de15eed2718eb7a789 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d73f649a69b491307bb066d067f8a3379f9d231a762a1f162a2580ddcfc63573 +size 1646194 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c87967cce082413594f84a78dde54b1faf38eb9f --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf0b8104236d18f564cf0f27eb618ee4f45b836ef00fcaa1d47dab54d8bb3e0 +size 1878105 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20fb197979246f93aae2f4620996469e7ca6a781 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14865e4e5e791fa62eb37453bbb50e3c2343909fe6e71d5022e2d08b90b899a6 +size 2110270 diff --git 
a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ea45a433f3072daf6d22d11e4bcfcb38f415b4c --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225263fa892de1d64d3f1e2b05f0c94d467bff3824d7499659bbd49346511af4 +size 1010049 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6e02f34c5f08f6f2ec4e826a2c93020aeaf05fa --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366cd3deb76f9d50d8d206ac0e708d99e62a4499699d0fc356f8310b79cb035b +size 1243116 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a5d9173001da4aa85f88ed808b5dfb24b7fb103 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:029404d0442104a5387942968eec41cb8a281bbaf4105b6214ec8052cccb2f59 +size 1476417 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0b1a0c282d996584c0808c79a3857deddf515be --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75d9c80ee2497d2754f264d7d40aff1f8b967d7ebc161d54e0091b247279ff8a +size 1710002 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3bdbc2a0427e1e572be84605d367a7a5a2da374a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76dc493fec33eb33cd12d437c3d2aafdfdaba6e5685aab8181d0acc0449da048 +size 1942398 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bb7a628c54766d97f38486a78b9739b644742e0 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8202cc6f3e7bfe502cc3f9a75182f422c772bf12564e85315117877a0ea073 +size 2175217 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8b3225d81bfcc188a8fd3acd205cae4ac47d1d9 --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b0bbc63e2f6b3f1909a465e2be66d3c3283247bc246aa020e8eff2b492d4fc +size 969376 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..09609e64ccd9bd357b9f91266a930a7ff394aa5a --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa72484e634eda98fa109db3e22f8d5f7d7f0200dfaec34ad5be29f09bee244 +size 1205014 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7a033696155ac2293dab714848d37e401808efb --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af34c141025334f16b965f0b79719b99f257a2e0cc06dcd11cbb527c9b802a9 +size 1440859 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..531f7048084f5539914826ff4fd3089bd2950dd4 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b196b9375edd0886c7e71a143a1bce332aa64c3becff526752ee7924081120e4 +size 1676998 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..038b435c5513d10c4d516442ba90394429052606 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f197b4146c0bc203dee47a2c532dee3f7f9e6d407a8f85a480766c4f6c0e562 +size 1911930 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..618f9be5430bff8e3bfedb4163929faa1aa2a964 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c5f3d429afac7fb93701b64df0ad392e6bfe2f3d235a36154a1f523d02a773 +size 2147316 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_0.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a846b328a0109776048acfb6a8c627882f6d73ce --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcba63cf661ecf0c54655fe0058ae8e13954dc922a652563207c8c18cc88a247 +size 1016453 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_1.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6dbe8a5448090eb6f3165efc64d13b5a6fea99da --- /dev/null +++ 
b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9820370924162de0acdd9a43d4012cfe6b9dc32c4bca0ede4588e961ea61a6d +size 1257187 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_2.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcb2c16e5002e7b40148a3bb16fc6911911726e6 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb71ad4508313da4373ce8b9ae388698a090d707d0affa4a5ba92b431e1cc7a +size 1497968 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_3.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..560937c7e4ffe49eb6420f9c0941ea6b91d99a0e --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65718d102d304e83ccbfb2bf49485ba6dc9538b8e4d80e33e849591ec6076a4d +size 1739129 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_4.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49b1af862bf3a9bacc3a32c4012dbff974f44b96 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9aebaee8907c2a7e72e3eb95e802f18f796293e5a575ea75e15dfdcd2d4e51 +size 1979155 diff --git a/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_5.jsonl b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1d334ade98f1b671dc881ef06c0301d8972397d3 --- /dev/null +++ b/4b284b42bc4/eval/examples.4b284b42bc4_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fca646fd248cffa361e2092ea902e06c1e8c6fdc7d0999ab115fc4cba37dd08 +size 2219612 diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f096a071d0dd995c102145ae7db0dea85b6a15 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4273206525263921, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05364575256139351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07516332695488044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017494053619516534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3000002487080154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004693559983294075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11230037179769856, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021352190368007454 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03590952594711019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00111479086151588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14620648751654972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003234756290211622 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05344453588119793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013515518560690634 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07168459834878929, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015668817269601622 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.29090649446634975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004555120095447256 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10771019147913463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019671591552995436 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07152558346509422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001609325107400611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.28799432197952096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004451937577964067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10717155022915739, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019927955387855968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cc118b9411af385ee148eb385316ba961a52dc0f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5663724921835591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03793270967595185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08217512985928499, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015069260165856669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3920117360421087, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005374085547490371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12756709737220784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002030468381881967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03886435424085205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009357648884166175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1952944224324245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003818475633981706 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0604895960614538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013006193696080512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07729829270693146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013303444161253287 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3723170212800364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005024550440352146 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.12035361435811785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00181560137606144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07779518642109406, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014090714637609509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3702218835789373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004905029541255394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1206781467869011, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018904175388126346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..021361d3ef48697e4dfb923d8cc43941aec508bc --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6313130510239234, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02727704631142144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08219107059651697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013422258523153536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4151352004015792, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005332508949374494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12940251519432414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018563230360622849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03845495568474405, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008134611533833349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20877882384960775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003907086651198888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06086364336249341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011655118907789416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07626492600480415, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011790764283155961 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.38715675983971815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004836346504741043 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.12014889628673367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016294871735385334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07790995067901116, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012592690527861467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3934974316637387, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004930425413445172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12261763705509457, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017345722248776798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..edb5a2a838c5c10597ca0aaef61be96e63b956b8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6636681020720647, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03135011211987113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08335165133186997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013566594758271408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4363611958728497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005466144892220358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13179598729950448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018608653117238654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.038821951607757095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008309945118711307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21925719264824975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003975211126463013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06172653863702163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011849569250187196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07607458853736868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011617831771301823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3989579329521947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004797365715247612 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1203852422828686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015949578691764172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07866244698502353, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012631336576740743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.41007264952780736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004984839846868383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12430003630857168, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017268968711559307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62279fb027da1d419d2d529f65279a0019c538dd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7445914925255956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04471373508927592 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08335134599012016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013601845542222193 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4369343435538318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00545111390697167 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13191715691415712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001848451971487058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03875848226958462, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008040225568103941 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22226421540491542, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004032909658463521 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.061883789388597316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011527436830992247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0754476375575414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001174153361661276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3964743307100715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004828941243739242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11940663728897792, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015860574558582763 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07858636401791905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_precision_stderr": 0.0012777097204336968 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.41038517941804836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005000143163857465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12423777278329896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017238674224559544 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2f80ba9d717785605111df0ef725be26559902ea --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8005310739494581, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.033724293174082695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08338013300594908, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001260130864731573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.44613389749037924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0053462691621041685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13259315662099147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017356987872365276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03871058510533021, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007593773025926396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22607554732619956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004027973657557073 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06197974009288303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001091686612001906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07523239671078998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011063220739786807 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.40282994585844645, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.0046937441735825924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11956303513601427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001512266881380029 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07851137617969797, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001188072848467214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.41817890606333163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004881701441444398 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12469867862475097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016220089487202947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cbd752c0e8922e8225168a4a69fdd29ac74e366e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.17179568666388068, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027588098305790145 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.14967776591519644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023741585126890443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.13734703076547916, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020799156371082523 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.012844648594530184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000953695060736493 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.01366618248229483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014161977749096548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.011184314741657642, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009686746779174165 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.14267159823314632, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021803516732606496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.13040116723987202, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020787261762492937 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.11556695658268057, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001653530715898047 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.14166916768123394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002522392923923697 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.11682503514042476, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002049865434311089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.11016382456596313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018129599146147113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.33659811457161004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07179334370346768 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6faff3d2d8e7a3777e662462998f72792bf64a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.2979124156767924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0054955941631360024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4891593464751427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.007011942792981163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.3274178122458887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0049286280426292985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rouge2_precision": 0.14521802379162835, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0037085106298636638 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2447744707424197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004896941862479997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.15888598086244612, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003356976364238643 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.2295708148490674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0046150064407066435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.38128693273071246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005688750994881648 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.24956913127077388, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003890488336637862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.26398378931320715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004954506205875008 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43612713385035845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.006362125054891411 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.29017150876669406, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004434001029770126 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 4.108172938460409, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.15815575820636524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..69bd76f19df8f2d8baa2d70b2b4cf6abeff2cd8f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.4187592773209161, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005626717003828703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5928704577888301, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005099101322054699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.43240476392568056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003817266189092974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.23096493796741716, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004461915637136568 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.33273997806822503, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0045935501380386535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2342525950120652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003402274339943465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.3309444597203636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0051028624371683194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.47140835027491623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004716481756787479 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3391427202561202, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003536978206670861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.37021425910295114, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005128361770056663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.529932604416665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004898567623492216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3827941384172546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035523699230254864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 5.777777933818976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.14012798168135124 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ac8d1dc1b3649d0b8493139e1a36d6b7ac072a00 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.39649918511817167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005575303775752109 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5868851616945063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004874648360119763 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.41628159920481106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003759423443598214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.21990121411366634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0043685198653270335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3314774836213164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004469810663401372 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2270035578776477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003326641885288619 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.31489793141614947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00499101931869006 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.46789790447333446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004497579692650062 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.32821478779416957, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0034635730509386077 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.3510123362567013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005090798191984392 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.5229838686288929, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004678146069876245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3686101872995796, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035306120812830873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 5.50955888983536, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.19407341974434633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bef4b00a5a8ef572e89ed89ee4b3b92375a98338 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.35672902435293663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005099748319399167 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5855419342513171, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0047751903798185485 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.39437568045135746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0036213014676998953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.19346061280634166, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038984749084759063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.32689522978214663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004424054434433312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.21128493584064525, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003094624637133662 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.2814695831840839, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004516835082055319 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.46812215627206843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044823471544487535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.30988671994943884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003261405607838547 + }, 
+ { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.3168249874123057, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004673199828417057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.5242590494964836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004606375146707341 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.35042208286660137, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033912455978920217 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 5.135430012754724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10515298831784857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cf20ffea861c57daf28be686f89463ae13c1723a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.3346764742386238, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004843721993493443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5735003817601863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004776228919481996 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.3764579585424841, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0035986469316141526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.17835412437343545, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0035647689800114026 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.32281685329216003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004493079326800901 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.20065053485376905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002995889807999223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeL_precision": 0.264408350747097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004225202781211489 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4629069290187871, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004517741150892403 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.29763435336229693, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00319184324141869 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.297705821504551, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00441559661770478 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.5151776462027118, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004625313103403918 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3356555748821186, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003377968614381854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 4.852629241722782, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.14804180093942532 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c40cdf7ef14c80bb51ad2aa27e1272614a689b6b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.15369360248720013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01910359117683677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.030770028240182067, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009226075888278027 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.230051193857904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030536564547568117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.050674302559418065, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011850389149770726 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.005257320096320131, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005237467608889611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.031456692666470484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0019898714460596075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.008207247449935154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007072767342602839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.02948342463609005, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007987803660190042 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.22664687188671498, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0028881333653311973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.049014322701212684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010406088468448477 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.022110816162663957, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008133150658395728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.16487262586384596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027329966443987454 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.0361766232091386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001070618326400763 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed91a2bc7b586bf96cb151223246e00a2f696756 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.2681634787853089, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "bleu_stderr": 0.04998595810874735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.09550865383429513, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019686712411162346 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5542119160082642, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004657384009324578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.15190412398303874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002523824368707439 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.042165060237563064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00115469645580311 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.25868024994419325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004235643426537448 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.06685938813989681, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0015427103067306345 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.07997084816197671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014665506771249248 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.49629192822486257, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004293241239132906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.12871999313910717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018578488402400592 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.08397023684370912, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018152970797848846 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.48869658983291286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004444469008391446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.1333199688923633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002328093617925018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..81a7040cc5d37b74144562872d8c5666bde9d8ac --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.8205397209702776, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0673137206688154 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.1291177089470445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027994673830584586 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5858999609001735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004575569928066966 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.1894275171118333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002566906047548628 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.06520674856202793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017537810536778107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.319077314103273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004289568372222579 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.09546217670504362, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017012398458140122 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.10404670276255262, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002280876939330266 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4958646402293731, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004496077189792039 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.15348751527744423, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002000176462271699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.1158438971486494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0025491115811417464 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5272294008125672, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00439142331254594 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.16985387548623407, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023697702811214943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c4e29b9cbec8ac2a6f598e5183d60e8d34cc6994 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.8545076876817848, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07724809572451992 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.13600857468800703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0032198416630618205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5594347319631547, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004736205368048093 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.19184315592400922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026570113202673156 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.0723944116238141, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0022142669018531003 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.3143589440307445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004352566272275511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.10072973751222168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0019070871002960905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.11104879023945975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027919267110680514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.468994364527055, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004573707755828136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + 
"rougeL_fmeasure": 0.15619428997394516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0022182380723355204 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.12268948388502472, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029619856495348173 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5051081506672119, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00451999806892129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.17258810787266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024527387178268248 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4f8f06d20e094fc934f02144952c464e7ad3c9bd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.8462916594144025, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05355432178643776 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.1278346302045783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002830129642681041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5523726285950913, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004673512208644458 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.1860265502144425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002448379226213876 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.06637198877707197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018025537365178354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.3123595828710712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004281424928475089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.09678025171714386, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001657069001811618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.10317713408980647, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002276858012143727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.46461791835294375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004513282830193284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.15098496922208457, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019252783341197905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.11465363747249216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002512595497125225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.5001173935227217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004512311417204082 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.16714030208262487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00222548922751469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..77a8f3218f6fb621c6b6bfd67c78a95a7efa2cba --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.8681612789604296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06823634030163615 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.12245978780125266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002687499326593076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5431558597151445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004799564920689805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.180067960659294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 
0.002349653336178239 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.06531606229983922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019418038810328894 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.3098803783037246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004421579698743007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.09493898136075185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016863474758624926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.09968277865275298, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0022756173885421344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.45735672460251464, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004628702328621956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.14667555612439104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018835128549316987 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.11042759700784825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0024350506323870297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.49324760825101704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0046636387400625905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.16241611578249593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021493351407070567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cd3356b149cc034d12a1752446e95f425d193a39 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.041187412322809, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015212733720666 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.2722316696175966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046313887753788 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.06697983816620594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002031025478858438 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.013070747205979933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009299883930244878 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.0784073545551269, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034778014934171812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.020385877678818807, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012767680070777197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.037903415422310224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012109396727457977 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.26240382375978827, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004287520636463648 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.062415738929841806, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001651359195900617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.034753268390260716, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012974419802824465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.2358071803161779, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004143971381818325 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.056710032049994176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017539781237973047 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.4008552013254772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05150329890936936 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..038ac3b8c50b56217745144afd635f9fae8d7b39 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.15982099926698054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023693100913810337 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6818447127780973, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0039531831866198575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.24260640014350202, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027768727339828343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.06975853570163917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014283398957334861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.32360264018596285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004059465670996164 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.10697454965926212, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0018050822143224363 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.1216236757997378, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017202747181496215 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5577387527759773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004272666872975094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.18681341391416414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001991917246485152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.13510402818298697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002022793072384455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5884796713341609, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0038365814028463647 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.2056812857329856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023946587191271823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 
1.9140039012454733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0890847222319399 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f88fb670e3d5b63f73af8139f81d9bb30fe346c8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.14341756746441994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021572189793251386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6692546744897384, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0042151038244085 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.22138147276109751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027276614342971843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.06513694044684888, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012285955010832561 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.340485606672825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004440352246050117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.10198081937332562, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016820967709998602 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.10787155464448445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014894882606878578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5414697804415199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004443875807815742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1686362992157118, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019210388579455546 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.12338580056305985, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_precision_stderr": 0.0018753197047131456 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5852144480947122, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004054572409003386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.19080393600333767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002379667991105986 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.0435403008123805, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05599898561038878 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a77093b6511d9df1f2be383678a265947f8b0f5a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.14272679805162844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002164859751576681 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6487556417334954, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004293557109036889 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2190831997552054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027341558417022524 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.0668120918836395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012982874018482465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3372622132047622, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004484332483533757 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.10398704909605484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017830520112251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.10846308776944723, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015579592414390658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"non-explicit-description", + "rougeL_recall": 0.5276090858620072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004477426871140599 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1684031326291484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020124658427430444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.12406050272782151, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001899638347295273 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5733012952773476, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004139080844482678 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.19074419384356142, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002403668787230532 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.1672251632248507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.085436950676859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cb2613cd919b3016b8382ead77a8958759f8a754 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.14145240880709448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021670886996082153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6252122931764315, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004362943212513885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.21502708033318635, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026834110246252534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.06554379446551205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012763015030346867 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3230079259138109, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004400185110309783 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.10086843076345688, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017128031184130191 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.10695926781407243, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015582110755479384 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5039167223908304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004437183394069731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.1644008070500405, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001977392152542587 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.12299929966219343, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018989567005332623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5541344307335275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004167377796893304 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.18738101319708167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002358418447535664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.11735852420537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05771649184585253 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2cc31b3f64ffefd682be015c18bfd99775bc192 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.13156331057584722, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00203140416680327 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6063200915125521, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004442245861735415 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.20234254736965623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002550195794881746 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.060544387595184664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012069679093256386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3113139243596638, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004491647148840445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.09425690323781327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001647787208632052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.1005384257823588, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014914329572150095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4915833625433358, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004427689382576967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.15605859797727822, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019039344200042776 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.11521013016106826, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017980094224657457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.539127179437604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00423506881193511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.17736851108434767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002250984265206207 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.0204780863218708, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06557727097019353 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..2216d77f1a5f7dfc20fc8a9d4928e82bdcb4ef3c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.01704208739894692, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00038134288013957874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.11377181108508606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001221955325919493 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.028025138472397872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00044637295791461953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 2.243966255235454e-06, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 2.24396625523544e-06 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 1.3383655879440027e-05, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 1.3383655879440093e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 3.843511432044316e-06, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 3.84351143204421e-06 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.01704208739894692, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00038134288013957874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.11376898775808511, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012220146777960862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.028021943208707357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004460769556010694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.011845314198379468, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0003148449817594768 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.08190788638445848, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0007545756315167773 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.019321658838726875, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00028489753193219213 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.0009597827582021916, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.787222007252637e-06 + } + ], + "config": 
{ + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4459fedb0ccb47f9c454bf0013cee985ec589c65 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.12338574759046983, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002589887344409682 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6456549100912315, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00481962570332347 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.1889848661578333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026135009154337114 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.055329645026635225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015917418388580041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.313132985326238, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004251686217898649 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.08423671425300502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016535014543450363 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.0994133930281887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020701300317446116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5475096620248264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044971516232100136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.15317825327978427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019859291190526406 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.10855678843630082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023039457528804285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + 
"rougeLsum_recall": 0.5770399349468458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0046936823206671025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.16658141546860505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023468522672181577 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.6165943378598948, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06490921111069572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90ecd871085efcaf3f0cf852e20daa15a52264cc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.1436735682332211, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0026579719365293812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.7218682078589462, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004052459422366133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.2217914175746633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025405101764270433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.06775722832824736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016552029301074905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.37598071720158144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00439962213590862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.10461100159118264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0016599147829125997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.10707337342923572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002042276382879633 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5690619918916721, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.004240683333110691 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.16606908225684747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018185139704651083 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.12837715690106088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022626789530070107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6569143874299237, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004022385130191463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.19923289796510618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002225786554401813 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.321779562659134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09538695554747774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b9b72407d6f35a3daffd265612ba8bf66f4f1666 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.12704646175613926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014965307460908653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.7128153505808225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003930554377011592 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.20833619042301474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002066573477323779 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.05894052317262764, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000893949418071198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.3735214163364516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004317456593719202 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0979814053469226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013226281454679096 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.09418094825751869, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009581252383436105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5641669222725268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004154206174788438 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.15603166938331334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013588402692121038 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.11490360635669425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013800805146195233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.651781997216155, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003952076475725171 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.18866870584363496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001917270059093571 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.36020603763585, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08996406440773841 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..159948dae2dd20b124d0ee5a1efbba972470cb9c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.12364000397771846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014761152355166514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.7008467587934072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004059296791116624 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 
0.2028260190998294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002051576187767249 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.057702782901645155, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008602811108256085 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.37151697900109665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004354887109733919 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.09599216387839389, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012765491377424094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.09198221409882565, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009571046005731393 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5557672590175016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004224535855348862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.15237152706065096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013583120131860718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.1119488799103195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013559824797201349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6409089457417603, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004037392997113196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.18385704643499215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018931085048969525 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.2779805846158383, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07170204680218034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3359cd8901f6c484426dcef7df7b11584bde67ef --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.12201036458298717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014806840791434343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6991506268276161, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00406189988321713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.20011806334917795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020113768045419067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.05693913792040207, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008440989695321365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.37256288020219525, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004406895757444443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0949140921236195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012601273301899127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.09037011023468404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009499539634538327 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.5528331022129467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004235035612235216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.1497544344956808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00131856546208141 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.11036350095750773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013509318154878004 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.6389934623997965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004055163370934182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.1812738472805107, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018629632662416327 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 2.3867607776529205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07410446411824859 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3a5a3d6458b0ce42ba01aa30c2dfb2e5093e5c0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.16426814795441422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001963248697761681 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2844248414233626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027186243949292458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.19362563894365822, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019089814975787524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03345718350937418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007942980850551589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06070606915521197, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015158618040175397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0397636720249358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008958485875661834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.11667958865167334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001269157880124843 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2109918938185312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002085058620844339 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1392443163337954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012458799774916409 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.15221459831627412, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018171164600694356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2643817230667675, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025380641829612694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1795453015126057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001766867622602647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.8179420568974458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07211632534304736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b87c4186e5b51784ad5e84873ef22a43b80b92a3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1640591875496648, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002043851896737318 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.27910878442257525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002898171924167552 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.19183391025210358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002001582947927051 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.036188393072658635, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008561684568777711 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06424101549232061, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015449018680878575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.042828281320032364, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009516028511268984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12217744504962826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014008167708105845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.215360249932899, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.002288377671801132 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14451746994762324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014033068919906031 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.15179552758825512, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001877771950198993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.259675418847687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027198844155353788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1777996809470896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001845289272394589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.0675085561659943, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05768833984906303 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c399414e98afe4b90b0bc94577a4115bd046a2d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1797105018003507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002084859264572737 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.3033184413058786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002905580486984753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.2094263617487459, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001995618924716392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.04358087823681392, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009101606204514657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07652593936670125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016877876980321972 + }, + { + "task_name": "GEM/wiki_lingua_en", 
+ "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.051091351936466585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010020514723555068 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1327550835742571, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014273687313135473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2317387220648072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023325790748031853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.15636202509062477, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001396569835369375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16605597026357832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001925105257047686 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2820819142889093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002744826890973276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19393075382172492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018522616086280982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.5483385377460994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09290482394504518 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9d5afa9acdad7dc3f36852d937d30b426c8a3ef6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.15943864465828597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002433310645340882 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2568625432098374, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003461904472445565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.1781210798922743, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002324914677271187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03958355651727602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010187263362724825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06658917213497038, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016962168841314238 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04482124580351551, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010283219957219916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.11796879235762016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017626613778932401 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.19527007389438916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002725370936396766 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.13242278894615536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016618685155073802 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.14827800010457695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022750345989955767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.23948635929233136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003251371946293944 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.16564510211599384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021627154072008667 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.8242304451124287, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11538371340257694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..22b2406919b86598d9b9ba8e77877ed7d7bec8d0 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.053042765456545536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019869824388181845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.08545779450760739, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003005122083966226 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.057482791279950166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019817479295706616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.012522243430672486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006528832775101719 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.02241566754614324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012219610814187573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.014133038738676246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000694610933915733 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.04003813076428063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014800060863990802 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.06635053217761794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023888257381984475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.043580733989129754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014829182403567982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.048851682561389824, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001836524393749679 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.07862148294811239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002785921294419508 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.05286546828976805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018276666697922153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.5910509935292934, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04934236043571074 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ac958f361d9e3f7f703deb8abec4f0597cd3080c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.008816748761467577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001006265857562293 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.012997624430774934, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001263899892629013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.008704060149453078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008478567145710981 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.0017435776785760806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00023533716916175653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.00314320442872562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00042253899620955503 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.002017912016348425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00025853872404257736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.006962484807564813, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000844699745065237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.010341885117026819, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010188524693231964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.006723293959394975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006442903081428328 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.008338960824215367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009673737935465667 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 
0.012211796140355531, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001190300176358689 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.008149504005547098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007911718409998427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 4.297671653759199e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 7.785583349989124e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3e926d2ecc338ebe5cf2d97cb4c2d3a74d97dabf --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.08776029171301661, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014617422175453763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.14351167622490646, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021269699450294326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.10088674481147626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014869970606006355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.01087054186081413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004430740620268589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.018812282252899392, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000824752401096914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.012721191178839145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005085457582114725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.07683211106766431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011750417491257104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.12945760989081237, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001895510928096148 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.08932496723516803, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012301685164588575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.08173215113769984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013420824668087022 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.13446123935161725, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019889224163928713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.09415637820011638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013706067956967248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.5431527952918177, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.032104868441593624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..45f126dd194ee9b8c8ec550be2793b291d07cee7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.12905529214461575, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001979491067372841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.20839846358975436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002894759247759541 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.14631930516448613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019879191160377323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.02375449052867389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007917088515716603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.04007317722615956, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013050110520617926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.02711335795670395, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_fmeasure_stderr": 0.000808976754077905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.09534534429433016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001414874985280666 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.15863360428057022, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002229164497978125 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.10882491551274338, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013993629759518366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.12009612909743919, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018400178492989732 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.19446607740504704, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027009360475646477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.13618798520877723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018366248198839138 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.5569271440681214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06403146716065812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ade6291a4f03ffc91ab60e3c3a7d46c611f0907 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.15985727072408007, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021342704211066905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.26171420796191186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030128510679455042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.18306178424233208, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020567981461233308 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 
0.03897138221781143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009589153266609357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.06532718295828605, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015904932736581785 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.04454347983688272, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000997348114141261 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.12182921478329327, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015428600740151884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.20506194499128635, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002398145917032908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.14055026158657788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001469411947880256 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.14876297631779306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019916419469703283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.24429220231662616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028317427638311745 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.17048750507518073, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019155865863537292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.588542343303483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07430586558531979 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6d2eb908200f787266c46318e1fe269271d3321a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.1414603636700371, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023891742509933684 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.21860881858220305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033065287240296643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.15392307578099246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022338320016352573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.03540228251986772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001012362770647821 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.05680577059138669, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015537958394163272 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.03895725296191616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009703362893100186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.10910133183002717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018029976378664805 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1734226719612005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026958607966266557 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.1193487893330064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016609278652029178 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.1316568373218448, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022375099093730815 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.20372540420257393, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00310337674849783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14319992212601101, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002078026313828863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.731309122566432, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12856901986784722 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..625ae1154c2b40d9897e56607532fe1178942bef --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.04529026039810705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018683347017330464 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.06985563534380705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026707350721197195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.047383213342466105, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017667540426735513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.010911973549852157, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000610311148352081 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.018743643702495896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001066843466304922 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.012100846520590037, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006347499136665145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.03563453127314495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00147733951442123 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.05588201065370463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002163744865002187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.03713795436852138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013568378761987527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.041985085768643404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017408930957524085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.06471195868753603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002482277880366169 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.04386313053982596, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00163848796865913 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.4824783196971761, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04463252271552567 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1f04596799faa3aa1592bab3745ed0edad71a062 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.0070764690349056645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008075324699720278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.010036534764679986, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010720722462779057 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.006986757193006964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007246985354434017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.001854426650392226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00029520732928741403 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0026116976110932373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003696132049834325 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0018127250079443033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000246838125245964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.00560973325227454, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006578100417105013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.007965123538597381, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008557620706126542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.0054709515272040035, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005631353830893648 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.006686937698828636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007687710652974154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.00941357446197627, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010074978588991074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.006566961010689614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006818902290682459 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.1023797520667392e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.3615613872249627e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3e863b48ee706e7a21501e476e54774be3477b0d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.13940638469106723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020766596739565735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.22027868660038527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0024555989238055657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1545325646812761, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017491504584820998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.02295354423344447, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008596166860623005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.03516541724082877, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011192218045994813 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.024434310470910135, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007015086906141807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.1093757017485915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015570564427566638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.1788578203459584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019850983591823343 + }, + { 
+ "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12236419999758073, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012480669383126359 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.12869616248408597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019249850326154927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.20402125079021746, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002281736333276174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.14268473186817657, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016026623127596423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.9508280856877127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03110425637842823 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..410ed54ecc5350f50aa2cbcaef55b0171391772d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.1443105260666999, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001976066139534122 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.24132371040856623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002737769238603288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1670563070726708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019178617354935641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.027398570547821213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000827158978199139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.04638771664106042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014224849717481463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + 
"rouge2_fmeasure": 0.03141526988382421, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008646679643430015 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10951134631545736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013991751146943878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.18961514474322305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021768119549567286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12809102133791261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013663009524185852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.13436615559009527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00183327155604958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.22552793868897747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00256618921407864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.1556006928965094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017716161324128888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.7665424794124964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08195279766239584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..83f77e63029d021a1aadb6ff763a2a7c14c7887c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.18040456283641296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023668115610486376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2870648086297508, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027667857701435265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.201225172676659, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019811525577731063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.044638474881729014, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001221684957935622 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.06999975662618645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016116340703400012 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.048234719869698274, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010310275655512609 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.13940596176892595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017847331831893683 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.22920159148286207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002280659464388438 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1566809756684086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001423439082962154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.16770892114701086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022385377077080216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.26750883501904393, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002616055257421549 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.18694734716033373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018516981957565392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.803596179721391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06743147958318904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7bc4f6e54f819c8418f965db656d5b326ccedf07 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + 
"results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.16137937728841734, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002729028979629109 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2387837109438451, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033294587186186273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1703204755658047, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023437253279040265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.04090526043528262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001235838484277452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.06101732506395295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00160920272003809 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.04242364396606276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010396516497661902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.12349766223247734, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002064619072302476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.188129807231652, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026955200484454715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1308133731843072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001724287665682434 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.1502826430324195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002566907720178576 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.2228164439816456, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003150083408193586 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.158427617468449, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021925425639389923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 3.062227755689224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10516467661969281 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..299a330b6ee5ef2c34c983a06fcdddc17ead3df6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.04779731174149245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020548178719198367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.07123791494122411, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00272031115150723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.048444404337984054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018220046283176402 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.012493623204348362, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009264858341368517 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.01866679464180306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001097487245025396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.012171454699532493, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006624651441077691 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.03785063497182873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016606407010599609 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.05742149931165803, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022331279191766247 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.038101293212841725, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014065239228148929 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.04434190566928883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019234474448849314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.06599904255640211, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025261730166920423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.044795627250137825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016806124326569428 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.416385920099641, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.040112901840196856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e93879286133734e1e55a9e7df4c19e7f743a26a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.005887065759378087, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0007186584994339659 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.009025529640709563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010109891028141976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006175541531128815, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0006856911987577207 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0013791403401351644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00025334887590687766 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0019778725350326183, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003002459206151824 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0013784838984500552, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00021928864199571185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.004586578522696645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0005392063119391484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.00746110009096202, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rougeL_recall_stderr": 0.0008469892635174075 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.004937632375856857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005388614915141283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.0055058688026879445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0006742675837988185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.008392335211224188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.000940745680937788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.005738594434124072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006340338087020388 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.5592181183403974e-09, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.9017186369415516e-09 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..867a2ed389b8632ce6dc228169bdabd95f07b1b2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14692825480864516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018425931958625118 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2512970070836738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002648136861575683 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.172029086258773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018269882294786313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.027966624066931015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007198127679157396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.050572163826476904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00137135380680017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 
0.03327297097578151, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008291280040639999 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11468189898145331, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012927912945671872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20361426191617446, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002161893564032071 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13597766469494962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013133399966305693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1348221046642956, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016706708007563362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23199038101436165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024449378448124903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15820053127675784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016635461401018714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.414984862410896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05940234677879655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1bbc339db0b865a801616c82a014714cb2f4ed57 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20096070137159358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024183404012715405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.31553844384377067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002899492197080727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2221915237870407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019942889407287214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rouge2_precision": 0.05245104495325915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012835508781176563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0826631709797398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017188300283128646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.056766090400891124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010805710237233974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14393356016704015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001799791687854001 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.23074840768589486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022312022411283892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15912396957362257, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013601645244304275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18819591770649413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002281534419296507 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2963346702924994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027562668344971878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20821150035815492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018771242477053024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.836407401710481, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04043755617370114 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6677c8754802b47fdb3afc33df242e402e7a72 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20680907589782577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002488502986523018 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge1_recall": 0.30989444650796616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002753786143548591 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22254451379263313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001915395486050023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05499744970071221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001313828428933835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08212190745009494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016893036964833342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.057748452491246806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001071605656207478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14843146397136125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018718282405222096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.22714968032490238, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022095657686829933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15964814622161871, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013347717397906025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.19471899528471098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002355009451192118 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.292336561157753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026262120649372398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20960062484582434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018046035863626521 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0672692499706633, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049106354646547744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 
100644 index 0000000000000000000000000000000000000000..f36f38e829da374795de7f700da6f79cf2d35f36 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17474019948410954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027474429834987804 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25521371457329683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033152648455079072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1820115847047415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002231556986150065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.045362687902130126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012564421439368422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06804888392817302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016992089106080323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04689779702656875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001034952296288115 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1265226291981258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002108936479136444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18709591094229805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002591170589040495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13086000378258256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015951624748139174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1650854923795236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026195222440337307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.24111486429370707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003150424029915772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17171382413552935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021029154857551075 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0771001660724235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10166627860233955 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab78bda4e0e9da3accb538a0a861fd30e75a911f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.05686654081896349, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002233440497229162 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08353923265042008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029077074042461065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05660115935642753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019128121580337133 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.012648534117560613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000714480304383431 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02199374005627044, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012054249033507543 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.013741746630537094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006742800088300104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04243723633575883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017020140807939923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0631413785577323, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002247650978929538 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04194122086488744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014016615434499398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05323619772247561, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020619540523883185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07914684184702428, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027725917227984414 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05334942672891278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018012620317959791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.6074700254086013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05423743996990111 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3cc280e787cfcdf46447ec8af510ab35f02550d7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.008781294870051274, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009050499890841822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.013677582922976033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013193791852402054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009252067005496314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008857404763954463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002216585419720102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003047584752236144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0035060227812251585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004475043348074056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0023512305693387013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002931024418143357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.00679769925058833, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000718459101307871 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010367182111106583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010118399277303067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.007007677157867705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006708462523675694 + }, 
+ { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0082009891177148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008441371445186326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012916195361676434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012503160451136067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008671543071872297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008282868408056053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.0033020706801269e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.9077024795694913e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dfa281b4954bf57e8caf32be0f9634d2bb71c281 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.08966310598899173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016283594159426443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.14086823338303966, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002174573651185083 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1005296177122325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015389563787558595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.009747347404058394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00042465671299048196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.01702445557173677, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008431303828342194 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.011216041205494898, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00047851956788785876 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08266300329867055, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001411373389832569 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.13215740335454515, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019780686878669556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09330112527692144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001337771293407071 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.08265536975136227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014906774813849123 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.13084094141254124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002017106703749903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.09281985594753377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013961291968416943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.6124556246903006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.038254977576635665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d8746421231795d606e2dfd5371405e6be9a73de --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.12619327348032797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017890629958271637 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.20045647375518888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026946167754429164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.14247720198956396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018061896391544346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.018133516802832303, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_precision_stderr": 0.000645611933600182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.031827347668963016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012365216961313032 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.021015379090322476, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007116134026935783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09050018965284994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011814640560731773 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.14784088101449194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001990541308747493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10275833507189983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011818702954760711 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11860200033066597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001670916163668385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1888009611952507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002505695512304149 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.13395201808210386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016761350624605734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.0361019783677474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06745079097957657 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d33d65ffb564ed48cecee9107b86275c0e26f9cb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.10326783555477309, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001845941711723819 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "write_abstract_en", + "rouge1_recall": 0.15774264721944228, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028918138893316418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11404319152029233, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019424688335624655 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.015686648273678768, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006566075701044747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.02846742132686831, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012449112557856106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.01856227196277119, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000747759698991048 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08074967691674281, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012838590605038145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.1238571271772768, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002138262266070014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08879104640279128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013350757685037958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09519149494947343, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017162671701933705 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1465542733502144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002715845708109963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10533375181568398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018091025815919538 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.0062083415777778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04711946186027571 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..94403e2caa5dd19853ffccf60f8394149b992e18 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.07857613907066331, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018325767254077753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11231507755062209, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002709125328853898 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0826320712636738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018381499864311686 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.011896740680175497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000601828187026642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.02057079275702888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011498890895523576 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.013461430147774255, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006594173038785609 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.06274112802109241, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013639990133955174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.0899022034344876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020795974250259583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.065544556988537, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013299047040813018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.07257907609294534, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017086622186232854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.10409821801585775, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002535862979074873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.07624199938761006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017069202628844525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"bleu": 1.0058220356103125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.048299933628742284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..38f5fc2e836b3c0b7fd42a01451b375fcb87578c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.019168420675696286, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010795160831791957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.02879908909496583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0016458129256000573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.02010596120483132, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010917797901789433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0028556430231600335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000302994114231941 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.005469741300282359, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000651898645432401 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.003266077788148105, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003273226353783014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.015404523683234762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008241756698203975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.023150586278677637, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012827505595113496 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.015989909215756735, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008105738923510461 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.01763922638803302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", 
+ "subset": null, + "rougeLsum_precision_stderr": 0.0010041223908314812 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.026296995107134465, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015071738271990678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.01839170907812179, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001003818069703428 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.07242667655921821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.008932967771763006 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c5af4ee6fcbdb5b079e69f45eeba3dff7515e53a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.0015780841392649505, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00030145052547005737 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.0027069709166605916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0004523943421663273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.001794950456581436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00030728853559277316 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.00016207296070587727, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 6.383944014856069e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.00022581084966854161, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 7.606359517652543e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0001630519530024726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 5.716887646946743e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0012686454818619593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0002161999716520848 + }, + { 
+ "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.0023860171187140397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0004079367273708954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0014936361251205509, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00023979795330628012 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0013993374329327448, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00026598516241118476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.002450141316005954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00042236431455605384 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0016051321840952642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00027664878910365937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 3.936862342904368e-14, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.4388487630942075e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e5a22d010c997bd4586186ab704db8c6c8b40792 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014794927843348632 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c1a0d50fae04efbf268cbc1e7bd69d5cfdfcb2b --- /dev/null 
+++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014806864733738857 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d9cff8ce6158577c8726778df7ee72c18ddeb7f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015050266127564438 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015080663991563098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6bd9734368d63da042ae3d205dc2f1b8447bb247 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01507060460376841 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01500870618212173 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b8b7c6c983c0d7744ee75bb82ba19ad77f2ff0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8b1602cf52b968c59f9e40ed9b9e101bc4b570eb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01468399195108797 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014658474370509012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..be7c5af7ae901641b13640e77e5d607ede8147c5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aeeec553b1ba862cc18fb10adf688c7f30d4b72a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba9a65a34e414b801c8de15eec51a147134787e7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.359, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015177264224798596 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.362, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.0152048409129195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5b9170165487d66866f9cc87f374b3f68f6c97f0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.356, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015149042659306625 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015070604603768408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac1c8c02fe1c732f804efc00da1eab32b21b5bf8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055238 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564445 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3369b551ab4b31386e3e37a5dee5dbaf8d7009b6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015120172605483697 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.352, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015110404505648664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef766c5c75fc86d75a64b3c1c09596eb9442da0b --- /dev/null 
+++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01500870618212173 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0a70cd13156a74efa42eb1fee73b8b5ac35e027f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e68e0dfe369f28d1af96c46dfb034e3e4f71ff6c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402707 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b4e064e3bdba72eac81f70834ccb04a4da0e850 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.366, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015240612726405749 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..87b12c7dd525c88307514c257ea367c6bb9803d1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932572 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014965960710224473 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..977d41fc2bd7ae1ee08c00db49f4e120949d81b7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014853842487270333 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014842213153411242 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9e640ff88d4588e78bf82ca954375011037d270f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732968 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6fa0a0385f3bf1233b8e2c925a7319fced515167 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203933 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014976758771620342 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98691aa804e4543c97b919a11c057b10744cd4a1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203933 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..71a5b6e6890fb913e6a59881b8002449842f162c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732956 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014888272588203928 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ebecfcd23c3774725cb015f647e961e723e07631 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402714 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014965960710224466 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f677ac0b17cb842989a7060f7b8c2cafe610e228 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014842213153411239 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732958 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e180ca2d1e9230e5f6a8447884061f7fa8c369c2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015008706182121731 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..01ba94ef3b4c83275509315d85a47e5bc1b89be5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..66ee06cf6b81e30dfdec6d28c68e4af4196b693b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014955087918653593 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..593f0fe5d23f399c4e0feb70f3fd56179155b1e9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.356, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015149042659306623 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015008706182121728 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f58ef60ff81106486dfdb6a2a0fe44961ed7aaa0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.343, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356953 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1685058a866bc64b5cae977d4c7e90c27864ed83 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014830507204541031 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01484221315341124 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef6f63449be65f6a451f6b67f2d77d104f5a956a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014842213153411245 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015100563798316405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..0fec2c31c8c182ffe944f16be7917cc838d63896 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01461960097720649 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a2485c4a14c534376f54dc44e3f16b12fa255d6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880215 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01468399195108798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e4555fd157dc4b12d209411f3da87121e68e022 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229873 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014853842487270333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 
+ } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2eeb2285ff5ea0f1b73184f3162e8c113bfc5bb6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348628 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f8f3853aa91a9fa9c280bef92b47f4a15708b4bf --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014842213153411244 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057121 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eff31ecf17424993a7db88023c323d2e52197117 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014794927843348632 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa9bc23146d4ddcd3f83b1e77353ae38a3cdf117 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4d569f869178be45eed221107d21a0e39fc92c2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014658474370509008 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014619600977206493 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3a3abbb78d2fba9c3436a5e4afd4f871bf51bd28 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 
0.014566646394664396 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014566646394664396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9d850ca98d8f8757542bd596d102ba3473275a16 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.306, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014580006055436972 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.295, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014428554438445526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c878b6fb4583a005f4d3b23fc0673a7211ec8054 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792498 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01468399195108796 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..124b94ee0e9ec8c1315782c653012a7eb5c302c6 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01507060460376841 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014922019523732965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20c6f45558fae5111d8ed2c86bf0f47d86ea9ebe --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e95f79bc97ad28e4e2d7cf3fa32f7b1c215fc49b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057139 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014758652303574891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a063708eec656c79696f8c27c1ece101797aba4a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574888 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014758652303574891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..36b90ca02f33cf7243f08f121cc8e6d13f4c7f3f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014606483127342758 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fd615a8f44e20ef347c1a28d03d32ffa4af288ae --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541038 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473477 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ad155538b71755458fa13c340ecab81719c26869 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795018 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.0149659607102245 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a08f5e90b9eae9918e1b6244064d945c24557a2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01460648312734276 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792508 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa752b47a9cd6b20e2081b1eecb129c2ce04c18 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738864 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01488827258820392 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5933d01386359c17ed36fd0a03c2891230f76b49 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01487687202745673 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b020659c7f27b454c729a86b337bb2b8ba63b36 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01464559638572269 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057128 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7b933b83ee7f110a5c0e3c5d38206eaeb088fde5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473484 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01479492784334863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef30883454e24b929a9dd3a3429c82dd70a859c9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014987482264363937 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014899597242811483 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc61c22605bd116ab863e34c748ea00a50990dc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ab5516ed7e6bd376a20a9ea9ed269a1c5727f61b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014782913600996693 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9f5c07774fe6df08b5b78114819bd9d8facd91d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01469663196079251 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014758652303574886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5980bc24dff7093d83e5f135683e24e88b917903 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.306, + "dataset_path": "anli", + "dataset_name": 
null, + "subset": 2, + "acc_stderr": 0.014580006055436969 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014566646394664378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9cd3e2f587534d2ea8b2eaefc84c612b5e7ed1a3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473477 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01465847437050901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..de2b0ad978d092de5038cc77cd53daa63aea256e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.35083333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013782212417178193 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013751753243291852 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..5deac3d78b2382e62d018cdc65e22d8016347ab2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3441666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013720551062295756 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013696658778002515 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ede6aa7d957e33c6afdc16a7419fa8f10e75e18f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251951 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3125, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013386029277441229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..842bb2a09a466fbc6799beb0bb6b1b772387a759 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01364760294240639 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..219103e96c63833b63aa6aedc1c0d345098bab05 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.30666666666666664, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013316642319070699 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013443538681348052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d43c38cb16989d07f6fab07a4a88db0b3ae22e48 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013376268790982098 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.30416666666666664, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013286140243317441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1f40005ce70a62dbca72c1a47a5a74de59dd32c7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013622434813136774 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01344353868134805 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae60a49fbbff3de58dbada84046970ab05fb2e7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..85742557a9079ad83684f08585823dae9fedc38c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013462309712005136 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351021 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4db9d4aeb15650c7947d9ebb5034a0dd07d33530 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.32916666666666666, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 3, + "acc_stderr": 0.013570806258433626 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab3fcf4c9718f039446bd4000de3e9141259076 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251947 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351016 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7adcfee5db9fd0e73e2cd63007fa354ff6190e4c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013336721143136464 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013490095282989521 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..30385a13fd8db0613665d5caceab6b856e62a898 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710526 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..069fe4d2a12651e5b68143eeead61d3a816a4cb4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9cccc5d0e88cb1359a7b2bb31cf72b2b69ae6c46 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01347162092976915 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013296358936471105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b8d917433dea9de4a19fc244e64b50aa61cf96b3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821474 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013471620929769149 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..464964783532cb1ddf41895c1728d6783c15d1ee --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300215 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..06c7593d5ca1f67fb4de5b91c272694204c31a87 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013508372867300217 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..68cee14a652dd895af4671108f4d6117ee77ac05 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251956 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01347162092976914 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..337571cffa590a7e2620ca5fe63b8365bc8f07cf --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529019 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013613950010225606 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd593d8543d815e37a20970ea2c955a09655b316 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_2.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3125, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013386029277441229 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.30916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013346684134591945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7fe9cf76ef081865aef0dfb32a8d7042a1c63f1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01353542204341745 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013579531277800925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..70c5dd8373abecf1b2cd8f06bbc1f8920b80ae50 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013424568830356446 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013376268790982105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5d07c6c3362ec4b3f0bfc5d528dadf8ff1e1c98b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31416666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.0134053993149841 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3075, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013326707242912057 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c0b36f3ded9ad612f55a3b3eec7297ac0266b466 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013672343491681819 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013613950010225608 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9b00936a972a767741cfa07f0224f03168cc2e27 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": 
"anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd836d91c122379921271828b2bb9c8d6011e81b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251953 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01333672114313647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb1fe2afdc8e7ac5e7ea100713707870df7bfaa --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618266 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013553211167251951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3a4f4ab84746c0013034263ec21b090b3b5a3c5a --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013490095282989521 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406398 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bffa24bc305d2d57a2e157c5456ce0a9ba98b5c9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01350837286730022 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01360541734571053 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..90bab493661371c8428e88fc890a1aa5d3f6a4cf --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012430399829260834 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012430399829260834 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a770391f41143882410737ab878ead97fd3ddb7d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0124457700280262 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0124457700280262 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..35ea251a4fc6b96a2fcd70d9a32d5ce5d3d66a35 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01263940711192644 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01263940711192644 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6864203e39c211fba9463bbfa84c11d647739d6f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012476304127453954 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 
0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012476304127453954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9b60d38b353cd02a7e36412a3917266179f60da3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25341296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012710896778378606 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25341296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012710896778378606 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bee906408f5fa5b12c9e1bb2b2444c8a091a5991 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004759 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012399451855004759 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..86185f2a8d37cd2424f31005cc938b8c553679cb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042961 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01317944244765389 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e23554a8dd7512cd2e385615500f9e90b5c79832 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26706484641638223, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012928933196496354 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29692832764505117, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01335202597672522 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..61f27e154f5907605d107cfc83efad6794cdeabc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012942030195136432 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2815699658703072, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013143376735009009 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b8cba7b7756d1633043b2d95f5fe4d23de55df7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313969 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012980954547659556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5024507a986b1aab9d9d389bde9ac25219c6d05e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2593856655290102, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012808273573927104 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8fb9c633b4cb7610a4434bc390873bcd72726b3d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { 
+ "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012794553754288675 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2738907849829352, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013032004972989505 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..50cec6d0c8dc29cab9d0bb90c7b00148a373f4be --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24488054607508533, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012566273985131358 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012980954547659556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8955f0a2be96a2729dc16ac3a34a710908a65b69 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926433 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2696245733788396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01296804068686916 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1dd31b422a60d5933b9debef384168f9902ef777 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24658703071672355, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01259572626879013 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012902554762313966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70197f4e6494c1f919935b8d50c8decbe72e3562 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926437 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012942030195136428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1eda0cffce7000f20b094dea12ef34c6adce2e1c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012696728980207704 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2790102389078498, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01310678488360134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..90dff9e6e6cd5e0eacebd2ad2a0d6e26f7f77702 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01266819862131543 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012862523175351333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f1c4757050a063929e0d92f9d4e22d906a8dc1e3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23890784982935154, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012461071376316623 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23890784982935154, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012461071376316623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bbbe2db96efc6e355512657fb7d36d46bd8d3985 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01236822537850714 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01236822537850714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8217d28ad8a48cde9bdc0c659786edf34888787e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587087 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012536554144587087 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7dd5d58f84892a06fd75e4a982daace1925f1ab9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012521593295800116 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24232081911262798, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012521593295800116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f15113c7e0caedb7433c0b4180fbab0ab0d90233 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012739038695202104 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012739038695202104 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..501bd8ad218290ce2464bcfaae2c4621ac4b25c2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301842 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_0.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2b708597f2c12c00729f3599839a5b24d92204c0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012849054826858114 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2832764505119454, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013167478735134576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0283f9da0b10add8fed7a9d375f59d9415f4d87f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012942030195136425 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29436860068259385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013318528460539427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cec9943e103425bd8896886cc7decfd6d5289166 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012780770562768405 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097217 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6e7b0fb4060f3f6add7b4a848831cc2b77260528 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012780770562768409 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012993807727545777 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..70a2f32aa71e8440e27d1b5076eb0db104d946df --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012849054826858115 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2721843003412969, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013006600406423704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a48e74cb61b023474e51eaa2ceda3d07dc38e0d9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 
0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313967 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3826a45fd16fe1619ef5f9431c898dfeabc93ba --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24494949494949494, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00882458861121907 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24494949494949494, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00882458861121907 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4aa4392a12c7dbed50e36b62cf90c6357d7662c7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23063973063973064, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008643708884504997 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23063973063973064, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008643708884504997 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..19ec7055585c91dca88d62533d27857c2dda51a8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.242003367003367, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008788455043255566 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.242003367003367, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008788455043255566 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b2217393a9027ea205c1ccd9a87ed552eb17cabc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2478956228956229, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008860162361464025 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2478956228956229, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008860162361464025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5f6ba629442a1a1a02cba5fdfda2523aaf0bc137 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00884498458193489 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00884498458193489 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..895c39f0f8abc33d0a1b4ede37f25607e563a255 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24957912457912457, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008880241465504344 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24957912457912457, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008880241465504344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f9a6257e4dd992a9288946d6d984b3f58c6c8ce --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.359006734006734, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009843424713072174 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3244949494949495, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009606970654515783 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d141d8dd8657caae9192dcb4ead076453737c377 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + 
"acc": 0.3560606060606061, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009825454608416303 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.31607744107744107, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009540440071928285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ac9d33ffaf92f1593a35f62ed5524e01c994f0b6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3480639730639731, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009774627600259012 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3143939393939394, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009526702423162909 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4cd407fb9ec4594dbfd79254a9f0e23bc93a9e94 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3404882154882155, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009723676813825867 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3106060606060606, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009495260551195607 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0e9e1dbc30b97861d5c6965aebc265c6e854aa5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3312289562289562, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00965764131135091 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30176767676767674, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009418994158522525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f46e03d1d248b601a67090db06b51606e9fa18c7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.32365319865319864, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009600478182273787 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3047138047138047, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009444871667360211 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c73f9cacb59fd4f7e9f03ebec3a4f3bb32b7087c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2946127946127946, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009354224395837094 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2798821548821549, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009212077524656529 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c6fa19c9b60ea496054dea44ae8f932c2f9b25e7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3303872053872054, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00965143021642819 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3341750841750842, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009679106032919058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a56f86f0817898033d60a9929776db9bfeef0d49 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.35353535353535354, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00980972894815149 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3581649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009838331651451844 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f3e1d6d7c66b6e0d93d8d164cfbbd8805e8c54d7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + 
"acc": 0.34553872053872053, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.0097579487306703 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3531144781144781, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009807078935467613 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d68f663714511cf9b7927f2e9dd7c94d14486880 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3333333333333333, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00967301666813338 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3425925925925926, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009738105469984187 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bd4455fdd0bd1db145531430a0d9cff6fce67464 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.31776094276094274, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009554033064443064 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3383838383838384, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009709034670525096 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d5cee4c45283141fc9841fed56608ea66b1fee47 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008757032594354022 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008757032594354022 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ea0e5594edb370b234e74f5124df2b49efce99 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23526936026936027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008703724269718638 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23526936026936027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008703724269718638 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ca40f6811db7c1eac21fe72e07b77c2f9cd67ee4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24074074074074073, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008772796145221903 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24074074074074073, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008772796145221903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56240452f20d22ed6ac11f43acb373b5c42b81e9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2474747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008855114414834709 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2474747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008855114414834709 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d59f6ce37917c79ac01305d5d1a81bbc4d54f3a7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24579124579124578, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008834809366391489 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24579124579124578, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008834809366391489 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d21c4636d427f37a1927a9a31add442ae41809 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008914948991495718 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008914948991495718 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..07a5f3d7f259a32244fea8d058960760bf2299d2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.36069023569023567, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009853512108416743 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3186026936026936, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009560775507673364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd915f6faea0079d21b997b89291649fed5bb90b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3480639730639731, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009774627600259014 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3127104377104377, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009512819491443735 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..65eeb79639adf108d19b87c04b581eb143e28e48 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.34553872053872053, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00975794873067031 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30765993265993263, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009470292575831181 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4dc9d75a2c7f34237f537a57c20af8c4390bf4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.33880471380471383, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009711980224301649 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30892255892255893, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009481048387761353 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f6b111b1d2b4ab9bc3b22d4fad915aea978335d9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3341750841750842, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00967910603291906 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29503367003367004, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009358110551087425 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..46b168fd5e14a93f733492a7d609f7e6fa02a276 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3261784511784512, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009619849417035172 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3021885521885522, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009422719042483192 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e1e0423dd7a18fb9a4c2326b9014c74f755f436e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.538, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00910382483037647 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6383333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008773841218429196 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_1.json new file mode 100644 
index 0000000000000000000000000000000000000000..cb6c29065ea655dc9c6495d599eca09feb1d2423 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5356666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009106972161130879 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6116666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008899620943397685 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..633ecd748b9cccb8414011896e4280783624053d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5443333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009094270381387362 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6156666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008882569490543052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..631bff23e6950570694a2cd041cdae3b98dbf79c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5566666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00907140524362105 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.617, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00887674483503323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9f2892d4c2c9d89f3345336db656d9a47587a9f1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5656666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009051147480837464 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6216666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008855801251873009 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..238e09ca21b496dabd9c84289400114df8140d07 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5716666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00903595664371605 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6206666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008860362324722518 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f113c13c49da005742c7fde10775e89cb7e8a5cd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884811049411477 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.43366666666666664, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009049526374650795 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b391a40d0a8dac7da90958e0cbd119a8d637e68c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e4158daf0a6ae78e460bddb9c7cf7a8c38c844d0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5396666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00910145395014027 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009113781890088811 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f487825b0e3a7d0f760b226618cfdc467414db87 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.539, + "dataset_path": "super_glue", + 
"dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009102414587191052 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5166666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009125157363376123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e059786daf90848095deb8ecf447e17cccc4bee --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.527, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009116909528258622 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5076666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129157751283581 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ecdc0e5f18e82508158f8de25b87c7d5295b9d12 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5133333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009126984242044514 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.49533333333333335, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129833442820515 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_0.json new file mode 100644 
index 0000000000000000000000000000000000000000..4858e8c90e6a17d19dd84594865762ac20d7ca37 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.623, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884965755342756 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5946666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008965091467970754 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3aff68d3fb65550751fc7d6d47a13229651e2070 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5566666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009071405243621038 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5456666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009092070195065412 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a586e964912c9c72b5ab40b595c9bec67db8b2c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5536666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009077486613450291 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5426666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009096928229880426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", 
+ "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c97c56be27b5bdd13dcedb27d23027e2fdc44535 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5583333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009067881941319675 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5496666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009085074954912703 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..689a958562d00084ee3a9441785a497bb5933d82 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00904031207504128 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.556, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009072785596468857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9c56ca404ec387fd8d5d572e5617ca392a2768b3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.567, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009047888598785733 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.556, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009072785596468855 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d0c1d9cc79043f2882024a64365e46faf0d272ca --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5896666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008982215188519143 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.402, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008953140207390567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61b91f67ed4a2367c4c562481d735572add1981b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5423333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009097447488896774 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099483512819305 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ce1367d687854139b9d1acf515bf346cbbba6eab --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5706666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + 
"acc_stderr": 0.009038582451449426 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.561, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009062029213030572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..21e421b736100beceb8e5ca4b2b1369ddc7950d9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5633333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009056690207178128 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5506666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009083233528874798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..78288e0aa032187717ab2cb8f33a41b56131d562 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5543333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009076164124491365 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5446666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009093726495969151 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..07dadfa3e12527056659a57a36fa606b1b720eb5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.561, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009062029213030572 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5476666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009088646624339615 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c626959e1beb2f8aa724d60d92437540d6b2ff2d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5293333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009114505467759737 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..14013c91cf64ba72886eec07222111d1c5b3b4e3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7fe94ab5c796367c28bf263d3e5836f71a6e49f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.48233333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009124530050684579 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.496, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129938951699208 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5480fcf04047b7dface37fdfda9185d43101eee --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.4676666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.0091111208252746 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5126666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009127300863830172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e1ee46f1dc47bec7401024617a2c84c405b9c0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.481, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00912363671545717 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.513, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 
0.009127144583936549 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9fd33eb7325d7ba56592bbf903bef79fcc07973f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.47733333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009120844478925309 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.505, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129774600800656 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1a3b13fdbd60e19eaa2883927b60efdf50b369da --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1754385964912281, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..786e9e7b09b32050a85177dd3fb20f78fe8e50fc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + 
"subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b176510a74ef7efa18ca1a6e7024940f51ed5c48 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.26666666666666666, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cc1570a803bc05d372a14080a092813fcbc788e1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.24857881136950902, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b288166ee6773b476fc48e1e60d22d8ef097f8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": 
"GPT-3 style", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.22990271377368152, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d0c4b59ca67c78f2698772eac9f39a681a26f335 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.30357142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06199938655510755 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.22028985507246376, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e6fe753b9f44f847ac4f4b6a7535db57136417a4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..14e8ccce7b464467384a7e0d6ac4698acd17e4a9 --- 
/dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..244aaf3ed4f2a9df13c73e72941d96f533c51239 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.32236227824463115, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d79ce77fa48100542fadf6bfc1713d15154d1c0f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3757011576560449, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e2ca847d515a00ae2ac8342b202679182e77c5f6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3196798493408663, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9af62874c984f0be694a885ab8d9129b0c36cfd7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3416488477072939, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b72b2adeaa7a49d6b03f4ffb5bc0c303c3e60c49 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3398692810457516, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 
16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..31b3ad53e1d907b7cb7dbb28d3e74149744dbacd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3ecc1e14dcef7214e6429cefe0e402e783ecbb00 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dea81fbd14303e09a6cb80c331a8f8f9e73e3964 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.31761006289308175, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..135126b37ae90aeca91b3560b4c1c8967ff202b1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.37694592988710635, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..feb831141468ad101080c1ef7d17367f2b326d7c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3548587781510513, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1a8398df6718f779e606fa99126992cea7b82bcd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.14285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0471841613625583 + }, + { + "task_name": "cb", + 
"prompt_name": "guaranteed/possible/impossible", + "f1": 0.11887125220458554, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..88332043ce500aff3886c126d979eb4b58032ef4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.27314814814814814, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..706bfb489eae6e2c4d1624ab362107da7a2a72d2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2567567567567568, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..065fd6e06d1d989118b38e0fc873ebf58af3129a --- 
/dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.24074074074074078, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d8349331886f1b18ffe186ef0834e04aaed1c421 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.28992628992628994, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ba582ba34111d44fe7fc911a507f72ac719f0652 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.23318250377073904, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d1051444ccac7bd8a5ab45cab2f5e89f385ee127 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.27465986394557823, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eba1a715f1cd26eed090de25df7562daf7a889c7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2a523726aaa4062b80cdf13072efeafca4a8f0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3006032601719933, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..130d7e155f8bb008372b81e39eb2dceffd354c50 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3143399810066477, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7f33e23dc493eb05a2bd7ef00e62fe5488eede --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.31636363636363635, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cd7fdaeb3c813e3487d44fc7485772ae46749a68 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, 
+ { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3073128622518189, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b99cf10527eed9b981460d31fc275227b40a6a31 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..584fb1113568d4cc28a5da514ee960260ab2c7ba --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..761fbae07da35b4e062ffad6061eefbb232d87e9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "copa", + "prompt_name": "best_option", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c2961b8a70c1667710a639ea2633ca91ffd20b06 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04923659639173309 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac79bae8b5d5150733afeecc7c382b40fe0bc9a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_5.json new file mode 
100644 index 0000000000000000000000000000000000000000..e9228945ac4a3d7e5b5469784e5638e4a6dc3ac1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..912cf702dd37e2aee398bf57bd40088338aa49cb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04923659639173309 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4f272286854c9c11b5c7f49ea3bc1d2cc81115bc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53e26530083e35f6c47415c49503fbcc8b4a89f8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7fd7b0f9e41a6e5b87b9c4fe37b601c5a7853c3b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05021167315686779 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8065fcff43e6b7537ef29d1fa29ad581540c3d78 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1f846fc259f01a4fb2a1856750e450fba50c059e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..76a72b42efcb8cc6fa8a4f1f885583c01da7359a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.61, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001975 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c17bbc86b86da33416ed7418bd3a8dbbe806491d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.44, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5b40a52ff6bbe737a00493082da9127b22198c7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05021167315686779 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..25b662c3b8c72f80ac797bfaaa252878c20ff62e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0f2da13368a4ba64e04531243bed67731a76ad76 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": 
"copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5e020aa43e7f01c89368a2337176ff3f2df31645 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..333f16bf89818a27b84af6a9f0c8960d5780e0c9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.62, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04878317312145633 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..36b530d08cbdc0301763f359fb7057ec4ed36969 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c7825be00ac83bfc2473d958c470931b5a702501 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc758d8c07fdf11b70a4e5924fe44534ce9638f4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of 
file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac469c8ecf51c4f944e31881ddf9f5ebce1bb60a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..38b763c303224e3b458d7fd95adbb30d280d96c7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..12579ccdbe07e5da9c7f0c8693a7a77f5e6fe140 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.61, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001975 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..52aa5132d869fa428d59d18d267dce6ab8545e20 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ecbed04cf05760e0309bb7950d45c12b13f359 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1604200dd8bba6a2ea2bba8cbd914c340e9fe273 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..941d44f432f3d3cf7f74a54ac444e8c03c6cab1a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a7aa6e0d7d3c284233083213cded86297074ff8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a0f8e911f9ca84254592ad784dd6ee356c170023 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 3.498486545620417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.028599955391489566 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.17483459362877837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001705017286322628 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.33493639354854016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024663398758924945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.22495159490910463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019053847340024682 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.05964515275724504, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0009755619175694137 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.1142906263856143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001814336091779065 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.0768086035487303, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012111613779042028 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.15005963762480484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0013171712778281752 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.2919690872282272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021251924476385774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.19419308478997838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015170582220134538 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.14893285461116787, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014850614932496661 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.2860286356508758, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0022430688988452152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.1917621810854489, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016819035833037649 + } + ], + "config": { + "model": "hf-causal", 
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a2b2219e9b81aae90f7c33c253e0ef8d1351d74a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 5.552260199704303, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.057401553819660216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3259195028808269, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001929323968999145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.572157323196931, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026426040314961857 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.40600971218219595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019372178068667871 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.13492928471559051, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011996866679482643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.24424415700293647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002141030333559227 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.1695476123074274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014000703195505763 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22280765716264103, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012944708889095894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.39909766220453646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023618899336078415 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.27934677067641567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014140720869577354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.26925744025419523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016959347663601083 
+ }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4741109359252275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025232454404190492 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3357354638134433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017725485372938805 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fdee4583e76f54fb525f98e182cf34f4a3e39f38 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.320043236974254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0703674127483568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.3311240341080181, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0017853276326339404 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5884284298124635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025141637691686656 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.41458290792353053, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017754408662774433 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14343528305064893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012055773188276925 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2640886894644106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022528239016002664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18139573177296225, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014279319814940361 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22789410567174723, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001239923319192286 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.4134133885978249, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_recall_stderr": 0.002390777095443192 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28718354333461665, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013718939798951697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2757280227521462, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016014662456065143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4919426398639349, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024913442742156007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3456449838143285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001679248842503913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a5ca0dd844a2fd4e74a55eae26e60ecd1369c93 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.622404462604137, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0600956886865709 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.32978578163106964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001728792250109152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.58993990670276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024371452756877373 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4139372581461456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017028503243186983 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14600046958938082, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011984602947182365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2703010278791222, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022446737679045724 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18499822465619842, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014157929728201822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22926603301976495, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012193510568444229 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.4182116954475941, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002363070228365806 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28955171897620546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013460315368352376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.2760388688353806, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001581361548630609 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.49527056164943506, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024657921238186126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.34678316488645144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016548796761139582 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..554c8e05c625bb95488ffb740f631453dcfed893 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.726984471838634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0796088492574431 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.32648518891836104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0017271914729613752 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.586257235035879, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024423096252639558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4106941328054044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017229230075679713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + 
"rouge2_precision": 0.1453440606347176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012436842104140615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.26971984634892004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002313107941237128 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18449634822804548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001477498111348285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22765173104030403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001238329714231677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.41649689481485414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002350472856644654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28803275605668655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001367927628256527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.274783630094611, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015950198452968627 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.4945541982831927, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002451202764511733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3458744644841339, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016762712143040775 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f64b77bf82e630a1b0e27e757de06fb95e526557 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 6.5756845375883355, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08897150277070985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.32116519599671667, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0017400612729082726 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.5784532325776863, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024510408288361423 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4044662434309372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017458799577526741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.14182020295749317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012237648320547914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.26385422416008314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022588904922048233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.180205137590262, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014510894649712618 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.22455155493389567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012390887238885092 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.411817587585727, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023153768885770987 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2844408397217688, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001371012610736257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.27161546746024895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016061149766465918 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.490609400586177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024821818387890946 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3423573003846329, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017004223296000738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..628098befb5e66b696fe2701af7c7a0f1888e53e --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 1.1558513353609208, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04074903284344169 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.11516247787644035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001524841446986784 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.22747710284617287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024176480638780427 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.1493845282584616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017612814690175535 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.023214286463636956, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0006999863181351649 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.044410334346547925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0012895264732825 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.02963754911980315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0008583825054508186 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.09351085926051252, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011626517440736233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.1875911020135083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001982311694030516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.12196365835192624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001376970711141067 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.09651005871721619, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001243153944300717 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.19223482953366502, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002042794088449503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.12556472090225665, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014533252243555674 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6dbaabc57d23da5fa540f0f082ddcdb8548aae81 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 5.632510593428182, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06797359125001885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.31416640095368, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001676578347273029 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5652729230635114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024927456901652216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3950199600478533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016709891545967043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.12825863146984678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011505311700009703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2387490220507428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002140418284139839 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.16266540713305758, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013365594771830832 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.20916346047637208, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011837672533766093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3840042932400857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002349227665339613 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26463608485476275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013170335870528055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.25910749063743543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_precision_stderr": 0.0015532175334562622 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.466741053856193, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024637010906840514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32581485051561293, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016289808132726072 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4949ad993b7ac6ad46d284318686ae7aa8d6df66 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.392142200831627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10315877163155605 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3180209569566378, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016737440721134676 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5778453320981264, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00246975929714272 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4015506196777932, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016768573292908987 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1368729144877671, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011691195535521908 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.257752868294542, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022701334543955894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.174575698484694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014096927083128974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21739223338213798, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011987485605256046 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"rougeL_recall": 0.4023377833196794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002376723574094262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2761053811033139, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001353268307874384 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26596256244660715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015443527415276346 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4842569652904595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002471682356356687 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.33602429370388687, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016395104225531447 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9f6d1035da641ac9cbe66d3105501eae790aaa3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.63349544243743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05281657466901086 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.31357525152538196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001623161380165631 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5751723085823159, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002495778990632278 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3974550338517387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016482540144611944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1371291641511403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011818735703140412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.260356557510865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_recall_stderr": 0.0023088013029268985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17547831360286914, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014288067294337741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.216247455101675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011928912670333205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.4032048757502036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023797635332692417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2754925401248682, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013545543541219737 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26435133382219206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015244603278307882 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.48612921079402044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025405746329231096 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.33527731886149553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001640163433964876 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e2edfa6570f990f6cc4ae93e13f09e3b03d2c072 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.709765572579541, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06261403260610318 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30664438468283994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016595572641215095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5650188060391115, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024946022217113522 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rouge1_fmeasure": 0.38937605226208516, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017014154240876697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13579088054388685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012067203935791926 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.25817000306049603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023009577668858558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1739020471143616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001452734859135155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21289213132253543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012160818190789682 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3987146480229239, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002331544671296655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.27171532019877925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013724928070502846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26005394613535154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001558252869531713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4795521973701686, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024908487929554543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3302419538023624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016726242236806516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..70829bf20a71451a029f05fdd639232c05b10f03 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.575403777061748, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.09823540673154174 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.30336615193836675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001651812386890844 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5612874601960536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002474965171767822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3858772280160323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016981938784725468 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1336565960148854, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011883817497889528 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.25509786184825395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022469620904438923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.17145057785135936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014287502471356415 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21071104756711098, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012104628563537965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.396382791753183, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002322655929385875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26945646442741866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013738953482368282 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2575546053753626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015516497103719832 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.47687647145147444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024725571404500667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32763137597619785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016696428766529263 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3fd7e4c218944397d0d55cfe553d2fb1fb028f33 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.0006666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00047132592062028273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 3.968253968253968e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 2.8611082123824066e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 7.474747474747475e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 5.378367894229668e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.0006666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00047132592062028273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 3.968253968253968e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 2.8611082123824066e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 7.474747474747475e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 5.378367894229668e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.0006666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00047132592062028273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 3.968253968253968e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rougeLsum_recall_stderr": 2.8611082123824066e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 7.474747474747475e-05, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 5.378367894229668e-05 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ae287e4a742e30820a836f065bf18971cb80d433 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 4.659796079611401, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1368931758688134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.17644909352160107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0038149287890057383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.21230940827718442, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00454974484640084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.17808584653315795, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036047878484278886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.06341138057720765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0017418196750443745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.08183158513848933, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021584079306890677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.06654255399675307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016555414433965217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.1238963811146196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029398927249347016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 
0.14507709522146778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0031870533189973928 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.12177858627685004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0025003205364259196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.1513576373029321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003401957689671663 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.17927286088246633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0038765489000090123 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.1509777229111984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00309077855012835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..667353ed5e884262279625ac94bd9a9d6cc2dbb5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 8.349660138830153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1098318499392546 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.24305067824681437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0036097560022010406 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.32723288456222843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0048182868947828535 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.2653185459339885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003718960309470489 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.09330398651801074, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016570405561187557 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.13658467403886668, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025405278204551044 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.10730240032895362, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018845525107885157 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.16814754590628309, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027282671712716595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.22485624238227028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0034527275025707706 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.18130203037790546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0025961043557325117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.2041811673945807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031846062911353203 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.27268640990499865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004106397617602531 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.22128162666490145, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031723992786646523 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fd0850ec4a4e27db10fe93ddbc426438bae966f1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.100604672627616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11628331252327472 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.2744732005419624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge1_precision_stderr": 0.0034020795182056205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3792037077764579, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00465258042529242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.3057154871720569, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0035668673332201113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.11192035685967208, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016787135403730127 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.16449012589807802, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025920539576727714 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.12907057263921567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019101957633704694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.19158493144962355, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0025745366569725494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2642632272554717, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0034193238088634525 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.21199121837713777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0025587269926346133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.23056997456483158, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030071550428457374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.3171677356585434, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00402997199179447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2557399896778223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0030869457481539284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2c7e1a22aa890ce48a068e2a78f6de29ae53bd18 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.460262600466185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06690235758953499 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.29909310953486523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003198137138338973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.4171947659672954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004380236874977217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.33535988312538023, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00333242597817113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.12487041668274895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016702178366713861 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.18358213761587733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025750805740997595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.14413942196532828, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018940385351166674 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.2095831569760216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002448524558694762 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2928182365204402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0033071812049612808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.23407187738369561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0024485182965088507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.2514610077689454, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002860099081408974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.3501029527885422, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0038717879646786944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2812675916988234, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0029431631516787177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3295210f131d4501334ceb6248337cdb691b6d8b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.58290843334344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09940670882099904 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.3125044962383802, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003072058347749594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.43832279193619333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004161083245905112 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.35170201207032026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0031635095609000353 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.13168237302502842, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016416394703174114 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.19471607478958905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025429321562742252 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.15257686050529207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018607982697786416 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.2193444318712155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023735293014745776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rougeL_recall": 0.308468298974907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003191423466850776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.24599327220157718, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023449351106501785 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.26273018537823156, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0027544779870844888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.3678867286427804, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036888444777911644 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2949470778370446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027957905211735317 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64aae186798b5d819b40243d33e28b3e4a44f901 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.5451634050753023, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.03476122561593657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.046590041682019065, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018113877733116784 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.05509845941495712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0017154015403554534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.04310290552413315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0013354170612674893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.0061050451406738674, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0004913565515769247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rouge2_recall": 0.009015504658999519, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0006494884515366231 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0063826724183375155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0004428905323251431 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.04442283664669869, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016857127584432392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.053364978373077035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0016220526842467798 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.041469788976882346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012468639228984321 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.04212798093112617, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00163957458202675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.04928251726552432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0014753429112820023 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.03867299387707548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011657405780310776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e4cca8a9ccfed473ba141c4fbf3491516685f377 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.396112958955289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12774001717020914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5339579313432855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031227264254700595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.42523020862127436, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002981115684302531 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4479642771715969, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023079960050490524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.248732624279182, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002509203740513705 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1952491238459977, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020864674211335237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2056915755809246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019163170514045066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3850344789575973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00278033592883375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3033612056952134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002388739694886294 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3204968358680575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001985520070286022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4342921156865786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003003575710929277 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.34421205733760346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026942002219059206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.36313254630951347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022360796971689707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1d2dbf5feed3f2850efad9495bf74ba58d9662bb --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.976405649449786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20113725542092992 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.558994227405181, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032295866736836163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44495262240037453, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029490508361979437 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.47007019927530463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022920087841303556 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27217728286113113, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026898706377774815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21411207509510502, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002230097528851014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.22591032128288588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020560357969047136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.40975809640656674, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00294221380121017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3236212819558878, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024673142471829217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3425232001087342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020806737697165846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46033224287141866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031762512099836643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3647714098177991, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027210820647532467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3859835303051329, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023029475516390383 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7bf020ecbd9aa73a8dae4d882725a3ee2804fc34 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.841780368990428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16004525418920557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5672114023663154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031836037058964135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4506315681499688, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028976257859202057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4775062117887383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022520658159749783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2831489435622744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026849363746315716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2224359793863677, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022585552678834257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23547797340215765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002080191282609909 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.42052513955243626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029794181672521524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3316609702212396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002482260811969057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3522099816274127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021360587201332483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 
0.4731322251491655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031830112351273762 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3751305124701987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027537781991579034 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.39783266320253785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023487903214587246 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..89b10522c06aca6e8f25cd4f8d404c6762d42367 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.347043577937871, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13183200117809515 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5657141271378652, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032494806378829547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4543139798474618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002865992018425755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4791803139648653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022359999842246512 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2840374640470508, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028091760925762137 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.22548495306647312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002307333846171272 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23765394178309218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021422406093072697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4208287732411973, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003031588432467615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.33645734099493246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025176571940632108 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3551586826634256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021591742079548407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.47480321305856177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003220379872286072 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.38184109614706496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002799589796872638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.40243539061119715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002367520270615011 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f8f697084677643206c810d2b694926a8c380090 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.305213942543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1483607819215848 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5597772799865962, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031779357829373007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45931387100423793, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002804556836326018 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4804680058354203, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0021741397703150594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27941286210601324, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_precision_stderr": 0.002685923767684149 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2261024234770814, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002220825905817483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2366049201616526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020482405764989274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4153388685887487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002922026669760074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.33991925444128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024671212347911007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3555529772285316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020868456031009723 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.470187398074112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031325933820836164 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3864435015330604, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027449279832689228 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.403868227449958, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023025847428200684 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..72e1e2f6229e921c04c3f92cbc25d350cfeb2958 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 5.202591268830205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.054922717477209206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.2917166159957922, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002219184062207322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 
0.5216732757880385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003067680737337117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.366483471533585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002386281579917487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.12771487727214673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001331726066831988 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2362911624934372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023048686809464653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.16205539672246214, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015773137142455621 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2181802591620801, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014357601973536051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3992804600446685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002414052410394236 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.2763256240716396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016051756708536963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.24125596921065487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018822466477212397 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.43323632221522257, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026995804538005315 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.30348032953012855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020484158677831262 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5237440c82979a22ac14d6e9da6ac39bb4f5ad3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.510370404578311, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.07017652112732815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.37679150052732646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0021690658868186385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5994842792127021, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00255563453868782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.45209118860105924, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020375402593377735 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16696448229806954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014179634395889713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27258591423013007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002211872159153144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20190477038644114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015696316292779203 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2606246077766141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00149866573783182 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4234162076398136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024102400017687316 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.31497173633802084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015604606735793588 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.311329828827097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019564900632477846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4967568002020189, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002564011157133322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3738662228300479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019441791456726448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_2.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0e92868c12b41f4e2d10aa12d35dc9dcdacd3b63 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.694055179072492, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07704898802651756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.36979779913491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020832359668947016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5914089255959427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025404798791399787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.44487573960989946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001977816470537129 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16613522297283992, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001397693878296302 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27310008704696526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022423121530451896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20141804290762905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015588769877815645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25859076070964365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014689767050739151 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.42183833274839183, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002417511355707716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.313184127205453, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015466479745984572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3076440801284151, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019053103567047945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4935606984408204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025806815753750124 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3704488980543351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019175726004970364 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a75f8abe8332933425a7ecdbc4fbe4188a07c75 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.6514653763342375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06251939465471733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.36420647303972614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020915518131283885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5822439340272905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002574821849509054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4380843233846489, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019929244812592456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16409417489732006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001445153868542824 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.27002111151104974, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023305847933292184 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19894653788829594, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016174170448261663 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25631927067186866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015047882308515217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4172204517599229, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002430745636023191 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3101723423743742, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00157850445644739 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3033028389713666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019232661628729548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.48618027753856774, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00262308382372153 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3650855458428154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019418658394657236 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4570f7e4a6e720c5fde1fe29864070ea9f0e0770 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.608484443770737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09177253914044482 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.36000067737683655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002059158783528559 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5751430196884769, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025553447543118276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.433038880083634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001976429894642913 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.16179960223783285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001465512561186318 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.26598167030517494, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023601948549819467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19618559845779804, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016525158117788473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.25407783683621693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015117553852618231 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4129708872099754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024312767233502217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3073988661453901, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_fmeasure_stderr": 0.0016004229019968986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3020096150583601, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00192602224235909 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4835110037230676, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026281093270445572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3634810794306642, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019587560708400013 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5b039bc258c670023bee4402eea4309062b6d163 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.335359360908907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0880259547967717 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.3544672605326427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020400084083317866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5675350811323949, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025580342690609322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4267199017854688, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019587924811470827 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15730682660007175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014601294548965934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.25829361144949015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00230882843770772 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.19065995281551984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001637747580731308 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.24923159048984417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015024102603759335 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.40544016089116924, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002402265187758597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.30167860222723625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015893036860766903 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.2968667042793682, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019183519029005254 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.47565074316708567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025968713342409276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.35740425194488684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001944256853237914 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d344548fb4b783d4e1515a7f741b217cf64ca849 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.09962536145007753, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001740659618979375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2495957789912658, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004084832622173279 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1404701926071358, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023545854909488343 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.015032914044550117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000742400789969203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03926943996615448, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001959252805398743 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.021410321262745457, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010461597649610937 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08055461541255139, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001249774505808259 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.20337457259832462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003060417391791486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11379782676552355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001698506173428402 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08079598752449389, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014172566441327129 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.20443087549335434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034517764908789015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11421613258385542, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001935031419720415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.8720915413287463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06977921896956246 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a72323f30fa96ee5a12d072a6a0daa864fa6cb3c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11166497670488186, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001646093728324672 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2745822551275572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037626527798719244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1568991797418592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002216712754065909 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.013225636476818368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006474952763639561 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03329408008418082, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017080640073784744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0187210253001416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009208759050088938 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07882774952520541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0010561315770534385 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.1958952699537899, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0025491385981828486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11103657989201658, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0014320021304524589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0896933686749845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012859991795038399 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.2224374288611004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0030478633438990885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.126287265188546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00174254258134218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7071340922221964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06567476459760405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..62f87caf7d678fcec3ecc7a19d8f0cc5a3c9f7b7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + 
"rouge1_precision": 0.124060562800013, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018136218300640054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.3073301314994108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042524341650730015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.17473465066618604, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00245708961708968 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.02227233591062227, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009171326429615212 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.05765394909576481, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002379587733761179 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.03174503623834507, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012950949882147274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.09291237590423006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001297408097673992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.23207281546402703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031750505176512617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.13112441731732927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017687033339916917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09759594898910186, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014440054482869897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.24407902417613644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003545475320336588 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.13779052503552564, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001976539742066777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.2055699356745069, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05452685939114017 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..85943eaf3aa6781ce75e573d118775cd9e0562ed --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.12858513884438236, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002120442853077002 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.3058467203905133, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0046548186919951684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.17754481617140364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027411678148972474 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.02559840737779659, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009669040664741145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.06396559494568956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024066314342877596 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.03592964212545872, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001337319847086293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.09637012660269542, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0015081518881888312 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.23129631390107847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034855936093635756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.13344572346235084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001985028910537152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.10149076059236951, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017109531080025527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.24358222998073842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003957993939468868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.14060704133134494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_fmeasure_stderr": 0.002274916550088745 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.3690976879754526, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07157486729365321 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0f4caeee096c4acf218801d00f172f829c74961f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.03769871977195407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002336453416963925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.07311282171723119, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004454298183792852 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.04580987420726127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002686434828650492 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.007625557776701037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007866165856765972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.01686591602432881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015442682087128132 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.009837365340636202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008814198827318251 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.02814473364569592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018141347746336734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.05436185626976168, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033426299994124293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03390847923190842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019941253411488963 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.029868454600151712, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019063509322727447 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.05818680411207516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003622236236410205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.03619213971219476, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002146702755941942 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7131409325442915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.16045020297035031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..566b85c97736cb53b9f3450a60fad788c8328cd4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0012123554660875486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 6.387886674158631e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 4.527370030666761e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.0001231472929586137, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 8.72630464321695e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 
0.0012123554660875486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 6.387886674158631e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 4.527370030666761e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.0001231472929586137, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 8.72630464321695e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012123554660875486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 6.387886674158631e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 4.527370030666761e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.0001231472929586137, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 8.72630464321695e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d535556aae6f8c4411074fe29f51160dfe8f5a87 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.15468558801825094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0027906195490837874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3291046037990046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004388092575953328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20129747803416173, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027922294615110735 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0400294063524096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001724105483571795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08497447416593046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029266165872910183 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_tldr", + "rouge2_fmeasure": 0.05123793702073981, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018214840639693597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.12465526978296572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002299651183476469 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.26725068507569655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035760579918268356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16254032673358534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002238751342864787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.12064651566409819, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002398515107357595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.25801606284267453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038925350228375816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15699398066183545, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002413809068769271 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.969669748694084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11920564417930218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..744e278fdba674d8e8b39ab8c1b98108eb2976be --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14151483177458335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018666664224144868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.350507712921647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004347527687226855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19916839431146724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025060831527181906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03563953027984095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011469030277248067 + }, + { + "task_name": "gem_xsum", 
+ "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09161704225190265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.003025312844144268 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.050711870640867504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016273936819634284 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11241775738220008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014552290073169463 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.28016025506550185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035605767069095784 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1584759141966615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019769396812308743 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11164690738304506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001554937658585751 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.27937348602219997, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003826151047457801 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15757176490392727, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021232145053781655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.9843126589812214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07803286264393723 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9fafb7d28941d0699402a869de005a377a61c3ff --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14505365849514526, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017717206403909573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3590688289676267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004194349096882626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20413697486271104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002376457670474134 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03641466698412616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011418577722541036 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09475841284636158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.003043438432012329 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05192590462289213, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016190417513229905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.1165115860707238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014128544079030998 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.29053341231755914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035601421682268683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16427512906528735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019258406494987107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11344382044719216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014987095085429382 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2835275825737783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003772811693926147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16004314278537282, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002051543734547068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.9960960514942592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07344519144130342 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05eee738f9dc2d392ea933241725ca1ab864a3cd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14713365585983412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0020824125761163305 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.34916556365895823, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 
0.004704775790168902 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.2022841747965752, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026384886657312224 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03592889088406201, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011712459443672357 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09050705948348292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030325027339432697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0504745368841509, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00163370097686754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11559690666144418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001671096800205579 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2754667184343775, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003934807057494367 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15906261276123296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021562437238946204 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11557269164532327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001725315585914204 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2766392842578445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004134898630416766 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15926265578186155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022530285725081956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.9836229338335558, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09808438341855263 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..36e72e2877d2a09d60221d6a35e12247cc4e82c4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.04765862333877057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_precision_stderr": 0.003034035842078532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.08668561522891197, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004843273307719953 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.05505531576181226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030265646206736638 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.010462634958970572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009728648293141028 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.021800656752831386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018115640007053421 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.013418856497160158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011417937757972204 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.037309800551862775, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0024441508269820006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.06722349707153805, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037810947177969894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04273395830712555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023651800321026328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.038418088250945504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0024990780458393853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.06943623456802712, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003937161605823187 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.04406961555377166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024510630458409125 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.078602667040749, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.18381444266538938 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e94fe8cc2004615c7b18eba14bac00f9eee61b65 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0025698165750884954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007513867047594584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0020265023378529864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005860337220124529 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0022213194298558382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006417999440798693 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.00011170849128673766, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 7.932541467693499e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 8.475431338916355e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.0983480512422104e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 9.617082045566528e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 6.874253211859863e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.001814865674466118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005255278900735306 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0014201789051984832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00040633641993546393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0015697328565003042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004515600693647235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.00203326778659742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005789262577812531 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0016561300609609598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00047805234847829935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0017855275703933101, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005066133842215296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.8932273089650456e-42, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.552979377433657e-36 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..91c8cd4d2b0bb3a1a4a14b74a748e2220c557ece --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1691487054500622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024250397934530475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3329424577074328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004657379753879945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2135984788737725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026714992621488937 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03660197341061794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013876615412254558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07882441796533844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029121244310315966 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04790575968435739, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017255007150536288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.12480506419752084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018165017981600132 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24849031836840368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036491823702079194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15810158263668517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020195259778767218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.12993544368406393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019145615600976785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26077607018509813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004054503415661984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16520124922802173, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022185800515451995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.103288067343989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12415880355418442 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe6b0b037333d31d7e0f97a3d9fb176661da7b41 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1369839651831878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019614322408207006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3369949215642884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004522579510481593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19244278875333556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002639391462026543 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.031094803643512716, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011365997631264868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07935713496492708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002884276945599807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04413377405232099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015985481401232032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10657563716596334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014875062360428326 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26467978376956175, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036163241442920165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1500620628581487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020207447957827454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1078060619083588, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016045831813046104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2680535986457037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038953175735036904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 
0.15187454343375792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021906040942089064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7714825425476433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10905675609895228 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3a770523adb0b1ea1ddc8fc64454faabc86c74f8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1416725332039807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019003755973271465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3459660596871756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004306340084752496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19861514735566918, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002536869900552261 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.032589396300244954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001135411316234186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08273970858446308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029435229941216076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.046170973346933354, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016039666517335285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11131065526290164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014510877883445626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2741645486545069, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003497544935564585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1563808799657367, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019620345863827896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11082674371408649, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015845483453646742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2737340240682109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003854876130312793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15581654576519338, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021586761800846295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8719539285791582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12363679141275902 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c91589f95058b43b41c617b0a12473e8c105b7e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13956778448576387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021737270808076243 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.329102550985314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004775853265191935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19191359742761793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00277051626651925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.034143138087541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012197975952686348 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08270285997486158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029002084266991856 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04715930396420784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001624147693037141 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11089339313344634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016751091885532099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26375167528125915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.0038713870137101424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1528761624914166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002169630539318251 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11161318605392968, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018444411300991288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26547328847718554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004239107141632789 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15380687866772882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002388364659924707 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0369871247788307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11889285675918113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e6a9fc67bd00757d79cdfeffe2efca8b379e77d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.045437643598581885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002731741164998445 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0834633850546414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004709515538170699 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.053658040802239085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029774150017601803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.009714659363066126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009061043015811388 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.020204993259341004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017173474943290197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.012335864733508397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010450675751901856 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03525079234225019, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021438053902596605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06521180915166398, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003703751644624955 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.041486821630655535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022764116642440576 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03647650806323336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002248078408967042 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06711329846733026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003863144394106958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04288192765446807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023977202282267356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.9532704572175751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1334974714932564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16f79b893023ff49ff4956eba7b8f290f036ed86 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.003263719197769084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008887617448902996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0028345456514645845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008116977743173961 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002997767517780879, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008334506629000642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004590858641913026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00021377485443811557 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rouge2_recall": 0.00041953329689178745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00021411305439143847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0004337191943913522, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00021214389566854186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0023145025817495005, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006312757980824667 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0019717134578344517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005600175515270516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002106529594577242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005845294157065285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0026347083023874887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0007273791404483311 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.00225801889401816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.000653191190142516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0024033045589119204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006766571604709825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.120169018915429e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.777103521919484e-32 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9ce7d916b308ad88686965cb6731c25834e261f5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.15951620968916577, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002506196733818455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3380965213762154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004239971963662302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.2065385624658632, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002495181752673378 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03267382741321086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015503006643301331 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06995212128557793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002538369711070906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.041608603555378855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015173762355031157 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1158742599512755, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0019653714280750737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2450321311305678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032031104916126017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1492749985320237, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018430993892411397 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.12563518916564906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0020584621317715304 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.26886059673342566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037413258402390244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16300942364929533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002098349740204003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.575373380176573, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0743995294897499 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ec0caa59a532351a7e2498ef003f7452a674fc66 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14477662439175204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018795188514419876 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3557260266989316, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004456910041921236 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20332653496456884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002535349618927259 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0332459231860444, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010850655973791109 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08572120065468666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002895224082192127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04729852367510724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015420776527216781 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10920770628719932, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014291037457289909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.27011776305596435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035598997301664764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15359088976552104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019473489961876593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11478861556780529, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015689281670261022 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2847152787941595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0039331867569525865 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16157946667427311, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002148423192266251 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.8684098635173505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06338992961884797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..02a46271ef801bb8a3cf9029f5d56c243ac2a906 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.15082144789712365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018424832486209342 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3651698865935423, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004302647367894026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.21082155347220088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024569832954891837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03513129846275268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011451237082588754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08918749690012981, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029682699425890064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04973150539674991, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016131300737703802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11407141265154397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013994053119914564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2776724573795989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003411155443688218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15960845606925958, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018776627339659353 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11828100442855599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015337129412005281 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.28863758587492094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037557189858194494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16564364312727078, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002071661209778157 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9593269560207813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10392450231035287 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 
10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5d7ab5e773f12302792d8951e60474754b5a829e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1496531280766477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002170811665161093 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3453250604427468, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0047401463995383085 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.204398760358889, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00276366410673389 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03495998710112079, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011734323689905663 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0853256030786561, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00288948347840057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04868798327580798, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016054863551996723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11215486999034506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001620815089476965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26057916777844126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037363611572360737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15352602365435797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021112441534149396 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.1182804937364553, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018148963331361236 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.27561134068552073, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0041549575197084255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16206552227978172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023555539018775963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9760625459105288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10026358022338139 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc238e2890e69444d7c9dd69b69c099c0c94d7b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.04792518752207993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002929855540885626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.08695740365895663, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004905955060606447 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.05625513613265757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030874961718411237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.01000387771406212, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009949695824421575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.020457759459906272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001799928617720977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.012561323055777309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011221605993979012 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.03654303187169094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0023333023205796763 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.06544765513897487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037490143344574268 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.042368809782355486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023587611382963657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.038524011442170915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0024337355610465307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.07009896492118943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004064534959138374 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.045014902928945284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rougeLsum_fmeasure_stderr": 0.002517391175655878 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.9987646889959115, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1912402463323597 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d8b79a2c926c72f95bc610621aee93e52173036 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0025400865271768946, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008365321705968738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0020522199947177865, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006198953753349982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.002203951765543492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006780578514746854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0003593890386343216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002587217800602705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0002044705818290724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0001357861630560493 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.00025865120204742847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00017713704426631428 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.001746686503389582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005159950965839201 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0013959158984171599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00038826984967891984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.001513916054366738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00042616321835822437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0021182020561365147, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000660664849160224 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"summarize_DOC", + "rougeLsum_recall": 0.0017475858050803443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005209596862265322 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0018556868062698107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005514893920061358 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.3330277651810904e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.795417937992427e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8302d3de55f415c1b7479d73e341b2c34f4905c3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1656184917860495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002465318130849214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3451271695394911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004485695093538501 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.21463729419325275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002671429409363269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.037345942132846435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014462619260980324 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08165828005081678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028400731706742888 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04915332059409096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017122102302659385 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.12269872075535913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018643694649009327 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.25782978020360103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034687018620499803 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.159314857725847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020081165474175313 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.12736067944965968, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001970254989936668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.26933208119177504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003908747660119136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1659189540151564, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022284342453661927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.064984135458196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11276853324640117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..df43a1bf3238f3f79cc7e204652def9ec08785da --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1260899791574131, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001905161571513694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.30488764933726087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004406152742466401 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.17624612929896505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025718000739001897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.02734160155747233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010866592288641356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06868815407030224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027368160750971225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.038642466121148654, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 
0.0015244525321813124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10218411344219641, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014948765149499935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.24887472602828783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035843610201584087 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14308653397520427, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002030967495914565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.09807478367610013, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015430046746426423 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.23944847065255154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003722025149898182 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.13746530371970767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021096421026204895 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.558306696837244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09981504570416164 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6758f6808b6898a2b25227daee99b36c113012e6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.13253960760217687, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018631836737003167 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3178043987357393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004209555582114566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18476307451897173, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024826342866030933 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.029929646843642313, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge2_precision_stderr": 0.0011076561079256734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07427687891626805, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027863929510018536 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04211259064384283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015475034261702306 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10895910761984066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001506797437434206 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2627359450729775, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035428896385583442 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1521149622660914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020254138607920464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10274775436240437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015507057264651516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2489222546895533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00370928368654382 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14364115654972823, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021055633428788308 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.7326078380754275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11082657511901861 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..292830215eca966a25ae60c883f7d795a47aac90 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.12889199801472273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002243455154850897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2927286723681437, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004638755474625006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.1752477037036964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002823759912213769 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.029515166535948576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012016618969078606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06911038852478063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002746574682775168 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0405702032936913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016059537675197902 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10464484098581038, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001773802303200482 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.24008506670200253, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038295614084253612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14276741278375094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022683257050729088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.09960601532293903, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018313506067734666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.22860772378561103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0039645397857025364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.13584003612122558, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002348292605866413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.7305491538270004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08408232834719435 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..96308b7a24c13ea043b539b5134172624b39273d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.04123247889915294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0027362423249375554 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.07120572993230662, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004274377455153015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.047528176132410734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002823749720620929 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.009497011127313206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010718639651327534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.016786374564059875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015228476295663846 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.010936982876670076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010227818352712905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.03244728409674943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021591440394866995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.05663645241508702, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034535293095162792 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.03751611271132232, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022506689999874908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.031933246077056165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002143169453256064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.055391705623806235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003398556473468678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.03670784975038728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022077354874054825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.6780236081531866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08801012234752068 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9616edc13741a9b3d075987dee5956f48d5a558d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.0025728987993138934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0011767976878626356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0003240120371493934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00013796006385495373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0005723341987999615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00024516446983701974 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0008576329331046312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008576329331046333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 6.125949522175937e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.125949522176059e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.00011435105774728416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00011435105774728473 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.0025728987993138934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011767976878626356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0003240120371493934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00013796006385495373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0005723341987999615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00024516446983701974 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0025728987993138934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0011767976878626356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0003240120371493934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_recall_stderr": 0.00013796006385495373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0005723341987999615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00024516446983701974 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 4.142453217449745e-247, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9e1556682e55cbb60e3002e66e877ef318c63a3a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 10.02681772761729, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.4785770088213236 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.21326442624004158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005569959071404712 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.5796067902091783, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.008493584491820132 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.26893866283043205, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006128132638012782 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.1569774762608679, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.004918509910414435 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.4493611433994356, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008641465811853976 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.20120770859481335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.005573115844107413 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.20594687601926343, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005476023109804243 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.5638381191556961, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.008539182187801923 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 
0.26037256266092274, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006063020148608195 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.20814801304375385, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005524999926052949 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.5660905777243646, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.008533556999477658 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.26266697026347924, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006101223479297265 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5cf1f50827042454c0da2c142c56d5ed0b0f79e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 12.405128080356619, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.6282911341436567 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.4297880540568289, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008220917619398086 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6505221227794611, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007032948083628127 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.43052575441974283, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007565036202436803 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.3277678584275725, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00787877500339806 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5012746456886409, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008142123538414653 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.33159324549421026, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00747564127341564 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.4165981868791939, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 
0.00815637476891333 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6336843792535592, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007239228173826263 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.41915608188207465, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0076028128010187926 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.41983019583164943, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008186753775907333 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6355954746439701, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0072092086784890964 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.42133006102653786, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0076023574833303156 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7959fe48ed34f737d7722f5bc3f9a8e04483ed50 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 15.396111872252607, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.7207759822340656 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.4876095593388225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008404900319386375 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6784375831550562, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006899269582410456 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.49207161703177676, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0079131367283513 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.3809761461318492, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008300803335151334 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5339790203304655, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008121721991565935 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 
0.38914197156300356, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.008043239661107048 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.4745592667499523, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.008400824405327217 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6625149456976438, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0071233628299963376 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.48065244860067763, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007989865365422826 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.4769027851776395, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008404721247676569 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.665131716020771, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0070890587100882405 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.4826221999255758, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00797725209707895 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d6440cb283a63dc9d177cc064f74ea11b8d759 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 16.218180922067372, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.8509332297059158 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.4890706684702515, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008396913867721763 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6997303374835331, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00655926076210075 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.4977472383213469, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007835731534849398 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.38688066002197424, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 
0.008249491136803927 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5562519826404686, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007946807268147116 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.39761438363429064, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007954981983937091 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.4760638248450606, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.008403042671103629 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.683370150325831, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006819890967440377 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.4863555951176898, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00792372037118076 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.47941150704079855, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008400997890393643 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.687165768578369, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0067675214257696 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.48895320036596807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007901377124692735 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9c68b556afd04ffeb537f561ea4df888f3c0e842 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 14.198451804549311, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.6684426734932597 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.4481132422374573, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008389023019191415 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7084211018687647, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006378983318263235 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.4624679947761788, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007768937193183521 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.3533165568819186, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008050961633003766 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5593924878365768, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007881040514155611 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.36764860730658894, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007715456641810255 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.43585149585760313, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.008369484613704422 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.691252031424185, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006649043538442779 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.45164355068805134, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00783665587254422 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.4394149652836155, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008372880319767448 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6958738596781262, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0065890391738934635 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.4544143905392024, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007811758155493913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6abf61de372ee971f4e708726130e9229c3f9ab --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 13.506508915229169, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.5116866328228713 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.40539001717811785, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.008246350607890437 + }, + { + 
"task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.720893483949784, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006190339720420079 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.4316365583256063, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007604543745568956 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.323283729079598, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00784836767981782 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5717527841713916, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007807360491891714 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.3454708512028626, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007528073940807884 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.3959242347319179, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00822162851212014 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7025263199962368, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006478632818938049 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.4222245633792425, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007657295802053638 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.3987034301296399, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.008213614641467533 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7089771703621439, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006396913155506568 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.42507655541519007, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007630445420039102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4074b972acfbbfffefda1f2e96e5af792f0c7eef --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate 
solution", + "acc": 0.48639825897714906, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011661506839823789 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.48639825897714906, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011661506839823789 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd4d2aefe671b9c0c4617a28650da13f3edc0765 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166557553076037 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166557553076037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..107eb4ef9993cbd33dc0bbe2d52ca6e3e7a221fa --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5103373231773667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663330673075898 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5103373231773667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663330673075898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..399171529ceff620e63e6e8b11a2c935491aa491 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5048966267682263, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007814 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5048966267682263, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..009d595a60cd0351447492cb1862f4d886eb00a1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665575530760367 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665575530760367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..56f8705602c7ebca482a15971ae2fda2abe7ef93 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5038084874863983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 
0.0116654857447468 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5038084874863983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0116654857447468 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..52325093c6105abedfdc21357c0eb3a3f6216d9d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.17916585022658107, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007303477837541812 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.021356332010434582, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007105713714095752 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.2286180773139028, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004259684658467588 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.036054964953355036, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008984262282200936 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003342865314527443, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00017849853167599175 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.04156016756116742, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002190482133962692 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.00581609356366873, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00028059938593242515 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.019312824968880614, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005557873015899917 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.21416319877579273, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00397919084314672 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.032951139104863636, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007476528694649519 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.017643439425447376, + "dataset_path": "piqa", + "dataset_name": 
null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005894989170566875 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.19731432042647498, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0038630321229470993 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02990474554699068, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007607771602225413 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..24a3f64d9bc99d979f2d7666a5435eb35d2a75ea --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.14849173039066274, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.006780886130825962 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.020251968434917507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001004184999269241 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.201417622459911, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004027970741634812 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.032381757450450424, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.000864749614587369 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003495343473522734, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00039644904227136035 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03504591595228045, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020381015294354335 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005210530351647924, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00032295371471857974 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018798741988949746, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.000866081508450852 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19048063687209987, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037923827162263503 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 
0.03028754869084763, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007880477659311635 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01682276801642798, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0008795517871256883 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.172585092925651, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035987297019058986 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.026758462015252938, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007341044406760539 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed10b72e28884f415c0560bfa03ffdf6807e6fc2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.14199881699588157, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007298077065204388 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019700320148592156, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007325263284771759 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.1998389873544708, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003997380487910199 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03229518293521006, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008347091468006159 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002931431196844649, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0001919227650150536 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.034428579543291746, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019427883698678984 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004949579293108938, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002653281442859 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018031465684386884, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0006335497251678613 + }, + { + "task_name": 
"piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.187052546508197, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037036180117878304 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029752024739423115, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007341759429352048 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01617912400918, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0006560029568093861 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1700270228297869, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035543068929493704 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.026370393722191878, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006951231645923941 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d5acae024436c70be62a9e18179aec5a21370aa8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.14611074118885167, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010329545037543015 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019397395062929524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007719276203796916 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19955139137105213, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004054485142263899 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.032166278597507376, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008641364081647094 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0031310622057378625, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00043845430276555624 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03466455497519352, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020194141383628244 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004937686316660083, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge2_fmeasure_stderr": 0.0003186532275811235 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01770820830988742, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0007194515278158072 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.1838484940943189, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0036582750075176943 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029306952043178024, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000751437752875103 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.016012342883103627, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0007103602078950152 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17027891330022135, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003611508658664297 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.026383518066224777, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007264066302234951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8977c7c21cc958e9e62878bdf60f8c3829bb63a0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.13831086006846338, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007223374215514496 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01879738324600387, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005908622270302197 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19954407744415306, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004053615634720824 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03155775272111882, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007831075219253004 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.002591258862523811, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014237661089962314 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03292368767698468, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00192514110533362 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004548001185708453, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002389268902228686 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01714064421552565, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005309706560737331 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.18441955079753033, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003735346161231745 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.02877406291863521, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006862843463300422 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015437163218078281, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005145259041792498 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17088506075667292, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036630999910594376 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025839149783962134, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006410049906716466 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ebd86e6a298c3e638d90d4e318755d7946551828 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.11807145345667959, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007767667145105737 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018866851198029937, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006293413700291389 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19598796545291594, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0039141300191215 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.031432905163962875, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007860008461527916 + }, + { + "task_name": "piqa", + "prompt_name": "no 
prompt needed", + "rouge2_precision": 0.0027315307202555304, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00015835092179141186 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.033330339409121036, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019094967708652559 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004669858309181997, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00024073389992590414 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.017052586854830805, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0005424487932078866 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.1799650841386416, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003577924759854656 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.028453364453624484, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006751447210313165 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015736631180765175, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0005595356316243608 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16949142493214228, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003578759284382382 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02611703752890656, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006550883631257474 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5eead1fb4045d192f0124de56529ff888d7c302a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f02f29ff7663ff8afb019a271b112a49f0b96f23 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49347116430903154, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664829595210969 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49347116430903154, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664829595210969 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7a786127a883c862e692f2e9314b2b4ba5c887b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4885745375408052, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662778026451676 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4885745375408052, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662778026451676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..abab7b32e62aff7f675e1447f432562b8cadd9b1 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", 
+ "prompt_name": "pick_correct_choice_index", + "acc": 0.5065288356909684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166482959521097 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5065288356909684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166482959521097 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c9c51d3f609a41d93ae61e7d84c21fa99412994 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5081610446137106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664270112244237 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5081610446137106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664270112244237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e7c5d9c489a3e778ef31ad1dbf481da9784973e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49020674646354734, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663586263283223 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49020674646354734, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663586263283223 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 
1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..31cb8e7298b8a63e641156d11638698b60407cf7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5663764961915125, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011562571737707337 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5669205658324266, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011560864423151372 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eb81fa2fb6a9977108aef2cb0dff07719e2c22b6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.573993471164309, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011537375448519443 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5745375408052231, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011535468840824528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2997a1efbbb068ed59cfd1cf095e0e873f9fd63f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5696409140369967, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011552114834700507 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5723612622415669, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.01154300962328283 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ad409c13d027e26c03b6ffc20e42d012939cc94f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5554951033732318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011593746871584154 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5565832426550599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011590883373666854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed3891392f77e9d127f2caf4db9d8f3aec144b09 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5544069640914037, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01159655408098765 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5533188248095756, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01159930504274508 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..490df2b97b8abcaa0c2eb86f84963821d6b1ba34 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5554951033732318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011593746871584154 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5571273122959739, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0115894305035091 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4919276cf7b98f7974889e510d79a10d76b3d9c3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.639, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015195720118175125 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.562, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01569721001969469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a09de8ac34ec0a4f0db6b48077944a4bdbe14981 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.679, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014770821817934645 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.665, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014933117490932579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e7ce0f937e41d9febb78d95592c323a70f7f687 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.702, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014470846741134715 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.691, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014619600977206488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..78cbe041411a766efb23605455cd1f6a1c7177d4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.717, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014251810906481744 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.707, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014399942998441273 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6ba862572b3ba2a26de0d3be899e4dbb8cadbecb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.716, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014267009061031306 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.698, + 
"dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014526080235459543 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..987afd06b77064f6be5a010153e0838058fede9b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.716, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014267009061031307 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.703, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014456832294801105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8c18a2c9a199172f77b13a2e3f8bf45198c69ac --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.867, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.010743669132397335 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.791, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.012864077288499351 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c9048e7a54657b57f7564e923b272ddd9e0ef3a0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_1.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.892, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00982000165134571 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.876, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01042749887234397 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..640d9799a038ccfa968c3655c1b34ed92bb6bc00 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.9, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009491579957525044 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.893, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009779910359847165 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1d5c1a04b64fd1828e3bc914796ac51cf9855469 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.909, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00909954953840023 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.903, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009363689373248111 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_4.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..52529ceb9336dd8c5de8fbd742b756cb5c8c9db0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.912, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00896305396259208 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.907, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009188875634996662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9c0bf8f8876bf4d4c80aa6201376c6a82f8226cd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.918, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008680515615523715 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.912, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.00896305396259208 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..15f236318eff144b213f31aa053c2d4fb547f80f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.5, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015819299929208316 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.453, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015749255189977596 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8681d1ec213d13ae60f88864cb304244ca72c4a4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.506, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015818160898606715 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.475, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015799513429996016 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dca8ab131ca8099a806db6f69255f096aec5d3e5 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.539, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015771104201283186 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.509, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015816736995005392 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e4a6b447904a0cfc349f215a0e4562c9bec12c16 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.57, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015663503610155283 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.521, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015805341148131296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4b47b0e5fddbec9dbffa53a0b055f140846b3197 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.565, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0156850572527172 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.554, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015726771166750354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9ba7eb223a11b056aa707610bcc455abea94db0c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.577, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015630589090476345 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.545, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01575510149834709 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of 
file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f47c8b5a2d64cc0008a254fc0381f3480f64207e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.625, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015316971293620996 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.531, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015788865959539006 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b05f46f84c707325b3b1edd4ed4506216f154918 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.42, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015615500115072957 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.408, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015549205052920676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d05028693862a663319812ae85cda0db65583ec --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.477, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0158025542467261 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.452, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01574623586588068 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..84e2070e69c1783097a02f8654358a496abfe0f0 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.546, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01575221038877184 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.532, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01578686875935901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b8fd612c757d972457ad3a46e3f475ee5bc723d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.574, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01564508768811381 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.566, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015680876566375058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5a50ea45fdf5115fe178bdc12c490af8d2ff99cb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.622, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01534116525402665 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.595, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015531136990453049 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aa3652a65d6a1cf109e0f8b339a17ffd14d1047b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.601, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015493193313162906 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.525, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01579951342999602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..87d00d414143c3ad967fc503b59a3e8fd4eb4c4a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.507, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01581774956184357 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.474, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01579789775804276 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_2.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c025e1cd3d1749391971d72765485476548af47 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.559, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015708779894242676 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.507, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015817749561843567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9c757958ac47d5ee65b5d9c7cc1e29134f14eccb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.607, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015452824654081496 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.57, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01566350361015528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f8f1b75c74353df86e7096fa8b3a47abf27712dc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.642, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01516792886540756 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.608, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015445859463771297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d3fe5c4445aa88b4dc255ac8679fa0e577088a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.643, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015158521721486776 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.604, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015473313265859406 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..df0485c166cea9f086c0f88503d9c7a743c8011f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697235 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5024051309460181, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562298481438055 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..01fc21f767d8fe1cd0d456e5174a99b7d69f4300 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585206 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 
0.5077498663816141, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561043278863545 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..10bb2460a37670318e50d75011de629971513c2a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.47728487439871725, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011550494192008947 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4917156600748263, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560845076525713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bf8fb2f2da054dd3fa5e3785e3c93b20747c5b22 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.47247461250668094, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01154489847386458 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47888829502939606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011552120807053817 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d3a98f7bd6a9a353c09790077e062d1b1bce2fc7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697237 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551049647290312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60843040e17ba573830909a07988564063cbb138 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.47033671833244256, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011542066509767012 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4767504008551577, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011549925483927456 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a2a8d583619842de6b0f23ceb9649489d93d8526 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4820951362907536, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555016408505474 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5221806520577231, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 
0.011551049647290302 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cd562641f5bdd267c88950eca22e8e1115dbf6da --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.48583645109567075, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557792331301673 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4965259219668626, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562153149168298 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bb251fff63c37cca6a1dd71728e41d2d24121687 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.48583645109567075, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557792331301671 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.48957776590058794, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559920087347776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..68ec4f4408ec2a05e3031d8b5c219ed465e168fd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4751469802244789, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01154813982307477 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01155104964729031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7062ed2c2a956ea49696c597435041fba9262982 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47247461250668094, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011544898473864586 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.48690539818278994, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558466383367183 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e8fef390252b78aa1b11b4b1ff6050d0b8213f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298173 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4836985569214324, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011556285484521572 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dd83c6cf283272cf50485f92478c95084c1038dc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eee3ad9dc6c5dc5dcd588340b2c05e167744278a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5f0c59fe8043079f86a9906e044d4b258e46f2db --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..84c8e7befbfaab219e187848cea1cd61317826d8 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 
+1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d19c240b531ca02daa579ca61319932712d19e6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3000739521c7cd022ddac6459d4c54fb759f62a7 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0cbd7095d0ca7d34c03bc2bcaa4b7991546cb253 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4820951362907536, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555016408505476 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5109567076429716, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559655791130729 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dc65c0edd8c665abbf990294f049fd21671f84dd --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4820951362907536, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555016408505476 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5114911811865313, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559378273599126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..94f4f49a04ea8fc17109a2cc421b73ff815265cf --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4853019775521112, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557435464292916 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.48957776590058794, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559920087347776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a01145b853136572a6757a0ba9dfd0df299321d9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4794227685729556, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011552636515221862 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4863709246392304, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558135970599896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..620e4b7cf6f3ae35aa10269072c68c8b65cf5426 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4831640833778728, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555875693960771 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4906467129877071, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560409019420362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..adc67e6fc91d6426ce6e1c5aa4f845ae66802672 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.47888829502939606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011552120807053812 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.48690539818278994, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558466383367183 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81ed2f5bcf09901f0ddfe324af7ea352df004d39 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011527657726586461 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5104222340994121, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559920087347771 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b162ca1907d207f22be7eab62e35fb70277a95ea --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.48850881881346875, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559378273599123 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5002672367717798, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562430600098487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c6983111cd07df13cf25f9077ac34337a4daba --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47728487439871725, + "dataset_path": "story_cloze", + "dataset_name": 
"2016", + "subset": null, + "acc_stderr": 0.011550494192008947 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4911811865312667, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560633656952968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a882409ef187552a0a4d4deec5fec4df23802882 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011539022035111228 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.484233030464992, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01155668204219638 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7daa4d6ebc02ea81d19088f260df7d1827b58c8f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4692677712453234, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011540570846495544 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.48743987172634956, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01155878357073797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4c304b0f38e2ebb081bd191b0e0af33e6c3cf5a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4740780331373597, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011546883081384901 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4826296098343132, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011555452669106634 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f89521f65cac54eacd10d4d277f3ef589b7c1935 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.4404332129963899, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029882123363118723 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f03aaa894e31e71882d261cacfd3106f3ab0bc75 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + 
"prompt_name": "GPT-3 style", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5ec9495e937521d98fa0c14e2ab1255f3a61a75f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ca70eb8b7dd0b24b91677c1da294f94e8a8c1a93 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029907396333795987 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029973636495415252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..ad8f45c94c4c4f2e8f3e177b55086fd9f3ce12a2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5631768953068592, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02985524739031494 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c5e2a220cb36dc15e95eef5f11f8247488576881 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5631768953068592, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029855247390314945 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029992535385373314 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..917d4eb05eed633875364cf32e1b5c8cbcbee206 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4544f641336e90e174f53e9874b84f9b3b174563 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c73f2c99702931c6bb4a1e41d3afa551faaf8224 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7817b8bcf83bce60b9caa1c511d937cd6462fd8d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5204c6bda1eada6aaf009d1137e63f4732eb14db --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cdc864c6dd7f0351a1a15834bebd37462ee1b0b6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4584837545126354, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02999253538537331 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.0300727231673172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..15c95d7692643d4a3b8edf3666bf7b702962aeed --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029973636495415252 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d903f0330c4a2dbbc81db2310f02ba2ad389fd9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f44fac772b4faa69b4edc3264dadde2b2daa2aa2 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + 
"acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b6dc63a0d63c28e8dcd25b6eb0e64633a0cd11c6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..59c94d5524caaa16bc8dc4e3b4098ff6373af987 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_5.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d542396f5bac598df04859d7a6a55448aa1f90cc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cba646247d40a44a674fdb50d57f51bf36b2e2bb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300727231673172 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dcd0956202a628522b3f78ce9f531983bfcdb5cc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2113df45baa2e6e4c23525b98811a4d3bdacc09 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a94529e2ee9bb624b9ca6728f0657b27c726bcc4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dbdd10a7ee66f42f3d00b1bdf63382a065892398 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331327 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..be95ba913d18415cb7cb54ee9666999df6699bfb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7d95cec8d089baebf0e5cb1e91f9db325d024fd4 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ec6dd0f6997ef78ba97f102dcec7942469d5a260 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6be0d2673e6782c788fbc6ae60ad628852bff77c --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b15b3f3896d51bd9767bdc386345f8e82e3957fc --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + 
"acc_norm_stderr": 0.030072723167317177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f01856e1cdeea3a55318b8fc771d83de1efe2b66 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03009469812323996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..638dfac5d2ee9dce8f799aa0b6c5a36922b9acf6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.4584837545126354, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef6249bd96577a3625e869960ee830ad6c3943c2 --- /dev/null +++ 
b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049516 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d58db49699fbda6ee99757216158635505d4a3df --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915853 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616445 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f498b4086729bf47a0e4b75ec7647397a24cd946 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330349 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..aab5caf73e33548e77520634f689cfd8de1d8094 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01403923921648463 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915852 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..938a1c57f50f9dfb9f4c9a2dd3cccfb1129924e9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014043619596174964 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076892 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf19e15c3d3eeb463f48210d5369a92378b3467 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5193370165745856, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014041972733712976 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], 
+ "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..705ff0f0e92475fce71372594db262eb56122c05 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d068287990476aff9613f408dc613a9dd9a78902 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405090552122858 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404771839399767 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..312595419d82a2cf9cc2c4c3d93c28a968e58749 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228577 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048278820405624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a30170cfe84aa2640ac6ab8e6ae3f6bda89bcc6 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330346 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..634d6f7c7771644b4cbf38324806579396281126 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bd5c4334c26a018815adcbab020d57b4249b12d9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140519560640769 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30f863af7da2d8e5333a0939a3bf2215eff07d22 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4696132596685083, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014026510839428732 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.47908445146014206, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014040185494212945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..68f5d26cc2c4c549c47ec147909619dbc5244b30 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4909234411996843, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497704 + }, + { + "task_name": "winogrande", + "prompt_name": "does 
underscore refer to", + "acc_norm": 0.4909234411996843, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050170094497704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..db3ec082f9d9c7257fd05ee1d3b779cb5884834d --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.48303078137332284, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014044390401612967 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.46882399368587213, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014025142640639516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f004047b3de703e12b6998aec96279a3d6534d3b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab184892edacef0065ec4bc038e8e0dfc868a85b --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824194 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48224151539068666, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404361959617496 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2f8b6a171a11ae984dabe8300b92c005a656f185 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049512 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5710b38c53d614641255e61e0580b9587e8902e3 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824194 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + 
"acc_norm_stderr": 0.0140492945362904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ec808d72842ca42fea6ff15a688132641a14d8ab --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529022 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..435a3b494fdb3a6b656b4a00e5837f1bbca0e84e --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4909234411996843, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497707 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1721601af561168fc48a8fcd640b3e9620870ef --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9f78105ca37aa6f6b7ca3fee11311cefa7c6b772 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405237625922564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_5.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3b13d95bec3031e031529af54426c669c516d50a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529022 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_0.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0096c71fd8effa065f6bfa2e76e5f0d595c61553 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824194 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_1.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f3ff29884ca73ba72426dce56a8f3757fcf8c88f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.47908445146014206, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014040185494212952 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.47434885556432516, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014033980956108557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_2.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..620262b893f72ff2fcd40c2f1fc37caac75a9e4a --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824192 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 
0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824192 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_3.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4a0380aca192698e2736882711de8d22eed56ffb --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440415 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_4.json b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cebc30d173825d470ed08549cba949f2cfce5be9 --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014043619596174962 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_5.json 
b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc2d1b3eaae3ccb1fca9748a0efa44ead23375f --- /dev/null +++ b/4b284b42bc4/eval/slim.4b284b42bc4_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405213114691586 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..021c49fc58d5e51b30236b3d823b33dcfc711ec1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2612da46e191131810c7ee0285bc707eb2e5ddfc65a59737e0e8b6cee39f2267 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7380f3870447f18b7bc241b676093ec656bb0e1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f5fd94a58ac0cc7ea80467a483793341781b80bc1657c73bf9dd62543cc69b1 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d5b3f6e188e5729fd626d68695777b71d47a844 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3beb6a73bfc2ca16db68872f29c22ac46d0c65e32cc824abf1e7e6ed178c3a45 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..816ae83a0c3bea6fb8f80607718973fbb7f8545e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3de6685c746900c0aaac40a690112029f96c2cdb4b558c64f1d73b1f7c112acc +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d5c47d3223044c876371e5d94c3ceca5aef5094 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0999dc45f5096296f8d2f13719c858a03266ddf3a7bfce86ec8cf7894965de40 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcfdba255a2ae996339ff67f2457a03bdfcf0758 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972ef97babf0b3cd81acee3f62873672cf1ceca20f2ba28ae929346778d1e475 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30de7593fa4a01b11ea98fb3c144a5a45e0be805 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82ae7253b67123e0ba30d2fcbf7b16a5816c24f59830ec9bbdac2d27756bbcd +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b64b249dfcb42277d8639dd1ee9502898db6e46c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6332494bcecac6b3241b7f5746e5415e85d034de3b443ece9021dce6ecb4b592 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d97d900efe725bf77efb850316459fdcd58aa3e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4e333530a5be195767783560d97a1cba83e56c7a2e3b96318dd47ecd11f9e25 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75051164d761d93e639759d90f7c572b4069b19b --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d0f3d9ff2400f45a855b3eb00d1dda13f0e6c7f7d3bbf382d0da98aa42bcef6 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf095cb1574c1d9c4f06ae03be44ffce5c104cbb --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:313f34c6166c39eab5ff105cb9948a52130e804c2a4cdf46944cc42969145a36 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb52f6172fd7495b27312cdf56c9aaee911e1c08 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7bbd9784b71c142574afc7aa56d553afcfca13a9febd5cae9d3c0564850606 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b969af13ff5477d7a09486aa6d201dbd753c7b1e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04dd96e1f57ccc101d6489510adfbcce3b812f8d8a7bf866327e35cba7e5fd91 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17a89556c15ebafad4ba061b505e727caab95ce --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0facf7b90d3134811f1056a5972c3f253ab2d508333039804f06dc52d4c91232 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36f1361410b9d7a1dda192e24d6a8a5ecabeec2b --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bfb70b363b8531e5c72a27dc1de1678dc885f2f56f6cbef260996b82e20e79a +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..672e6b83117efe8054045671c56d0ffa6035e9ea --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b769bb73aa23ef711f2a0f1c8fdf565ebbc1c96c8382e16f04d788c31cf657c +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f283ffbded688efa55b92243bfed68049a88be2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ca17543e0f1c9929020a2b5ec9be2d947dc381825391f073556300c962bc1a +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2312a4bd1746cabb56ce7d272813bf004d3bea54 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83cc979ce32ae39a5bc01166f16aac9defbd8060340cbc099efd6e1145e68f2d +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..974895b95dcedafd6e39f1791c46a1192ca81a3d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86c0d8dbc21e48b8072373512698b1b4be619bfffaef0bda039fcb13585f6fe0 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6b17ed1b28370424051d19c36fe7ed542da5895 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f94b45f9ee4a9e480af1fcd712bb7b16dd236b18ff6c4e11b385e7fdaa9fb255 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7196dae09f59d9008989855a4cbde672ce44aff4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9d3fd7a53c93245a19c94f33853b6bc920b1906d9405e4eb0cd30641fddf0a7 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a079ee4b4e2fa7115264f923c1fffc6f36c7237 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:887166ded8162937cf2a30ccc8a628eb39118d757c1713e1b8e46d879bf277e4 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8dd2240e30bd1185d6b1032e060d024bf2bb199 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d4b39c4573b7aab11d0b943a931dc7882b9a1b7a5a5d9fef22ac4da6d401e14 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50385de931ec821194120160a48b5e1361568193 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcbcb6744b2dfddbf8272d6025e8fd440ddf9ac5afddedd174a6cb7c06faceff +size 199058658 diff --git 
a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..247394754aaf34074c6ca10f7d18e7a4c126bbd2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee03a900218153d2cc350fd1253222eb99343e82f0febf97da1a7817a988bba5 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ba1b366bcfd2a520d3893f3a983ff19a7436bfd --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b3842fec7c0125c56b7db18300280648faf7df7aebb4ca51dd758605c46182 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..619fc550ec553c2868b60538b0c688eb3b95f9c6 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ef9ebb7a36e3675dc3dd988be4250963203483ba868d2923d3568ff2fc82118 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a692f1fc232cce7f461eaf371fd7019a2cca413 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5606cedcbc5538b317ee67f2e1a56b8b45bbb751aca970535f8991146e65ba39 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2c7acc1a616cee2dfd0e6f9b1253c03b2c9efd0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad47a04e9a71d384a3285460247f02a7061892823c9b5fe4a4eb184cf8fb60ed +size 199058605 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a64d5e365632fa4bc47847000955ae494a4b8cf6 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723e07a0068be3820912e18833d64f4c63a3271a76f38318be03a9730b3921f4 +size 199058605 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..482777af829776d25862295b542c1bad02703621 --- /dev/null +++ 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9829134dbb68a4610795e7d9bc6e2399ba8d632683d8a79d753730dd63f60d27 +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1783bd8b8fdfd608580d875b14697b25849deb7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad014ac5457047b0a466cc0f9212470cce701f0bb02c7e7ab8a4cb0e65092835 +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..204db7203a68037a0641989b965ac8035610b688 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69236c2a32753f01eb00d5c7947c0d0d0cb1488472d295e94360c1e5137bb0dd +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81b85dd9403d12ad450e0c8432fa20fde38b6eb5 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc9eb4361a6917a009f82e537ad6cbecd7ea56864b4b978f41f3af94ae25e8f4 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0ea8b1b133d711bca5b45df53952f6dd44af9a4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800d5b20e73614af1cd3db7d39b545dcdf8fa941940b850b7a856eb53283f9da +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2c156dbd0657f4a3f2b86ff22cd6f8ec2d1bb12 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3c3e98f7a6f4998e29d99f32c6616bc9e6c875dbfa8712c960b4896436bcbc +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c401fbde0294b3f63cef2813adfbf30191cb7385 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b865938d9898946689933ecbf8efa9cffbb143f98cd8b2700e2b3802cf4b3239 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01cda51548f7b0431893f258b16734d610343d26 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:931069cf430325c0ce84dd9c4a5e453bbdc2a659dbbd92c05648adaa8e30831f +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8da042bc69013c251a5cb8dd490bee3066269719 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce91d4fa573187f50081c22bcade325978c09a08e6a1926901fb85d5cd287e9e +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19b92d5a04c0df2e2204e9bab95f4a78a0eb7571 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b62d945eda5e07f2ee4923c2e2b3987f63f79df8d77401809519524d27b5d9 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1270ea65a5453002829082c8935dda1f4331107d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a81771ffe10ebdbfc9615c9849d9a60c0ae4cd2e594a6150bcd67990c61fd2 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c44dfcfe2f2f36040a19d0fcb163bd61e98fec67 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:657d68d29a11aba716a3f98cb1d7326954bada74df79224926d01906bc1c77f1 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9be64565d99ac404c525d0a4569e417bdecb0c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ee3a7372d4e9a34e1349cb3247561ec91cb47ae5609ab2aa5cb12a7c892a08 +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e5060e6f821a53fee71d16914a28e03934fcba4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:aa3e2192a8a5ee99ec7b6522f579ea2d1962360326be417de2374a5e1d84d81c +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca7285b922bdd94e6eab78ab506ed1b581e4aab2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2bcc34f688c807644015918afc333f4c72fec8215bc511920c95697006f5777 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e209620e6636138449bb1e974e3b86d9ae3a425 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e49fce8aa2b2cb34a64d62a7e055fd674efaad974977f0c1cc352aac216bf3e9 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20d3319b1497f149911f2b246f2193db2518c4e9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1564b46067b177a810b363c39818d16daea93da412d661837b0e9f75f1b98269 +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c87e3b5cd9ac464999a25a01af8a1bbdeaac1a37 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d4c3581c3113c2d00b18b45d7e0403e03849ede518cc8fcb7fa28179a1ffbe +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f4579e140af81cebecaf533266e07bf333492d0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c92a846b50f82abb4303e15bde4503850d895dfa25b93709f07a431d21f9c0f +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..639495ad3f02e6ff2b43512bf1ea0e0d467732be --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d56ee91f043eff56e097bf0921ff446b188316792f76d9ca8d0fa151b04fc2 +size 199058797 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eef84d2a1ae44eaf262c3d485439e4f06d0c4592 
--- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1493e753c1be56fd7e8f84f4367c58c69fb65ff0d15d26a76d982a53ee1552a9 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddf5fa1e2955a25d07f8622839717dd442c6dc45 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02587e23b049a80bee7cf44e81570f9f32da8943ad47e50aac1f604203e2a3f1 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..978db07cb33e152e2e68eacd700936e29822a4f7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e856d3ba3f7376203aeea9f01b164bbdada45bcceba3d1bc7bcd3592ca2b6a +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcaeef95c005bc0763e91bc505c367ba1ecf96b9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08e678e5e7a7de2f2579c496b2c80231aed4dca78f340c0ab02eba1363ffba6 +size 199058733 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ef42e456e7fbfff037b8f75ca263f125b75d8b6 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60683e456cf98d2cf1a12f1a94c11706f549165e4420571b5f651c91c9705a8f +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44c223053a72e9d12413becd35bf1d7ff118a210 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7747d95684956dd6d21989fdf718e44621f0bf3a8ec53ee85a413990138072 +size 199058669 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd93643e02a0375b46f00deefe03fadbecb1561c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b118d805a2ac209f94b0cc2537c8504d908aaca34630bd0d7550595aee4828f +size 199058925 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c037e67ff576544970864aabd940c451497b04de --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:600d58c8ed6ddcfcde182b912b0877e95e54d186abc6743ba58748cd09e6fa8e +size 199058925 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf8038791efdb1ae8cdd6874e8f3a484c6b7c2f0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30e0efdce93d2a1cdd824dab53330d0119d5be6d0db117e910e6e983381b861 +size 199058605 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f32dcf11d9b2a1880443bbdde9597978bddb554 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4edcc5cc31a6bad90f2bc739db605adfabefcd3b28965eb8da568b82c373468 +size 199058605 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b756ee337f3a2c4c2120d42cdca32abe5c839e0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0836a7ae5b7d44129ec74278701f4f2516166a94e3183de62329ed1912fb1ff1 +size 199058605 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da9ab129577359e5510c2eeb7c02d44468af1919 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0974d04647b66ca0ecd65a8fb29a16e6f63ba1641410110c7933c3703b90de48 +size 199058605 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9da0bb66e59d92dfad999c4f5426d40fd5bbe47 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03507690c3194632341f26d30ec71db4e82ca71870eeef8a03489773b74552b8 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5b0ab115b5c3a99606ec79c71d899ff48bafa84 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2132a6956da1271c1b7845e80dc1848089e617c84bdc777995479d9207277e3c +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d021a79b556cfd82e0db2ebd99168d511662e4b8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ddc9c637de880508d6ccdb55f3295fc764a0f38724d9d01a2a3a6705a2dfa9f +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..018ef5c347e1d1a13da26c1ada44a7f913457dc6 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26bae119d2928ef53caf5d560ab827a09647704f9ef11e6d1cf598ad8c54052 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd5a968eb2e60cb75d9e42045c7a9c8fccd13be8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:506437ac85a4c36fb11a1b3ea89e13294991a0babbbbaecbea16073182f7bcda +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6acb77bae879b2745277086ee16ac5c9c91d5bdd --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc3ad1ed68a2df0771645b452f3b726d873c32ec65f68ac1fe651a72583017f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1accb9291114a2f18a50dc3416a350d4c42845a7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bedaeae1d66d2de99b5bf4f1f527665f273a82b8ba8a1c5c8ac31b597c54b94 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f606d5bd5b998c2e0ceafe312e3f6b56248a03d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beae4374c245d480363655cbafecb6ab108112945024da1e0a0d2416570cd184 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6841e2457cf0d4bbdd3ee67f7f1633bfc15bf3d --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7a029d913f606481b4c15ced377ef6e49e01d4e83d594c22a895e9a005a708 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cc290c93ef943a9e79dfd8b028229edc178628e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3d038079e7c8e651b93b2f0c8e67c1c58b410148d09264fcfda1115a93618c +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05f8996239c73cab49496b1893fc7e2df37a5f70 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cee1b39522b960079dbbf8b93f4cb23e7fdf51a1a9bc0d874e41830ff4da2c4 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6de947cbd299a293aa91488438cc9e04708f572e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e17dc4634c7e6cf1a63e1bb6addb7f241ae7ec05c6189bfca32bdba860ccc52 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9153519c48ae5f842e5823590d3aea3e938ec3be --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e411c866d860f57501818098c87516d34b38273511891330dd7e89d29585ff +size 199058978 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bf96db4ba2e7636b95dcdd6a4b5248fab73a0c7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f041b36f474e22e2ed1d8e344aae40322581763eac66d669e3dcc02ff318a808 +size 199058978 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1674ffca13f134971eb39e2c9103891f6e6c48c8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a090b39835116078665ca4ee70502bd2f8e5518b9e960ed04653e9f1a74e8547 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b05740c30cfd67a23b639953c1f553eafed088e0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4963749f1355fab61a8cd4621301cc7ddf9ff3f8cc61e8b63a703fac9028ec53 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67eed15a0641f57668a7a377dc5071496df99eed --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d84d710bd9372a517f3d07217fceceb3e414c92ae43cc781df2729bbc13540e +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72a18233f07953fc1994661902b05ffff31273d1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69dc41f9867d2ac2a8c6071b623e04c1b63f791163757978ca290c77d2e08cd9 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e89e3116db4a7af0950a50a759005c44364b66f8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d060d9bf64ffc69b877e8decdcfc554b303c807cac7b73dbb7877afcb4e2e0dc +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..570241b4617987406b5fde2173f78aa01915a160 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddff184d6176c8bca2616d9b7ad43708709d218d23e38d163ccbab84c540790c +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bfcc4b99a748570444293004c73705b9eb0cba0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e04c746f6971cd493de7aa09bb3620a3b7f82d6b1d340a16f7d4e6c1681600 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3172379d86b70af34c99b5013fbb36cb03379739 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ffe802333ff36bcca6263436c69c6237fe50befa04e69fc39f2640754904863a +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..613f6bffbceed09824f9c6dae029cb125c38fbe0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aafa4bd7b4423f80f82f7352717072503218b0212a5ce3ffa47cbe6da285c3d +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a33d53e8351a5aff1147a8e4e2dd65ea3b493cbc --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83de40fbb616827025467b439330a9536be7708fc03e59414aaa5a0560ee5859 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23e710a7ee988c958cd6dfa358ac60724c5a662c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636d0fd3fc4b05aea0a20eb6b02537d818297adb358d78b2603b3bb0ba415a93 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f21542f2221a6b74f01d9dd7913777e43c9c896 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec56d39358a9bc9187556e2f9e69a9789d9289457f682c3a9a1117ab59292cbe +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dc3bd261eacf8479599f415afd5716022f0a48e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03cc525089a4a466f7ab7329df27be1f29cf612975de22d667d4df186f83547f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..868b6b4fa20f840a5f905b34e02db3900f781df2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eed6fc1dcdd8f2e76b8fa40a4f27ee80271835bcacef1897793636a1d391a2a9 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41c6eeb7916d61d0d24a971d57a535b088aedcee --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52b79f0f9e3da630886d3fa3f0515f1dc0d92bbbffb5c6b3f94368fd7794eb27 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a6c9601184b3e98c1a9960c6c66e1eb6a022dd4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9631b5cfc9cb724459eae8c30858d8299e41ea16578149bb19247e6a0691608 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ad5fbf4fe0bf634f3c08920ab0686787c7392a2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1473d72c5b47b2c27add998302fc3c8ffa81413dff8d075ab36213937f9d77 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..798d7903986c810b64365349b4e3888b763b5981 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2f42ffcdd4b0deb6bde8011666ff1eb1abf629e6a013719cada072208b5298 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f907d4d0a62b772cc89da3f893829861a4baf216 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131853be42126a7ea8a9eeadc273ccd0b16b4840f0fb18194ab06f1ee740b3ff +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3972cd953c81f8ffa9da23436c7408766194a78d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58c629d92cc0f659c3549f4b3b5a60ad07bdcc42406d70c48b58719757af4e3f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77567823ef03a2267e8788f7df1fbf788d8e568e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8448d2f4f3be0f1ca002c710b9a3471850e2903cd972d4ce9bad769fc5ccb0 +size 199058594 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08209c109ee786e3334a3a80ac3a8d6bb6389483 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a2edede23c1e50493733b5ef66e9e22b64f34f0e862d5a706c427c321fda8d +size 199058594 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6a41b60a4a91d58f40a4e17eee6af3a61bef5c9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e18ff41d0e7a28fab1a20de17fab6ad31c7dd7bd7540b1b1a3ab88a7aa33015d +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2b96077758f23cd62ca28e953cbcce9ae8b90b7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ebd5a0331a9c9ddecacda544e6bfc5b7fd323de9e3f810a001c0e56a07a7775 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5931d5c60034c4a4bd97b8c98a4bf8d6ee9ddd1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e53e0b80c881fffe97b79485cb2934019cdd907bcca4dff4bf4c557556402be +size 199058711 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0a451234576f733efa8e5dfa4490f2700e6d9db --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e2a3b17165723bb11b33398ceb4000bd8be9b8dcec490b538088dbd21c024c5 +size 199058711 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78caa33b53d139082bf917c83f0672466433a0e4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b5c24700793d66689f747bd9ba25a7bf2cd3373768dc125c99f7f98345de1e +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ad6b6c0fcabc1b57fa83bf700512c88d1b97598 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d43da32d5c5e6db3d13ed076dadb816706a88accfc57095fdbcce7a4119755d7 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..830b0f947d51b59570d9403181ce1270dbb77011 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:412b037d81609ebbfdf383238bdb34d7a91c69cfc907b462c2b493a007e5c22f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..869da4a628256a27bfe1172f6685a48f1cb549eb --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8892f5d10bd7938f38125ae522f5714ac246eb26b0ef31ab2da97c7b25200219 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91622464bced6c30110cd61523f6a93d6c44365f --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75be4778276f288e74a1d576198cb5f8bf8e5d1562ffae238c8e13807e58b48f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15c9d03390eef6c2330588143194163f8733792e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:675dd4fced29141aad4a2918887b507ca72d0bbf1d337821f56d8997880ba471 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3696d19e11786cf38c3d8435f67015ef525a83b --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:608c0be4b244867f0ac271d1fedcf38d0ece5e816f30105e482efddf0da07f18 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b160990f3ec19caeb9ee80ae6f565d4280560422 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea43e76b11f0e39efcd36f7ea22b10dc8b19f6a63fc182219b9fb247dc07e6bb +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..501fde502da1005685c008bee4936b2aef53d91d --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd7ce7019733cb743b31c5fef29a88447295c39bb068c2b196f9e90929fdb66 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7a0798accc4989637fc1b083b22760ced603839 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72ad040e9795de3a852aff8b5ded7cd51e6de98294d56cc4001fb2f67f5f11d +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c92be9e5d6d2c83db75c2fd82e045bec42116460 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63445f223501484e72fae6eb0d6631ed38795bf87901969c8b4834a8d53cea43 +size 199058594 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60b53d9255bc917686fc83c0a4156cf07d958b1b --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12134fef76f763ff9cc0cb3c29d5eeb3529d9d2d88ce3d66c7872c5e8159f3e +size 199058594 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a90531819c533cfbac8a33a5b0265b10ad6f69b2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9427abd8afa1747ce234b8b084bbee9c0abc369c36019ae97a1caf74b83a8115 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a21e0bf0d78d614a66a2b71a5338d5573758adcd --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02224da7597ce60d6b7c9093e8c0b3ff2ec21627afdec3203136030fc1c37a51 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65bf21965558c922f99a0ef491376c8442e95874 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a3ae6aa28278e13b920167fa4561b5c543316018146b65bad9557849f4c3697 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4bbbed83b8a002f85484a72d7f0843c457cfa68 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013b1ce8fe586fa10431ee6d441b7a858ee9684601af3a2590c3d9c81df801fb +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4365ceb6b5b633bd82355794bc8bb1e15070f7a --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a273330cf134985564b6fc0226c6f58486c135e694c366acc35d888c78934b74 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23ed91dc3e884963735953b3565386f8458ff50d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:900c9d270d0616834f4029404ff5cf05feb02ff7938741fe81fdddde89ba31b6 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7473647242f7e1746e71752310723cb686a05c52 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae53c8372fc3ef8803a62c1b5d458a21d00c25251d98a68964edb8ec0d67ec04 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2d38f38cdbe5b40d60445b10b31afc51ab25d20 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc57cee4734e01f3873cc817b33bd0e08e1a1db646535b748f182bacf22b1566 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc75d2d00a91bce7a67e0051ddf4749646c31c99 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1709c588e74d3fc393181e60ab84af87bbf78552119779e36f752220d9d0795 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a9a0e7f7f47bfce30914a593ad6f01fc867a7d3 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7babba0fe66179f386663cce7b4226e45f3487a3e6f563ad02b9eef0de5f5352 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6df87901aa95677ce43abec416be3c42a60a50b4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79895e7a68d283aa845478c5472abe1a367091d518a19239b31a22c6fdd5f285 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfa6c64006773f787c32813fc97ca24b63ce5f95 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53083c46a3ff97dc6141d97b3a65be8413d2e256d761c6ddd1e8727cef0351b8 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5ab056e727eaf5c7deb7fef327c58518856c483 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e198b232836040c6988e0565b686e8da728c226eb6d2bf5b974e97eb3f31c3 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3d9186a1a835fbb14187edbdb45e506175f00df --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8299e663d4d075d8ce98a93383da68efca0a972e71a8325ccee870342698120b +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92a99014d7800ed9e8eadc9c8bdd629252582348 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fae0b8f3b0851f0ffdd3835d006a52b36652a14ab72e463fa024bec9b0333cf +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71ecf12b3b93d7c9ef1cdf746766dbad756560a4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af153215ff5e8b2053e9a5bbd9842f6388013dd5a6e05a9f9440e9bc74c391ea +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c669fe753e2af983bc562ec908ad45a96d020b06 --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d1d46d056e5e5c0f17457d878e267f8d735c3d63102766285f55041bb0ab56 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb0830476960d79c45021de014e2fa5a12abbd43 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02cf66cbaa762b8c962033737fdbb567a2aad2c5e43ed9e815a5ab6dc259c905 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ea953b24bcacdc5eb71c1eaca05385bffe7b985 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb615ff7486b7eceb67166f9845b009bc1fa6ed92c1276c846367f3194f910ae +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfc98d4816e57af699887c2281e79919920f3e12 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337c350aabf12b74b68e984b79954b58106c5040097e13c82c1900349d53e85b +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60e5d889d443c2f4b7cb88a2a03ea7b2cd371a25 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a4fe7c4ec2011ef0d2b2b6f6a864709356ff5b20742493c804a58cdc3418d9 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6d7c0b32960324e0214fbb86e1b890f3b12bd5d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:279da9a04ea92a1822f7b3566d3c1aca2a4301efb501612751c0a087fb441bcb +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..935a6be96862f124c4e4cc99cc9fbad6133eef52 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bae96d8ce57906bb29692be1f6f6dd11261d80b40068cdb3699d3495059cfbf +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8e7e1b81d316fd79f5bc9797159aed22d40fadb --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cdcbda9d1f0c02ac0ea916a6689096856b5889201269beecca09a36c122a32b +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1a9d1c0111733f519515a80ec33bec5278e0ba1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73af66b338076c88891a7c7356c91340e4f0ef28dc19f3c0d98b8dab5822f33f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83e0c4288e416ba0fc4aa8970d4c317b88eb9385 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17d722942d28e195416e64425fe8f2859af1f5d4b8d51b087c8df5130395030b +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dddfe3415d6b59da5328d8c2de2d13fee2d1b64 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b430f07fb80e63c230fb64b0526808c0a3e35185bff682192287fd8eddfb527a +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..694e444e28b5065411b6c0ae30351a6dfe9a6b14 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3ad2e2d3f33e47d8c394b227938e2dcc98083d92dd1171bdf50bc6e6035b332 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e66db8c93e52b74cbc490b35a7865cb489fd3c7c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e25def36d5190ce26d52db739119d36816a650a0e564501c94a80e192a70b7cf +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2914157888fc89cd0ce426f185b6d2a2ac491f7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c411c194cc8802692eaec0438ebaa0db6502a3f4e6186cb994a12c9e8dd270fa +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99f7636a929b1f617f46600bbf43015b979f9b70 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa50ba41f379e5ab13bcb08d8cae0ceb968b02ec9982fb05e781c8dfda849c0b +size 199058775 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c5096bffe3637fff0f6ff67638ef25fa364c76e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9abd9801f4a4533ff3c48cd7b497786be901728c37fde93db309850844a256ae +size 199058775 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d64d986109a357a55c0525c004d52d23045fbc45 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735b08df6a92ead09c90132ce7c3472ecb4863bc5dbde67b500f12a55481e51e +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..299d5929ebca482818ac3eecd3d0875224c524af --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33682b3ac16615c3095c05f2e9d05f92234f58e81c80e0343e6ad205d04366ec +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ba8eb34e0c2a910fa0723eeb5feec1c06ed60c6 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2184bf23eeaf0e0378ede7bde312cfa1e08d6173ad0f6211ffe553c850a79d5 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ec09c982a72000324c7390c61eea685443a1744 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4621aaed944b5881446759378964fcf8a7bc893652dd8cbd4988bd166a098b95 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ad5b238cae82a8e89cc1ec099081e8cf8521e7f --- /dev/null +++ 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a80a096154bcc65616bec60eb2e9bd82a7cc1064aeacf8b46bfa92e2fc41b8b +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cee505deefe3df1cd309c5c112987253d2fe7de1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:444325af92dd22fd6aa408e7c852572977af3810313e6ca6feb3fbe77e432e3b +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a91fd6e45d3857c7be4e9331bae88b0dc1872977 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e38063e5c4536c6265a402dd7d7041fe9292704ed0679720fa2f38654ada4e9 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16bd6049357d2a7dc2445d92cc2cee2135e834fb --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76f05571ea768980b676e092f51053a4a4c1f7e66021b9ed80ff3547ddd6380 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a213255be03c23b020645b093b8db9fe19f67ac --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499e6af83da6088c7d95861169158d5e5e94811c08676e185564fbbdc534c82a +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f65d85526740928596d2560f44ce7bede93a7c0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:075385a85f81dd2a1bb8e940c5f9d70b9efeeb3c84f288f2df2fd8360ae0f57b +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..249e4c818f037a12a8bacca9f1e5080a9afdab10 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dda82f4a542adf88c324043be6711c907b4278f3723361a94890771bd94d32 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f206db7e48259356ee7b67bb2269ce647f7e504 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e3848f6f2153fcaec38209ac2807e1dba60c66f92f2bd9a8034a3fadca83a5 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0013648a39c9d6c56a6fe742d22932921ea80543 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8af5aa94569a2882a9fde4a0ba628285b226cc5875161f2e62b8c2a8c889dc3 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c0fc266a088b301e67320502f592f04bf87622f --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63aa773ccfee251ee10e5ef747759a3fdefc8cc94dab3475e3bfc7e8a2854af9 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ce46b46e30702533252614e1004dacac856fdd2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2edd2019a7fb9717740c375e8cfe72c92afc6eb74038ad0c9ecd32ac0ed414c +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f93ca9b3f101350211ab1e9ace701708834597f2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4154569f78e78cd13cc6168d3df1c1771a9d2668913e95d97c15e42c08fc9a53 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..758a6a5ec862e5a6db0f2e32dca8fa37fd453c2e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:629e655c9b53f5c3cd9c9afbadbf9a930aa03ffccb91f7d32fb6d7cb763ea96d +size 199058914 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa8346419c7f67a0416f4162048d6039f9d94bb2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f99cffaa085c68b763a4e4a16625b5fb14659d24f7ff4c151e44971b9f37cb42 +size 199058914 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..313f20a53ce916facd0541b5b957c82eabd9aad9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22924ff1cff261688deaf13ce4b12fe5143efd270931a623a3ecbe361b41927f +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..025378d5939c1675e959a71f2f454bd7c08f3237 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f2357fd484352b9dea52524d14df0c8a93548217f3474912f2885269f881640 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0288b539099676db48e6376eff35abc532da3d6 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c31f4940b6f29fcd9de86084e70c5005a4bd68229c446a5d98cafdd944cfd1b +size 199058711 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..717e724a968d6d07247d5476684311b746bf8c1e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ceb903f763b8f60b4b5daa88eb15b173b2e005b8de039ba11c22e254f73a962 +size 199058711 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dc9e48bb672d0e1755f9f12b7dcdfe8ff33f13f --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5692e1ba89bca55808312a09c4f83725dcbda9922d7af63c1d494bc8204d7925 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f63ecb54f0fb25a84f13c20151224d58efdaa11 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a719fbe712166efe2de4298074a8182a85981ead9f69194b9fbb8efcbc9d5c1 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9f74ffe2ead8004192254af6de549790044e964 --- /dev/null +++ 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57dd56a062a8e76ec1dc32d0d3f5623ba8fda67261dae2b35c518b550ddf3bd2 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa7aace878107c160168080e8067ecd51e5a855f --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3448510da578effcbcfcb43b32ae052a7077011b56f7a27aecf090be05d0afa8 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8db87c60b3891137abc1281448fbb26cc7b54f42 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f0d2aeb7b55a045382429a83126f6eafe966b563f451be1ca45ab5d80fa73ef +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09fbbaac433e6c14837ea4a9f9f8026f2bd76094 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e84109e167093d3ef742e02728acb214179a6a10b1e9cbdcc47f9320b84a4d1 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4c1e37ce4ea2fd70fd198950794ee874bea96b1 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575d222a740f67fc7fb78b837fe1da4f5cf5fb05ae9b105d2af3037f6bbb506b +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd026a9feac71b47c9113f1840d63ed778ccbc7a --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48daadcaf893631a147d85d90d3fa537b5b02537c27064836efc89fdc38af592 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fcb19732f7463cc5dc6083353e2c4dc75be67b7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bf6afbf1b453204ffa6ba0ab0a772b4189845782728e80d16a96d54ecb679d +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef2916e612c0fa2a7f42d97bcc49352fd65dd710 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f5a9564784c6cb913b8d903b33e192c8997d7c1352132e2b73f1a7bc50c5721 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dac636951ed8c264fa9e58c0ba30b6939e6eb526 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdceeb2bbedab25c88f3b05e8a8ad843ee38c617d9346254264c91cfcac80cf0 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3178a21e61b4dd4a3e7e155ccc431894790306e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17775a084f59f8f8c7e93204487f2883851edf493b07617dbe8f6dafe84e3a3f +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acb7f790495e8352897e8b110c1bd64ddf4ed6c2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df6302184767f6318946d92aae1015418580b8653dd3738ac986a0a7ad79940 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84e3727972dd12dcd625d518c02b76bcbfebc04f --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0837b40da58582f64b6e5cdafdacd2edae797021cd90696f2a9a11a4138c706f +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddc024ea18a0fc567c132dc7cc8e461c638e47c9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb7b19e870b7b93ec099f627521d03ef7f4b6b104c6692aff1dbf5b06098a1c3 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..317cc77b85c9ecb5872da343ba0047cd4ef6bb3e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8f3761887319b20993cfec8905489c96dc5617a062d6809a827434eb1cc01a4d +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55f8430e8328405594a34cee4c47c3dfccc46473 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e654e50b86f53057fbfa85fe658cbe261190e63e0c385bfd4a091ef023bf228e +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9f061fd2c713118ec60ccf28375a31ef57b81c2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1e68ae9906f3533819ba63013fcfb01ff3db1d737c88dc7f0067698896422f +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd4c00813aa2930f415bf6a22fac33c92c785c09 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00232bdd3072ce574096d154e6c3d745fcec6aa1bcd6d9de6889ac01f92a2de +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a9d980e90604065679517a6073946729f5bd385 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f599d44f00511513ffaead340113b66faf4bce7555ce2db0ba24f7ad3f2ec78a +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a4810b6fb89b059af9ef8bf9378b2ce888594ce --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:986585f9bc2e17df0b30ed2eab431e180ef51ce867d50412609e8ee38fbe1678 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..286e8017a55d8f1713557389ca46297657636458 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1933b206df882d6812202e56283defedc1d0a837e63d7c2d25b869551c67a69 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ae8d9ff3732f037df4ca0f3a86d6bcf0b6b8d6c --- /dev/null +++ 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40a669267fefa848cedb5309161b72bfcf081abd51c209cb8a832f38e02d1dd8 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b45292d8c27dd40a9c8b82028643e44097785ab --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e995e370caf6db9cbb37840605e78579c491e61366476626e65fcab24c29d035 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e1289de696c2772f82e9b7057457486e6042e81 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d91d596778d61b96dafe57d41b554799cc7ae901778ae924a491de30e91a4dd8 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4f2d301d94b42fa9c96756567ba8c49289e5ec0 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f82d07e28b76e2d252650670c21e61f58a90f8f91cc56a4199b221f3211f6190 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53e6d70c716df05a7dbc2ba3d03a0268d2ae1480 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a14a618ceae4c4f7a9d10b65ad9a433b262e17462daff0391db1eba69d1d09a6 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1e70d268b468908553b912bdc3295042e5ecccc --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c93b5cb80137563b3ae981d1e9fab5be1712072e324eac7c404b86f1b077f3 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4910450581924fc0b3d4c1c2ef2a394589fc10fa --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93c67d6907cbf9774f764d13bb3ebea996ab82da201514a7139e65f5b7ff20e +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bc8e439409de3b545463f61e0f14f052c753f9e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22863c0536abf07cec8f0df6d5be241d87492540c1f4f59b7062da97451226ed +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7f9db2d46e9504493f58e83fdebe46a7f55894e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15240c94b3768b3ce92da9dfddcaf886827ab260287404322ee3a54b8581d5bb +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c88eae4ccdcdc62b2778b80a5b189f1c1a5a106 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5625e9f3bb1e97b16ace8fec60eee6bf0628ebb7c84cedf51915e3951e0c9ab4 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19c77769e6c3cd6326b07b565e361e377457c5d7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e53dd505e96d83ff8cc0c90af7a7edad52403e09954e50a61919f7b11d0287 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00749e8a5bef5edeb01ec8315df4ddfd7c20b9fd --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac986a88e29fb48d4f6ce746ef8b13d0e4a7a86c3efd24404fcc4a78d7eb1e95 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85b362e0eff5ac6d61886232e74c67d8678d3b39 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9634d56fa97c1ebe2d1313c7d6d0fca7ad9e6366cc848138926ca9292f37265 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ebfd26e6a5c5b1076936ac6b01a68d1f252126e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a2de0967bcb047a44a9fd04957d1195fe7d0a6966f4d572f31b05a34078a3503 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8673ffdbc75dc10759f88f87d5d736821be551d5 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3981faa8cfbb2cb85d3491b84fe15bf189365cb094ef859e72700f2db4b6792 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dddd3e3acce61932f70ed33e31e0433a40ef2830 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2326fb213ecc645a16855b7c36cbacf0098073662fb93d94b2916d563236dc14 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e06671e5ae028985512986562e3835ab74e9d68 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1973c74d7e3d9c3f66ddea17daeade230e8506d9a04b7590bd20430a04a37c41 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f0b23d79d0eb0ef24762d2598547a18ddab26a7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edbe0c55fdee6b840b49cdcc2a667add389518c0e86c9c401ece75aebf4dae66 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9992d590a7829140ddb1e0ec84ad1ae5a27e7bff --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90251071e36c09ae1825aac0ee2c552f9ce1fcc95894bc1f9e8d4ad6e12cf21b +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d12dee37960899789fb7147100dd4b37a692876 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed802f64e22c960669aa4ae72f8555af98e8778ce661427c4a2de8f8e415a0a9 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e40315f0a0db193705eb71436faf3f615f26f3e --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52a2f3aee832664e157dc4107f1ef1b935ac32971a360b1d21f6db4db3859553 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f7bea441d64b1f9d52504608f099f7d2355f30c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5388d6f68b781d4214d92afaa1726ba19dff8b08d155953348ad3434f4bbc93 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb5998558cbe6f141f870f0c704d50e9ccf0edeb --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee7f0caf0fbc7e8b2a334bbbeb333093932072c612d362d6f7b122802dee87c6 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..839f4f16bdee5268c805e95f0945d39f17b36d1a --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1dfe93e01017b6e76be27dc4de087c302f836e8b00fc24ece2505796dbcd3e2 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6c5b500214bc284effe61a0002a51811fcd2931 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:552f8899e13030c9a4ede46b63ed04aa6d4d0a593234db927cca37398e504f16 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47067f18bc79f9cd31258dc241e9bddfdbb34901 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9d482ed66a58f7bcf75968dee8c4eab823f46986d7b1a8446149cc3d2e52da +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4e1e3ffe113d2f3c207ff2b69824293cc764bd2 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ccda48e7692dcef41cf42a589af734759fd401a2d2416586478b7aa5f890ead +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fae0ee613c7d63ed4fcaf819f4957a0b4f1eda1c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18961a12c6969eabbec4e69c41246d73ac3945aefab708371ba6745dac3cdfbe +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c627d82d4c53d51c9ca5663e39f5d0885bcbc6a8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dccf3d7939855b6465d3fd2004dd799a932e6af6c556f6fcf594dc3e7a6df4a +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f8026766025b91cd0c414e325952145c2dd01c8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58675c9a12f904506cb669bc81e4ad51860bfe585c7ab8731b192224d432c1ca +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db0cf6a75286421a5190191b453d0f195767cd25 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2da07fd3fa726d5d06a8d58aadf2f751441edf2535ffbe7833880cf5d18342 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c798ebd1a0abc7742620df901d32129689b42e7d --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13bac3b2751aace146e2112c09c1119405151789db68d5e3258b79889cf936a3 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1daad754748f45f4178f9b1aff01ca484bb33021 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c01700bf0682d8cbdce160e6d7dbe6373fbadb742a000282ed88b3e9f90c07d1 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26c14d5b3ef0e9916ea92be2b8b5f4b96ed1b7a9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:64d8823b715f027f930b9f407237b1b675ac508162939cdf323bbdf18279bede +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..407e66316d0ed1c5a8d1d5cb9b6b41a7ea03c273 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f690a1c091106db3aacd2500af380f9247dbd7c70e881268af5ba41ab93578f +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb0331e5c19b1cfa28c1508da1c53b779f8a72e4 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3530b5840ad5fccb4e75121ee24014a2b1a93f27e896f29944bed1b4f21d15 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7dace79d60d51b3895aaaff28ff0cc6e216fc16 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f970e821c9f01253fb8725d84b7a1801373bb335e8b151e86e681d4f5ce8df9 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd484e287640e251b2c5fc7de3674d7bffa090df --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f2cb505ce159fde188b50c42a0d48e91dd1ff3f42fbcbea62d013e4161ad9c +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16d43bdaced33d9b8d15686449466e31c4f25e52 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5666b7ad65a9cf94bac7dc7879ecf8230785722da82aeb1f4b8c69b4f1b8ed36 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c21e391820ae6c63e2c79a22893a88b3760ab961 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05eddbe3f983ac45e25655f1fdbfb3e7b373ea2977d5d1eab6c78aaefaa70074 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..657278d85e9f293cd04151145a1bf316952f30e3 --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb7a9c98026cd2bc4928eb5c24b6281bd90b4133e22aed54ea7497f5f44b6bc8 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40ed9d0ba9bf575a849753e5a1d3f45016309d69 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d79e3fac119ed0b61e0dd3c4c2c8836d4ac1dd9ba88a4dd2eea0fa0323090e3d +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0d02c10afc450d1daf80f113c8d4fa9fcfd333f --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:319a77a82cca9fddc0182c92f20be12f605e445773dd6985db662ceaf557f91e +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4065998512aa0fa472df90bb2c06d377f419845 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d613141deb6d6ad4c3f55b6d5f16f1ac8cde7ae7f72965fe0afb88b27c456411 +size 199058647 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dba51767ca4640cc31b9bd558d749f98d14d7915 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf1ca7d180842b0d963e1f4963374c7c711783b6dab2fe7dea5eb97404ac2cdd +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8a3b7a6fa56bf6d242d6bdec2691dfe29c433f8 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba73bc711a89ac7805da273944bed057fcc67e388b48629bb43a9a0381bfb58 +size 199058850 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ffee8b274dd8d2e02d9937c17168a290d9fd9c5 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29ca4a883c2dab8c99bc10460c825875c37fa9cfa4240c1519f049d02283a78d +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 
b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0efc0577212907774da8a99b35487650a74471eb --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8826682ae99c9f3ac410954a85099843abc2a46b1b88d03c3a548fdfae1cec7a +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb93ef6645d9aaba0a95d722c76d998f3cff9b53 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222cf12d0814613a76bbe450d250fa5cc33f144c9d765b92ae44f33523483546 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0793d9821a6dce53c28b1383206347e8f92f8fe3 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:767840cdc60dd49b945fb130a4c7c5e828b63fe28d8f8d78237af6d7e8d1b9fd +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..228691c322c6444a4baf332c855c905846418a49 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b2875581033e717bfffbeaf220b891cc3999c52126eb531db694862a004945 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e07dba607bd202a9f49e1306e6728be98fc9c78 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f34516cda09694d1d5823dd59488d8deae6df5c04c8c87be2b0868159d2bd5f +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff17767ee0769ba25b309204c9d37267592eaf52 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:387aaf2de829ff33221b5b830d17d6eccdc121c44f2404d77bf5cf55dc9d8737 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0827d368cc83b62a0b79c7598a23377901cefba9 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9825c5f55ed92e283d81f690e73c193e9957bd6c254bc4bb254e98099db09149 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0b7187447912899e7dcd7602705446e32777d06 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f243973ba7aaad442e330355b2b25d14b2891d659d6612c97d44ae6d0ad2492 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f201878b8743642464adda02a7aabc59c47f5fc --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5073e7e41a74ef7541c8bd3d0e1f27f649090fc7cb0de203e3de6fb30d3d54 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..114c1b3264fb4aaf21d0e1d3dd03df877bab1c92 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069992337a69ec36c91869e16c53b5cb6627e1d98b10133be05d43da9e5bcb54 +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfff57d4c54a668ce9f2a95855ea5ae3ebb2092a --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb924847490e4a04e350f01459b4c2a2f8403afc9fe1db9c5fa5a122185b02e +size 199058722 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2d327f2e1ab3711252634103c7cca90a7dd212a --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f2cf290a205110b7454f2cedca1e60722eb9fa93720421ce45d1417233ea2f +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe98070db6b545b6b21e0f9e5411fd89b78f1bf7 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309b8d3e44a757e1c7459cac187f7316340751e3d48faa6436ec96edaa377273 +size 199058786 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de02603825873951c5ec1a27db6e7d9834acc795 --- /dev/null 
+++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52eb0a70b5606ca33a82511d9171db625d25b3e24586ea435d851e17fbd967ed +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bc694f342eae8159c7f8369c3bb31fcdff87e8c --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a20b1d9a1db09cbb486051674604d4dbfb1d678b11ded08615fdceebe281ba4 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52b24ae5860583864fdfd7913ca9bc56daada365 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d65ab182975574108a19801e2fafa7234b197b944be7ee5472496ea38ea2b2 +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d83d2eac6da13079b9a360093a7555da3a09c47 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e94d3552bde2dc863518d9be4d8b0760edde8ed29cc27065c9014664260c0bd +size 199058658 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..638e3c4f5c444a62e8bc0e7444a949effc46ff3e --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e0f9766b4e83e37acd7068d195a7bcb42177401774dcd3e2b06348c1b82f7d +size 199058839 diff --git a/4b284b42bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11e3cd7c42e1f285b7c7accc76cc20acfc054801 --- /dev/null +++ b/4b284b42bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc24ea2a512c26f47b8806e04b9e535429fe25f9b4e9bcbadbd09b135cccc4c5 +size 199058839 diff --git a/4b284b42bc4/global_step80108/layer_01-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f339e8e30fc798297cefabc6f0ea13b3cb2fa284 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aff583711d7f020e30de1152083fa27e68ea95ee5da65c85a838849c7fc96dc +size 167511299 diff --git a/4b284b42bc4/global_step80108/layer_01-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4dfa4be50c3e84cecda0c95e3552e66096ed20f0 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5006870d8b4d739978e975b17bac05350b3ed0627423c29a00b4c209d4ab8e19 +size 167511299 diff --git a/4b284b42bc4/global_step80108/layer_03-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69fb58bdefe4f843d289ffa69743fe2479a76aa3 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e352dd3c0c4b9d3e81547e74d191e23a737c90d7b90dfd95a04b06222c117c8d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_03-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cfd9b0f440062be754e6c69e5e1b0198e879f4e --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e060e5104b4fe6c50b27d4119a0ad832b0590f9613830cc0ac3f0f1e1ce87bd +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_04-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0e493990f709e5f5640c7d9d11efd50c55433ae --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1102e1ff6355461699be2e4494e8fc933feff1752bf309fd53f16353dc011263 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_04-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3537f17e6eb2d3a2ce863efe192da3849b8b1a63 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54b5df994c7573160c50228953ba81614b6b727a6a8bc052f5d178683461d71 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_05-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ca138625bd9894a2fba358154cb73037ba4980e --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e5040553d0bbbdac54e37c5677b67f636ac06a98e28f9f0725469aa7ce27dd +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_05-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a58907f117fe54c82842f030233ad76198f953c --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebab22b2a1d328ebf2e10585cab8bf42da6c236cfa5ea6219d7bf781cd8f1046 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_06-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91925734f9072f0965cd891e914f3631569f5c54 --- /dev/null +++ 
b/4b284b42bc4/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6023a7c165d8718d4ecfb18de901754cce0c27c4e2fc232f1556e1a6246489d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_06-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16ef527426babaef30f57a416a6f94d7842d18d2 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2146ea8159f14293775b57dc5c25ea17df8b1d31006a5e5e551ec0a683fca7d5 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_07-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e711871aba3c8adb10d1e37096a76ff0033b775 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74a4a9704d324cf4abd271ba65a55c8455dc61dfc911464a49f8eec123a9455 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_07-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3957619d5a89ca48abf35676f5d50bf72f0250c --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1d1f3aa0aa66e8b4fedcd4ae12eec61a5882635d32d46a8aefa7870a21b869 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_08-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77a04b166b6e8910f9eff7404866f30ec9433fd9 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1f06558b625255adb58afcb653291bc6bae6a28953ec46ac0351b544321620 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_08-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54cdbb7f504665b3c41dab1a0f360b38ddccefd7 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2d967d8dec3569b3e430372b0a9d2f85a3d24f0f93325e52ff8ca21c4fb10f5 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_09-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..368795789cae9d8408bee71450fdc594b498706c --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:415709fbd366bba4425b0e2faa1ec3cc1ba51bb3333005e25c1aa345537698b2 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_09-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4b987bdafa2fcaf4d24819daec264540b5f38b6 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:93c073bb741f8f653cc32f53376afb95bb489ae46de2f86e66caf134eac96c06 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_10-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae424dd4677944c684973c7f4f16ed6bde3844c8 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b28b46936a549a4883f69f9a0967d632b6f1c170b4156cda41d228fb581a04 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_10-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0534e3bffa47bb1e2e821b4da611c00b07e77a4f --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:259cde6f7ac9cd7f82a541e9debf2aa38bfe30d6817920fd4e586ffb4655a736 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_11-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f60ab3c91d8da162e8a80411ac726a1daed211a8 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee28b0772937ca4caf761a7da4353586db8f0cf55d17e22bf8670421e847144 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_11-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..307b9257dc5381b712328f78aab94fb14cae24bd --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05f27637fe4b0e91043394c6cff5acdcd1077185e327846528b644da4e45b64 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_12-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5eb49ddaccaa412818b4108af83e58ce4ac33d4 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae08f56c8aa6ec043788134075a61ef6736408f6b125694d326998e599bf54c1 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_12-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5b7cd7beb8a38a91a4f171142dfdd3570466586 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a26ff40b86f99e7b4df145cec530d6f2734fbff7b5d9581c337fdf065d2d3714 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_13-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..510b061b8d5ea742844ed9bf67d33ea79f6286b9 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd3cafbb308c22d1b752d51404c40df0acd9f7205014d0c03fc53e2098690ce +size 113308931 diff --git 
a/4b284b42bc4/global_step80108/layer_13-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6aba893bf26c9d2350c0159bd775738bb9546a4 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07ff9ce776ae4fac42e3cdb90c2ded4917db53e8b0e357586d3c2b0a5d1ecd43 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_14-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19ef116bcbf4b1e8ba6912149db1cfa391e078b0 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36a24699d10fe9ff76c14867c6ab5a397ae943a424c15d5d9edaff703fc4372 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_14-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cafc1d62530e6b6d08da3bf996021029bfb9105 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a3ed160f59627052b6d352a578b7c602398eea6e3cbdabccb510b3812f4a7b0 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_15-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..799cddb893d037e7f276bed5ae81ff30180244b4 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1a19daa9bad71025f4d240e4ef09bf104eb46f0de82c8524147dc569863d40 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_15-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..084877b2126728cdd8b7d742e609819e144d4a16 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678df5d0cb971b182194a58af879b810b9bad8c2f9bef19c049c1c0d031ec079 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_16-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd4eb9638466edc18180c05eb4dd8d6a879a07a0 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e6c640574fc1765fba8b927ad706d6e3afd0be55b3367e36b615ac09b5b2e70 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_16-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc6cf26484b08c5facad4a2f72a6e862eaf0d571 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769d5b4591acd5d68655415b3be4b05ac459ed806344713e99ca4bb64b3bdc5d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_17-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_17-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..ec5623f38dcecc659d9547e55fd85ad4212b36db --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e03c8c36048541fab84900202b8223346d2a9f14b96f926a03045f4d2fc8143 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_17-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31402ed8fd559f9d064ad7fb382aa2089d2b06da --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83065f1d8d1352dae7987ef3e9d2cd22144da7d60b9b51c95d518255310b1c6a +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_18-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3edf3003fbc8569c6d51f31d55fa927105250cbc --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d207acd9ab18f1ec155815c3780c718e8ea5f68bb9101b1aeb8bc8bf6d22dd7 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_18-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d19c65fa27be61b74f83457dccf4a422344632d --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bae6ae86d2f9c4a2c1aea0106050dc3d6209ed83a821dcbee94a96db52bd7a5 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_19-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..688d5c4c0704025281a0ef7810a9cea483f65b84 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e86abf212217ef2be7f51b14004f7692197c84cc9d389ca5c4a687699f1193 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_19-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e89c2452f6d938e8e3ea5d0ce09376ce6bea3638 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5eae9ab824d713162b3d8a2107773d111752d807dfee5fe8f0e76442e758623 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_20-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eebbbc7141a478c2525a940722d23c04d02460eb --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b95462f2c5e66eb4843cf8048c219e02dda659e5abab65a056b6cde216a7fdfa +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_20-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..922cfac5e08b7b48b574824a3744e00d40e9e8a8 --- /dev/null +++ 
b/4b284b42bc4/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6725487469d7f8b0e0aa051f0ec697d1a25d9b2be6f493cd82229199c8ca9259 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_21-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ab153b46731e36ad374c3e45b3c49d85c72fe1b --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ffc19f41f80229b73273685d137e9608e878a29aaac8fff9c8d8548c88bc49 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_21-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e143b3ca470ed86e5cbf274ab3c542ed72cb428 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb0c6b2a178720ed5f47a0cf0a18d03774f8e6386d70cc83ff34b661ec19944b +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_22-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b96ec36726f627a96c8b8e8e77dbceb822bd3428 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3e339e01d011515f14e94b357410f62566476af945c37b78af64174bb9a4dd3 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_22-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5a774b8a8d62b896e5d048edaeb2068baf26aaf --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f448dd32eebaae4e630393494fb491a1aa6565e8897e4d6b39931ab391aa0f +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_23-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1870ecc333cbf57b1f77874f35613baffff4034d --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2957fc350f766a0498741ae48015bf0b2fb45023c493e6c2b7fbf73a01b9518e +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_23-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fffbfbad3728a88fc04d6e6b506ccbea534fbadf --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede30cc84c50dbd1cd1a905dbdc8f736df5bcb6a4b9a0bc4792eea8a2caa0404 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_24-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4838a8bfa8b705cad85215a24f5c5e6469f1830a --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fb72bf71e3d86e79f2fa9f2ad85a39c5658429d86a7682fd5e2ab32c2c396959 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_24-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a6310ebb283cd808606dc1b19a0a6f1c24cfa36 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f6d10909ed4a6344f8c2a8b44f4c175874a59fcf0562c3cd8192674790f4ded +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_25-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f8d9d45fa95a529ddc120d8d314c8e94e682b50 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db53b924f25042cd07d31e7f7d409f92be206c79d8aa6d022552e72ca140c07d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_25-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5bbc5535a4e88f90012e7f9d2c6c2ee96197ec6 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3770555e80ca336b3dbec0b4693354034db85bcd4a04d9ddf3344ffcd4a666d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_26-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c50d53aa32e52d3a602f68e1b19cb3c9468300de --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:371d9c207d11df09b066734ffa3ca11f9ccacb4612c6ec6dce71c5ceb9b5c8c8 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_26-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..211aa8784ded9f28f18c172839c670d939d70e47 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f08aedb83c679a531b7ac71f85644b73a8a2c633dcb22bc9f0c5482297afb440 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_27-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c025f310f2e9cb91c16cf1496152d0e40b5f764d --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829c51b96daf02d9c22cb8fbb1055d468d21ede7a5731d63e7cd4378c1844e83 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_27-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..829f8b6eb3691393b3739fc787b2c9527f2f053a --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93ed53bf84e33ff289a118a0dd625c872b2855144d8e4083b390a4e1f178fe70 +size 113308931 diff --git 
a/4b284b42bc4/global_step80108/layer_28-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6caa2f674c3b5900e7aaa97a58c87c31d0753a27 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:376299576b76ee86294796d4005aa80feb85d59cc16ebcb3510de0cbe553879e +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_28-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40c4faded6f05d213a26cfd0702f49ad40731728 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13c0230d33d3c285ed0fdf779c1268929cdb2bfa77b40b26ea13dfffc19120d5 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_29-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c2d1cc0c24c23d83a0ff7e63fdc346b924ed1e5 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1589dd5e2608253768dc032002b436f8b440c6ff35db69bf2b977df2fefd417 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_29-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43cc4071f3c888391ce4cc4f5dc6ffe8648fb905 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89b77bb1dd7abbd4dbb426d87f33d31e2e81cf359511cbd0a4439e9663953793 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_30-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52e34cf093638404580fd0c2b7873cc02e3d76a2 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d543d13e92c14626c265654999b2f4281090311a548ad363e3e1f00f135ed9 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_30-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d93e6a951159fe0ead98b16d835e8bbc9ca9a30 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a3976202a76f75765e21149a9ab00009deb943420f42f52f611c9cd3ba6d9eb +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_31-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ed18a6a4900d4f5bbee1604bb34d039749fe52b --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:061648dcba96a08f12e8b505aba12188ca8db156c693522714acc70975013f00 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_31-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_31-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..fab0ecc890c92403c95d32b14e889e1690c9333e --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20e3ce8627eefa364cd3a725a865419e52a96d5e58b98307f990f1edad3fe93e +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_32-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..642e985cb4c0821fbb7e8154e6f98afdff300009 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27840b3aee703f26fd88954b43f8d83036e09602a9a36accb2c85066b0c900f7 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_32-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..904d7e879d75b5cb3e409f39f5f6520da77fd83d --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db91c2cb122ed89d8c30f6afbb01ddf71594362d6fbbeb44fd1a2bdb2e918aba +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_33-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..615547d12d8e05e49fb47993bc86f57af09cc67a --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5af35ab8fad8566d33f826f9827404545e39c71f7c4b37fcb7743ea8a72730d3 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_33-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87ecd6d28e923c1ec4b03e6ea7d0a7a5fe2b1922 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a270ce3b8023f67f772522433e9b349c4e15fba298913e84d3254c24b46d66 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_34-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4266911467c636228b4bf7df14103091bcc8ab06 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:decf5cde64952bb77583cd00e18e6219bc767407ce85e11f55bec2ae6cfbd407 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_34-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfeb0e6d312c39df97b91cc39cc4ac73eea213c0 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b416d6da6b66dcedd4b841dfe4c906d1da79878af6872a5980b4ffc46152ac3 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_35-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28984e4065d7a841814eddf7d81a6d8aaa75e2dd --- /dev/null +++ 
b/4b284b42bc4/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3538330fb4cda8f0f81b103d03a65c52db176f4798d3970827c3a8c6be7492d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_35-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b587b6dc4f48a6764cd1859d5616d73b1cd05e9c --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da515c9bfd82d71bbb9c66d4416b54e99471bfa5b0c87e16b453fe5490f82bd +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_36-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..428c1445ef5fb53010364b75750c392045b41749 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de2e7cfbf9e7986e9ed772ad58bba63b8d8900ac9bcf8454992ae38cad47788d +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_36-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1e8a2c3f48da9714578aaa09b943110b8548085 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c81c2fcbbcaedb184d173149b0637929e960eeb1621a92c890187dc35a871174 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_37-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9527161cfb84c70569c80094c1b4ae3b83cc7ad --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cde19748786d2b8644c26ed9106e93992e143a5994bafe025c55d5be06a1f72 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_37-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef53932396fd2fa4b23992c34390a98a73f66ea1 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35dc618985d76ca7222d018616fde86cd76a352bca27b1f787b945dc428f2765 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_38-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cd35084c84a81a052a7e433bc50998a18f7ba52 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e476ac1ff03afb5bedc0b920b8b5bdaf151aa77d5226a7049d3ca1345eea97 +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_38-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78f10d77c7409e786c551798ad906859546f50a0 --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:821769d282b09fc406f19bd75334d4a6c62b6a29738a3a05ed68d35d30809c5a +size 113308931 diff --git a/4b284b42bc4/global_step80108/layer_40-model_00-model_states.pt b/4b284b42bc4/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc711c14ab9cdf7c787b65031d7e61daff0793af --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cb9a5411a0f7d4b2b628b2e53ef4dfd2fe9bc67d39cc234e18a3384c5504829 +size 13507 diff --git a/4b284b42bc4/global_step80108/layer_40-model_01-model_states.pt b/4b284b42bc4/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb00b9b18018f2a6011f6a05cd57133f0bdddd0b --- /dev/null +++ b/4b284b42bc4/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade02554950d401fdbbe5e251f0a773a978536534c209739249e6e5415d9f5fa +size 13507 diff --git a/4b284b42bc4/global_step80108/mp_rank_00_model_states.pt b/4b284b42bc4/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d314b2c69e58222e207a33c64449196ab485901 --- /dev/null +++ b/4b284b42bc4/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02edf518d4885416f28d0ccbb36f49545790216620ecd460478561e10d56fd30 +size 51443 diff --git a/4b284b42bc4/global_step80108/mp_rank_01_model_states.pt b/4b284b42bc4/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af0684bbd7f0de46b01125ca6c64b2b02dfb942f --- /dev/null +++ b/4b284b42bc4/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a9d4abe469d91e5fb896bb6b9f8fcf5040e00430e2a774f6dc04eebdc9d5200 +size 51443 diff --git a/4b284b42bc4/transformers/config.json b/4b284b42bc4/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b42bc4/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, 
"chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b42bc4/transformers/pytorch_model.bin b/4b284b42bc4/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..1dd85821fedf6b70beea0ac3290d7ba6f435c9f4 --- /dev/null +++ b/4b284b42bc4/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45bb761bdc2bce9534ba642323cee3556374544e312c44bc759df99e06dc6a33 +size 8781203669 diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..395698d7fa63c9f503fd0653fcd6175ed34e4105 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.37466495114095594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03116802815957843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07404480158534202, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014902788141002475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.31212436257013715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004568108823452381}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1135227535532633, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020226161176734882}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035086188446589825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009237445527173}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1533304291888016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032417614710948096}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05397463032306079, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012920845251919216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07143892188082063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013733993783630446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3047679674724977, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044697317892439005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10988632514530636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018918967086146744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07084668944691519, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001397360756298291}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2997116780220477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004343713168500645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10869588574047251, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019005624996464461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c0921ba63842dedd6396659cbad541c9ba09b00 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.45724995573151395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03077826387670849}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07380410032029427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013950698189371862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36979685211837077, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005157808096954796}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11499758191359032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018677715006027427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03456277372759004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008655744665680597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1825581650807761, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037692839645507878}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.053994419166879803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001195483166076756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06966555184597864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012665956123840754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34681007385001145, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004679678346399935}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10864238895715833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017176631324308034}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07031937468305927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013198255014853241}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.35067051099532537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004753075684894469}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10948950405713026, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017632128912827953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8d463f33d5cdbbd13390f49ab6b5546cf758a069 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.08094756912598497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002409102863540913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.14127847705177338, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017639762306188122}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.07376523883400579, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001716973465887151}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.001962954313202467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00024283176641268995}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.0034433920977868515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003613572553244804}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.001825879960798421, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021705897236105626}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.07101385752649907, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002044467304290158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.13371197995222325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016174280494899666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.06611311172184264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014225513611251844}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.06148301912905695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020032593057346037}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.10281964626274133, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014198843683178699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.0544351778389216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013592853283299524}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.005224322194484081, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00021302968079536623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f0aa37647a129315a3722bfb594c4e563eb21264 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.25105219387361044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005170360898027872}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3963661339879864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.006166209959478081}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.2632442282160369, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004649492833926223}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.10324325384704888, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003441742210801506}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.16628550216394442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004378246536170781}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.10898871272052842, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0031637179796749015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.19788246648847183, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004276640363752767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.31861786603681824, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004930470540221916}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.2055562623719173, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036386336219648187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.2214424878171101, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004657212513512685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3486705785899057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005610709630554273}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.2313939327549787, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004163825617671645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 1.9519403405237012, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09494593162165268}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56bcca4ac7270c57f08a09e0ab456e8fc2256f87 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.09859772585809395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.013778817389007887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.03287343868832309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013442739257702343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.21251821310023286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027817450836553795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.04861449574020396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012163696170856104}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0059870925402472075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005674158222234774}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.02426279686355535, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00138986055302855}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.007309673323747182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000560409181489832}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.030474323323396148, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011252285187580645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.20517906724020168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002637720410420459}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.04587452202968607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010358216787208214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.02488075874372872, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011705975217800647}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.1551694009228435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023022331833182233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.03578934542908562, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001048259767960614}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5a5be8bc0ec1f6c43e8188457bbd5e0784e4083e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.8855730701366655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05402887660292465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.08570202503900552, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002251191682556817}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4561121562862774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005476485108348642}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.13091802273596945, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025720056432699647}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.03483952529081149, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012306669089095204}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.19122022968628333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004195715063382155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.052778304024511426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001476775655349492}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.07097502062202066, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001771784998246896}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.40452647581766443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004841486668253075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.10935795632748962, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018990796502685228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.07558585496214218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020894634297122035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.40177716016300163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005117291808345332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.1149114814090014, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023559804274623885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..be0f1ed1fd84e5db525d79402461ede8fe2b274c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.025938648577982906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010640148385704566}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.1489094513106922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003474067096470854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.04162588476887082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015012115710527434}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.006486298578918915, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005370453171441658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.028873117560228444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002234953017274618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.009979458438683987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007921199531174797}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.024766861616093486, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009501479562254536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.14572560070301618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033022912313721738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.03999367372035714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013595434744328805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.020507241432586306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009098722057173229}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.11707134925920056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030877854433825056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.03281665632758161, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013046364902866959}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.1350956828500859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.009865513305687733}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..21b976acd79f0ac3cae473b855587d9d3f87e979 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.24831089730550815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004780357546739232}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6901002304627992, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004128295202936247}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.3160936388762234, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0039777572345839046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.12446618183412662, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003184969383336601}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3533119961492384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004200483017562709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.15630585395712443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0028715875088255773}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1923432613835867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037676385944604143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5626163289646795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004213110071992057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.24689804124363615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031290004102835158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.2128993219354708, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00421846105175959}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5995327632502109, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004004484127096839}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.270802478850764, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035034467745711337}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.910751493059819, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08452357261047755}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bdcc45197b87fa58e4b154738581a6a6d2adeb76 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.01911538436920681, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003503314592826651}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.16632609123655862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022509999713146564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.032654498601309825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004894300326827564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.0006671575521923488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 4.2242073462787435e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.00956992595160404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006564343184013548}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0012146680449496444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 7.587623242998391e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.019101357116934347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00034860286261423904}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.1662080638295448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022483526136167662}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.032632732447854994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004884533202874028}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.014415704194013666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00021940484720304887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.13514571391109637, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019103084597602812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.02496623803547597, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003268151474812184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.005621231172467306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00014199233504074313}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ef1d8d5a3bb702c7d8ef895a9398bb9cd68a93d4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.1553138653033213, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035449641247749003}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6751958384437035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004465270161109714}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.21808043695289794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003102334578158025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.07201402035915092, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002186406300104381}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3367496211993743, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004320718638419122}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.09971882616743014, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00198054319313908}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.11983577128787204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002848039977052699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.551063468158563, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004502206105834914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.16879982896459372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002412131520433171}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.13727804285693102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00310050036995496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.610640770573686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004416190675238104}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.19380889018037997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002740435097718051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 1.959859057235728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06468986760192885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9bb153e98cc17127a971c623c366b5e1aa1ff259 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17369745237153775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019416720591579535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.31135751074053647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028278252703613604}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.20744890306323854, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001889783739010265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03684832178440491, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008000457616039376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06963013596903031, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in 
English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016025397023385504}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04442334363828912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000913157779834151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12131088040113613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012101669364236395}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2274075459665916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00217190346880656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14688583786669807, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012058252236626717}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.1604070323191151, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017878463609218534}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2888713576941007, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026566622649898735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19182985169202296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017460069670729864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.9776709548064586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07545402996122554}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5a19673fe3cee71e4aa62006b17f6943c3c450dc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.17640434289005402, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002005007718430865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.3066393157994191, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028800028706439237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.20809232424724627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019353389343055706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03974292535229771, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000857829234608627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07238796762006708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016527893153633355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04721416030214545, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009610927999083794}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.12624587067736417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013254243768189675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2277171673299629, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022655349530668477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1505788683968366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013106237942102476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.16305254160491786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001850053368836578}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.28490909590208735, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027167401690682915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19268916973951744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017991908347606052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.1658866052052077, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07121279156132994}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..80afbf40fe20d1e9beb0e6102afd9dda948824ea --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.07162326549438301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013415768556990903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.11684671121153994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020890726158097313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.08233778081684881, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014136328956612485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.007179946675972951, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003647094563527292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.013370485490546424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007936434590550039}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.008674743075720219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004461694446460864}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06378002446656099, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011167907708427777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.10585450994467742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018492026296821713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07382145485768168, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011967165146959564}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.06755619163537285, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012503644836163084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.11062864063321133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001963576254467577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.0778081106950157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013233841707317477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.43524332681177375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.019834397414917805}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd893ec0a125a9ea8a608be624e8c31fd007f6de --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.1307619769061793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020066017869496055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.21457261780345513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002890368978894931}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15052430073600576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020148125325675117}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.024730912344785617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008511600964021851}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.04229212817088445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001416549742011235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.02849747941134415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008756872410718326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.09765613730182125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014148261990747527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.16574054117385323, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022567330723406126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1134827324047906, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014198341617713674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.12175399642762218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018573395823604964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.20083948009008584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027101716504798054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14034629936326376, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018623589986031964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.7498608764506907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07906985947026686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..05c019e1e8bcffeca7cef296d1aaecf09a2176b1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.11935860401561957, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001966951798581278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.18047962059644787, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002288460298862089}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1298217936839039, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016486636351090917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.017209465779815906, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008676540226673849}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.02547483526259387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000987206757819632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.017695339797954485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006158660622155535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.09989513555905755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015716488614741981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.1549374771537138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018887319927939226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.10955506758387423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012658212991754696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.11068732307051626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018173163434014095}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.1683872149174189, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021320843992146134}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.12058299633791197, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001512713070449015}, {"task_name": 
"GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.6633659334049573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049734729687044475}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1d99082c9222193ca4e4e12f71103a2713121d9f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.14896561449083853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020221764972833405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2478826336868915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002751441323122002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1718968216119803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019220394498241642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.029109226932211917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008866459881706034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.04932178167592407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001497944199301407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03310418623679188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008807776755696351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.11131788375137605, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013990530230697672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.192588680149845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00217772926083577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1299888647577953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013387418656703245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.1384485375043415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018637262914521346}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.2314774588045184, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025719766768405997}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.15994475650377557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001770065686045605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.856535146505626, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06411488880916898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..126b6d944cf1c5090e1b132482817bcba96758a8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.14903430496900869, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019044306456448882}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25460834562037304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027808818609942765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17464450784994165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018866549663747307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02932852184049837, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007444651216824196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05378048278152194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015115343557708684}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03487366784140555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": 
"{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008538791169940822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11560989242228453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013437666268655546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20506193764704436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002299748596638709}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13717038261116743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001369756093492625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.13582138493978418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017239221521953576}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23341669122784228, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025774317116309797}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1593997663047623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017134556998076298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5192817590705794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06494006665273461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline 
at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..78f7dbdbe6217a1c0941c90386f707bf468bb829 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19528801849094218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002201427724682199}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3257340657226586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002895580827853013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22509002102944192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019838100403287333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04999822434377242, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010227131559085638}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08773922376052366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018594738182377334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05802448056432808, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001078013458633595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13922763000575006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014964090541992776}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.24015503219395376, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023374560954638485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.16185295875984437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013532637497389792}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18280815280692314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020740368495163132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.3054903541601766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027607171534094984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21073250157763135, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018643526489330914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.059381627710027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06573083880760217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c60a2d95a77693d8acdb7707e67c3324e8405d9e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11894228477581185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, 
please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001843199326087485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.19332711537732455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026663550761881214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1357386682107375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001850556213295791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01856149961078782, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006185592916887031}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.03332455653915248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012178147112721096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.021740420743345538, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006905494612438458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09623545787280265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013365120430167542}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.1604168534029034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020999550260342253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.11058953803875521, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013384320596615663}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11045263672416784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016912872299638876}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1807713263742707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002494631697666655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.12634196821598928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017082391083172198}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0396077897340674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0532741309814862}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..543f946f4bbb965c6281c619a6adfe537be500c6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.1139308599359662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017654070337474956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.18500397029775825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002627335573742763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.13062891767110701, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018203352596047476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01537236441625929, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006021940015586292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.026591893021603984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011115575787005103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.01792026914674207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006810574947177749}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08321252562851182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011456569094090048}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.1391534150194594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018825196882808638}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09616512002242138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001182198985439485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.10669664546680603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001644249540235529}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.17374598639039826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024459852240474736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.12243045922583431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016925844417807886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.9715621646319063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.057395077491468285}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8e310f455924f1f6303b109bd05abb233cca356e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..825327b16a28b4414bd5247a6623b47b29ae6073 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4aceafd6a8224b27b7787ee60b4dd780bc313fbb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.338, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224475}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b4bb4bb6caad2a8b59c479b28adf80c85996a8c9 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203934}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7c653b2f8135af1c24d8f161824964bbc088306c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.323, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01479492784334864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..770f8ef82ba37f70367979f999e96c4e241d4113 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0149550879186536}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c613ebb677fc258b861a4d593807e613f4394353 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.345, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055237}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.341, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a10c411135f7bfacd615d2291224a0fb1bb5d6e8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.354, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015129868238451773}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.347, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01506047203170662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ceb0636d0d4b1f7b3a3c661c20d12012e7a6e66e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732956}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01492201952373296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a48b44793d73816a954f90ab696c7e9213d2f90b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732961}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..597e27f2b33d795b6b1b9b61ef6eaf1f62aac59d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.346, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564438}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e3a36ac380c24a8c9cc595ad50c416004f2bfa --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0149550879186536}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d53986db435278c89d2bd45e69f639263c89f71c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.365, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01523177622626491}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..00446794d4498066cec90d5441edf02f9c082f2b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4180bc443436aa91ac670382f50ad4c6997e6d17 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.343, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356951}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.345, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c5d40ed3f13398b5bc037902e03ca805c643104e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.34, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac099a58d1e39a9059d68e6dbf9169d0cce19434 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932575}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..155d7008a25c81cd616a8fbc6e09b156fb7c95c4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795023}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01496596071022448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f92a8f5d76705504050f9484c70f90f42f0079c6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541033}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928369}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..204de9f958b52f4c2f24c6c3c969ce9b2c076990 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224473}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b707cbe7d86d9afddcbb7d48508a6957c879f2f4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932579}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.349, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015080663991563097}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8a471fb7a468ef7d0474ce1c9bea609eeedf901f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087971}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8337d129ac201882168282f89a56f2aecc1a9276 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.308, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01460648312734276}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.323, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01479492784334863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9d660b926fcd4adabe64b821615eb059d74a832f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456729}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01483050720454104}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1a54024c10656f16ad4fb173dee2d23affbbacbc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.339, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620339}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..342e963e9573d97f5257e186e294428a713a94f3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.317, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880217}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.314, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087973}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d53ca225f0aa4b072bbc1cb1bfc664172392d0ec --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.309, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014619600977206493}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.319, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473474}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e08e95cf7673e5aa6d419d26bb2abf1332feceb2 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014498627873361427}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b298776b28b3ca298a417d8b155ff7f9cc6389b4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620339}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015060472031706625}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..acccf2cf14d85f4ed37946739cab87e162333567 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01470919305605713}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..124c8dc3ecaf27aa498f38ef2fa2d47ab2ff9fac --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270333}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f607ec35872be84bedf2f6ac4a37eecec9fd36aa --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732961}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ba00cd676b24ac78d715d2aa55bfd22fcbfb9847 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.345, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055237}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.338, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224496}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e2b43f99473ef55dae9afe896d8b54c106d56bae --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90584437ac8723479354707e387ed37a7cc76d03 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095524}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.324, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ff2c2954b700faf030040d39355109639825eda3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.338, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224482}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203936}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1c82a2972d3724a7f8b26b54d2c7da3e549093d6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015060472031706615}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01485384248727033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8f5b0c9ab9fcbc56d30cc823c31e5eb610555036 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.313, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977885}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.313, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4a81a7749a191fec66a29af114662cce88352a99 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.302, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014526080235459541}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8b94f9a0f437a20aaa387f59c560ca34121992de --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01469663196079251}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7655772eaba9e93b992821b6b9a4e63286cf1b80 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33916666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013672343491681815}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3308333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013588208070708986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ca99af41d7eeea44f7415222f88f0d070c69b41 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529019}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5740a10c45f94ac63182957fde31ada5d84675a2 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013526454480351021}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.315, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013415009084004866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..568a2087fbd7807efda4a66d27ba136d96a0160d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225605}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013570806258433616}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40db3710cb0af948ab59356511383495d86d4dc0 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0136139500102256}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.35, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013774667009018556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bf469c0e030ebf717b5d3d22841ac97739e50f29 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529019}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01357953127780092}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ad51ce8cbcdb3bded60e6475f5410a627da8a3c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31416666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013405399314984103}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103245}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d04438aadec4e2e62c3353e952886ba5b486dacc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31333333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013395739415639082}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005136}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..77d7cd9913724e0e26e0f6cc5f99734f25049db3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932873}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01368049572576779}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c06b3c8cc02b545806d6e8e0d60a616f55b355b9 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013655897185463653}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932889}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..88ed4ffe0ce9440b117771108f1eba52c19950e1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3175, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01344353868134805}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.31416666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013405399314984107}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..17454d2e69be761451b1ef562b364bd835bb3370 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0136139500102256}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3458333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013736245342311012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d0d44752f2b73e973420cb215981a4d078bdc95a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013376268790982098}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013605417345710526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cff6e069de9b6bf1760a1a28d76321179271e77f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d8dc7be3c36ed5ae9d6ba0a2e436e9707cd737d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013471620929769144}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31583333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013424568830356448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..559f7a8787817cf89a1a726d0f217caa3301c2a3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3275, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251946}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3425, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013704669762934723}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3aee2397c2d7105ec17bd5238a666a9313553c4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01364760294240639}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa69bc1573654680cb317a66de275146839b31a6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070709002}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013605417345710528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c4db2fb93b041f58b970a01aee736aea946766 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0134529489969963}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013490095282989521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..30c60077ff8a950758ed62c319b4111280fb28b0 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710526}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529024}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6877a02bebf853ae0498c8c6d441c09eea99c405 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012506564839739434}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012506564839739434}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a57e7192aded0b18e9665b438e5e8e6e68835fb8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2354948805460751, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012399451855004741}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2354948805460751, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012399451855004741}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c922099eabc67a8bb101b41953c6031b9f0f364a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012653835621466646}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..94ad8d9850c1a7fc2b3ba7cfadb3175da9609497 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012521593295800118}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012521593295800118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce77d8d2c7ebf3eabee45cafc45f9d9b4d1f3f4b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01290255476231397}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29436860068259385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.013318528460539433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c07b72a2d6041fd983905670938e048e28fd145f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2721843003412969, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013006600406423709}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3003412969283277, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013395909309956993}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d684b810a9f654be492568f9d33700ce3f8d521 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012821930225112547}, {"task_name": "arc_easy", 
"prompt_name": "i_am_hesitating", "acc_norm": 0.29266211604095566, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013295916103619406}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e954fbc5b27fd001e240561b4de251a3cd898972 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587096}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7e9f8995023cfd8732bf10dd3f63f8f1d958dd2 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 
0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012383873560768678}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01286252317535133}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..11b0be8ba18b1e1f0cc940dbe0bf9d4e2d2862e1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2593856655290102, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012808273573927095}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012980954547659556}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f80ecacd8ac9a79b5d63bd49eba2b812ba7f18e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27303754266211605, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013019332762635743}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2773037542662116, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013082095839059374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..149e11a6d7d8b43bf8e04ee8db4196769e3765f1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313964}, {"task_name": "arc_easy", 
"prompt_name": "multiple_choice", "acc_norm": 0.28668941979522183, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013214986329274769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..263564fae574bc610e31d485782d81499c1ece50 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012430399829260842}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012430399829260842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6ae033b22cbb332180a2b4538f43a86363841f34 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012352507042617407}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012352507042617407}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fcff9819a3bf4d65eaf056dfcb18ceda419a0833 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012610352663292673}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012610352663292673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a29536248bb39facb01fb0bf50f3c44665dcd01 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012506564839739429}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012506564839739429}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a7dd939510588037c813df0a09ffb93a3d1ad7fb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926428}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2858361774744027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013203196088537364}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f0d9758e8459eb9cdc09e9c8fe3e8126fe6f97 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2773037542662116, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013082095839059374}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3054607508532423, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01346008047800249}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..103e11f39a2c6d0219376039aa656a1b12a2847b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2593856655290102, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012808273573927087}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28924914675767915, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013250012579393443}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0215ab567e9204eb2dcd4d2dd9f1f34dfbaa7e3c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042968}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28668941979522183, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01321498632927475}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d93df88c47b4e57b5d884b7c1f5982530a33b04 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008850055161459239}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is 
the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008850055161459239}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8663072b05191a2122b37b9c5ee0bc4832914398 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2398989898989899, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008762298774190588}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2398989898989899, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008762298774190588}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..29d66dec8de413cab72b44672828fff98bbc0a4e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, 
"comment": "", "acc_stderr": 0.008809171744720559}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008809171744720559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a0ca08a8ece86aabc426cb8517e9159c5e46b36 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2382154882154882, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008741163824469184}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2382154882154882, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008741163824469184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ba225d8a90606e5509f0196836a7cf0dda2a9216 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3501683501683502, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009788295410093158}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3228114478114478, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009593950220366743}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5c6c29c271cf9a9890b28a34117d21066a4439d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3463804713804714, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009763542075695724}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3202861952861953, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00957415266873942}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..17f2b4b438905c015bc15f8c8adfadd2ce4f3944 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.33291245791245794, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00966995897839532}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3114478114478115, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009502311567905534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bad46dc640066041899c703497f95742ed9ce9ab --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2887205387205387, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009298805565435511}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2828282828282828, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is 
correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009241472775328231}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eb3fde6a21312f1fc74ca1c8778a9e41d3ea8e10 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.32996632996632996, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00964831157424104}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3148148148148148, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009530150430975607}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ff3b99c8dca6807c1f7b40c65465ac39b409f73 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3282828282828283, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is 
correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009635749509262161}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.32365319865319864, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009600478182273768}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..64ea9a6d2f6a4edf404dcabd37d0522af4436f05 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.32365319865319864, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009600478182273775}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.32196969696969696, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009587386696300396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..e72b020a33771906dfcd271a36194f69f8031205 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2398989898989899, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008762298774190588}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2398989898989899, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008762298774190588}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0edcefc017e964e8f8d5f83ea17ef5cfa1f9337e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24074074074074073, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008772796145221902}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24074074074074073, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008772796145221902}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..531cf107678a56cf1e7dd5c6c927a0b7d2f54b7d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008798836444222033}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008798836444222033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fc4d242aaedca2cee5537f447366ebfaf04c854d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23653198653198654, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008719840797175745}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23653198653198654, "fixed_answer_choice_list": 
null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008719840797175745}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae61c1027e45646a19fb4956bc417a0f695e76a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3186026936026936, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00956077550767337}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29335016835016836, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00934250833170855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4b6f885bc8f6a24061a10267b2438a3cf2ea3f20 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30387205387205385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009437524848293738}, {"task_name": "arc_easy", "prompt_name": 
"qa_options", "acc_norm": 0.29545454545454547, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009361987126556448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e07eb9080dd46e4c48815ac8c161a830775f3134 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3021885521885522, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009422719042483192}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2908249158249158, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009318815921176657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..541bcec10d2a8c5fa63234902081e35303da6005 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2967171717171717, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00937355949298684}, {"task_name": "arc_easy", "prompt_name": 
"qa_options", "acc_norm": 0.281986531986532, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009233124071053636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..51cb59b6f78d7d825cd2480ed55930ef02a62de6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5653333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009051951785603835}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6336666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008797928274394058}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa2a753bc9c76ad16450ed6a3c3f66c3571cf1e3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5633333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009056690207178125}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", 
"acc_norm": 0.628, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008825982437590614}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..531546c5082b472cb4af9fac4cf29b5dbeddaa61 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5756666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009025076316539062}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6276666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008827592133099669}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fee172d7f8d9cf20235a929efcbab04dce2a00dc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5776666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009019409415904175}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.631, 
"fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008811292732995702}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..52ccf6c7a1ce28be948d65b29ad75a4a7b5e259a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6056666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008924016166504413}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.37833333333333335, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855801251873014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3b84e73fe3719630863c1c253d9cb8e95d2048f4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fcec2229983f9fb5dd0439aaf16b950a3dce8096 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5953333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008962735560535853}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5943333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008966262991425925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e52091b0879c4525f795a73fe643917ee209e4b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.612, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0088982241372984}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6063333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008921375326707089}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8022964c6ffc778269450ca92a384c93f1942556 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6223333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00885273830576469}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.566, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009050339010891723}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..659be441869629cc0d5db4466fde1d88ee5f452d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.541, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099483512819305}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c9da79e06ce2f30d8454b5d86afc5a94d631bd5d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5343333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009108680663441197}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5213333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00912191679884186}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fc12dba92c6ef7cd7aec777e0ffc8712506c267 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.613, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008894007408882734}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.507, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129336317272385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..72b71904d886f829ab8d2523cf6d9f1da2fc0d65 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5486666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009086879312708495}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.542, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009097962646004983}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d1145aef64d1ee39955f4a860fa61352356af20e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.584, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009000463105420334}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.561, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009062029213030572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f18b9f8b77a19acfe36d36da384ff12237d0037a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.589, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00898442578218232}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.574, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009029683183412069}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..68bd488abc07694ad61c36b5453988933a1948bb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.62, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008863380835773167}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db75dc7335d0451bf37a7774b3dd0e658be151f3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5403333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00910047692710895}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..50189b249d52f347de5fa4192dde840f7aa66098 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5883333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008986619341172333}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5933333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008969751860881003}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d80be46515d280f97c45fcaf5bc7aa9fa440ab46 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1818181818181818, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3fab01d6dddb5fdfad071f9a9bf9715c27cca52f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.3055555555555555, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..48f39f367f65b234f07f3fbbe13d148b5d5e59bf --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0663363415035954}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.29170113041080786, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9c3c172cc94c66230def49bbec9e7377a2f3dddc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.32142857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0629736228905634}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.21804611488737838, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dde209abcb9c5647f454d50901662d716b16b600 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.19555555555555557, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..385fea2b96590a54f56626b9aec198f86822b315 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2964646464646465, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..164736eb7ab71926c543f52ee7f7e727389a8712 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3471345029239766, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e807cde0fb119cc270a223ac55db7f6d0e30277 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813059}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3018867924528302, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bd0668d86fb7b452bb609e2502137eb1a2074e9b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.34629629629629627, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..444f1845641a4d313aadbe9e73d9b7d837cf4837 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4fafafb2f01c771cea280226bc1f351e2c2f33d6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3227672955974843, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c4a8a5817a9e4fd9dac0a0a3ba618fa06d256f71 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809218}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2505963590709354, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e960113737ad44b213184d538e2b2a8e7fdc207 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.25, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.058387420812114225}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.20170940170940174, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a4aaeb8cbac1b1b63e2b36506b3b279d838c525a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..29119f1f022a7f1f69cc5d3dafd314ec957c2c21 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.28758169934640526, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..04bd8fcdff5c641f714a91861223eec3a1126947 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.25836477987421386, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..48bb27c236b5699f5e08b018d11d902093af49f1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.37777777777777777, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c382064d33ad53403f444a117f5f1c5b8e00f30 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2964646464646465, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..26a289e00d647604086f50128801e843bbd3f798 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3086702262903636, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1bf5d2938e38cf480cd3b4006ae600cf560bf488 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2988943957300801, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..559cd9632524f66af86d8ac2c6de27943c0ba881 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dbb1918d4ff24078604f467b0c6372f17f9cfd7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": 
"8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2a3393ad940dda0776d2a347a1faa76be4462d58 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..6e4cdfff8906433643007a8ea20f8b9b30d04901 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eec39985b7c82db970b4dd0cb879bd9ae98cd8b0 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ade47ec03f20d9e266eb44cf8d7f933599f2e3b8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..17ef50d10f4bf4948b3daa308a1d28faa2201a77 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..351fc1aeeabcd2e4841c81005b77123be5fb288e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..137eb22a08c7a888a2fcc52ff2f12f324fb6495d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- 
{{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d9c6059bc07b1d71388fe68077d06b24ad9e016f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..52fb98ddc9fbad22095b9ae067dd6ba819856dbf --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5f460a5f7ac582032e20f1d23d04324ce4b16cae --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5db7753e4cec35279cc7048d8464d47ba60ac969 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2a18e04b577a3b4ef0bbaafbf2b5cb813670d439 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f74b1047db3e4130e3a8117f5790db60ee7cbc60 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ab9dfa5df1fa49148bc8b55e034b805ebe4c7f05 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ab656a32f0b67ca6b3d1c8de8bbc22ce543b6f2e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e571cd7a8b521c5066fdf856e098019ba4590c0c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a0e6b22ce324187e770f31195d7c92d779d8a84a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..53e42777f6c7fa53102d96a4d3d8838797aa95f5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56e6ba5a1ec16594a5b3a2c559f2a79b11f9b2c6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd1fe03d2577785c10d7c176635871165b9f3533 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dc3b3126c5a289052e89d0a4b4e0692359ae12fb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e2479e0408be9e49a17cda8c5763ec8220cf3df5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed49221a5d46d7c30c7086358555f8af2c224e51 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..960e21caaf12411d5553669e2e2ee9b5cb387a6d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 0.024902886709539385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.001859827390010786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.030688350761161772, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010111628349295336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.03522159436879318, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007432341163294273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.03002411449589292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005566731529837383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.00213834226369479, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rouge2_precision_stderr": 0.0007515435096961347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.0011739194410139697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00018856260255220028}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.000820851337990567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00014510778108128557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.02989418896176067, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009996326577278941}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.03396980123152662, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006930467792590991}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.02908307755018932, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005238680979850407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.030194649653587414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010048849542946053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.03435223902229173, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007062613786269745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.02941389349627349, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005364047320293597}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8da55eb256180a2f743c65a0803df6c11b9ef72c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 0.10526165569419166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.014436035997175528}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.07911450069828341, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029097835615642907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.06738993210750575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012035687077170335}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.05790598949403749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010250743934079594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", 
"rouge2_precision": 0.01429960185912336, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015929231677413393}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.008672646269225282, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00042018139591029456}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.007254357472661951, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00035056222442385245}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.07079248030581328, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026369733712376764}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.06044596402655965, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010577536507608413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.05171508838489101, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008789909961010179}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.07162010937536421, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026623851377461905}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rougeLsum_recall": 0.06136184462960388, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010727989981957834}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.052508873461168226, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008937737880985334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3178f9f2b931d826ef28de04507baa8a1b970a90 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 0.30539641277266166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.011391767070946923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.07633505189299611, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009839415501958642}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.1364351378477736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013697596740544205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.09437252246992375, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009227636769121052}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.012720631413562505, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00034491119917086203}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.023922584715591402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006622677812103944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.016087743931264227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00042298521406965243}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.06444618171708413, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008733244510681126}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.11522495932109246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011523473615329262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.0795119996992231, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007622335976376695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.06758563678521023, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": 
null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009001740406512067}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.12020034868975105, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001178439476775103}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.08329416606334707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007986225124758468}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..25353f65c10ed88b48403addbda2d87b17b399fb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 0.8289729029926227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03819124682808821}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.0920126739306129, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001180610214294338}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.18394597416501404, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0019089777737464745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.12027784856141198, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001398336070917741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.010230404488421895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005619899673032154}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.019302317278520317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010317275213485766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.013068517538036242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007056166127861284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.074950558069943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009851928299547022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.15108041046374748, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016839786264005012}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.09820775827857402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001187279171052657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.08168257075261778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000990831090160885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.16469012605064637, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016151196905014934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.10706801145048024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011738963266738863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3d69243cebe9d7b8dcecb3e255789fbc1c4ce4f8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 5.871602035182637, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0689265094519103}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.315381910546664, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016991270157383524}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5677658862678204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002537463753002365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.39687295629000013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001731150333313827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13054327509920194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011487605105947785}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.24249304875101457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021455306919602565}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.16573259273166063, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013731741464944188}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.21023197351207698, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011813742144540786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3852625280422845, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023003532851786253}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26601847609457513, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013291072676722162}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.26030259129820893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015643235606867766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4687183064058039, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00245885193806218}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.32751173027045133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016658340639491723}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6095ea85b3925f698310ad89c83a4f52a71097ea --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.536088149934491, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06368060447210919}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.3132683994986297, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016779315277452248}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5703289435095071, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025564238054670923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.39605202887518287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017242982181801837}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.13683093815324895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011747506318268393}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.25773200513287264, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022557263550937465}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.1746381612318158, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001413301666897888}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.2150826473861559, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012099040939194908}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.39825037303644484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023757642907636734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.2733679239070397, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001374220180286804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2618159333058282, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015495574948136697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.47735870931030694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025148308302801285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.331089401046371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016665130097241836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b0e584c19f2ea4333cd27ecf18d4c51adf6ea7d5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", 
"prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.012666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002042089808097441}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.0005112039433192838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 8.466574166758451e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.000980746025187589, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00016211133855868694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"generate_gramatically_correct_text", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.012666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002042089808097441}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.0005112039433192838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 8.466574166758451e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.000980746025187589, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : 
{{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00016211133855868694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.012666666666666666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002042089808097441}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.0005112039433192838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 8.466574166758451e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.000980746025187589, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00016211133855868694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4d527702cab4d02e12e2810a4d4b4f14ede18a6d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.70530637243207, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15229080097327555}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.1372020348331753, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00462234189751826}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.1273232990519524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00418720214271013}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.10350002350150173, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032360935018220568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.03517252247207065, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013354896708446466}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.05185401724384904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001963532037641408}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.04015090640873629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014719873423558562}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.10800866370039118, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004210703866730323}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.08625864193958034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00288629903926175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.07039375693186166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021877703117276575}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.12236274969411427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004403741172008285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.10565592747607992, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003502725730071462}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.08642605279408319, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027110760348653747}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1fbf4b0ef78dcc4b7926c71d80313df7480b7a76 --- /dev/null +++ 
b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 5.431024960602319, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2000472576829238}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.1861296338649477, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0043744882526644034}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.22029441022547872, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004928253663307522}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.17737847294451, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003820008304012659}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.06437721898797501, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016094196221665435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0955303581448959, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024590646449618413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.07440738035625322, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018424656344762078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.13782618332035232, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037520766631794914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.1523944911740286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034953930296791947}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.12250849478802743, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002662162350752231}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.1600822254903216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0040372964538571325}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.18302982963140504, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0041611517396543864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.14761442818242548, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032202549154167377}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json 
b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a21a120bcf58652275648bd74a67324b1644e441 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.14846670480161736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.013127750204704683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.26299103271373675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006382723076774683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.05264625320052111, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014565379578075452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.07826511485489751, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019164435017392188}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1576462193644736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005855563265723578}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.01645135749156905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007166454954847944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.026122584651124606, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00101915256853543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2524687494414541, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006192701638587216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.04953446876968493, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012966364411702697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.0738166454356822, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017248337500189589}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.25330917167188566, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006207826569406349}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.04925315584474901, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012892531673512504}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.07392255592729936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00174327269105063}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cce0806d2f5f8a6d76b03f6efdc5540131666e49 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 9.167405083071978, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09131785485896106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.4027750555180633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022654704081963494}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46782170590516187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029091530201079247}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4160674099572731, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020114833441626043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.16815397557349618, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016540332686745738}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19988678530914838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021644647347172754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17514483800816646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016790153644324725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.27883038558148265, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001734453659792866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3267350269933985, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023849450074570734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.28884448468062934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016305979819716226}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.33427994949554696, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00212000114598495}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3882701548838279, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026955155305927177}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3452818942622058, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019526544280227683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cf01a2c37ca80c91c4832ec2b4975fb32b83725e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.796301153911008, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.146708341661776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.41448086950190893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002308739240960833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47403110687392724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002885238583477618}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4253927318807347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020174640710145207}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.18273899527170673, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001736753975334171}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21447823413610112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022428818903169902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.18933491861931817, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001739001650440241}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2947058978443582, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018691331443875447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3390095486229893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024297003361930447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3029238715575528, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001712217734742568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.34519699878769394, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021888771482807224}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3948779703096386, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002705400716089457}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.35424136247162186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019838710065518987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..525b44c1fa85b72fc5c28c796913232a21c32014 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 1.5394066408010387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0484052485543162}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.08175423335349441, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018392810471837139}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.20081999461253017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00439749257662734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.1145750851342104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002534080030689994}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.03295322832313943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007898704120356384}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.08455406384327209, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020235178180524603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.046679692708971626, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011061130467268918}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.05815283722861802, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001320004858940561}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.1445691822798896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032622309703185166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.08171290144166414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001829868972362332}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.06823005737355141, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001559301582529244}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.16771851909582491, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003735794333743973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.09564293647544783, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002149947772773811}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9fe2a56c353cada7a2360791d486afa7eaf9fe74 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 7.053373401330846, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07996544807817436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.40161297429569254, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020446945435097607}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.624047917208595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023085024322038396}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4772245380978214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017798625812432496}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.18443805368794422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001382599111890123}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.294010691187684, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021124144674316923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.22068920317525503, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001470678209160857}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.28104449885694216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014226979752300062}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.4460789661395535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002361021804488117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.33641422719087777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014270703519470888}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.33024724391235083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018849037688533563}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.5138393997335251, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00235509008538393}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3925320856681083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017735001162616428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..439b42bdfca65a1f7b5c6efeedae7ef83293f01f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 7.164537116124885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.054507166163951015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.38094779000061807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002099999503143787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.602346894502485, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025758615927279917}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.45619650620795643, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019676479058125347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.17573474931980426, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": 
null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014244066880121941}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.28548391121662253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022557879734491403}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.21206289529597055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015695849144458885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2696292732553264, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014925011735967284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.43415653380870267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00242922591565751}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3249494154166397, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015536724178672875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.3165678865577877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019256981310923358}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.5013663660112442, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002552156661977458}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.37931197366758207, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019111586108465242}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a80ca23cbc6d3192a3f239bddd60d9aa13f885fd --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.09364826859257304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016148133314340363}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.23630822110313882, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038835580096539166}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.13252809822797437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022104399017904954}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01344629279224066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006749467817958984}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.035406016690730915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001796407213787606}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", 
"rouge2_fmeasure": 0.01926089676236191, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009630205657333692}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07694102339755571, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011850177068464373}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.19522320409536523, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00290859788654431}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.10902400441153387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016213703660804158}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.07494465354319416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013254739678396428}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.1907022378721153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003277425301621285}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1063051390341436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018302500166387287}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7459765558871847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06660040271380883}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..75a32e3d6c8869839b2a87bb07b9b3f1608d9810 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.1100394961313742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016618822859456213}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2706920375533275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003706425918534696}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15421893513473225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002184640762782}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.012049819428811073, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006058660485736242}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.029943656724121994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001488692683861016}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.016984977462351773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0008440833023618337}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07738480691068281, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001120493205359145}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.19125268196473866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024475701699631387}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.10845297039939822, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014181692997681172}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08829821326051698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001339504713458616}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.21836071248948102, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003018448833279907}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12383067773003087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017407939439714815}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.567617586363226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03557588297526116}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, 
"batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2f63697f17fe81263d308af4c7b9f55de433e89 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.12266620130499661, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001845349320673143}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.30342243290136806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004372631345252368}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.17259522097991983, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025062111927192676}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.023355918902647873, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009196790050941139}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.06023586351705483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00245666795200196}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.03324496960884671, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001307280464130723}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.09357054736674877, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", 
"prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013572221631896278}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.23269687986034876, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003341303513413307}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.13181341351110437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001855127448763809}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.09728766869371021, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014913277984300816}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.24235701247120034, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003705415152960698}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.13711574191352463, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002049753398864549}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.249916166884964, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07484579982708357}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..53a438bd047f7b0cdb4088956bef940633442817 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.16111094810574414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031216354350091023}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.31873549566951037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004234939673038656}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20228388865404873, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029050667038010834}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.04233924505198599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019046448700156524}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0812403134968317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00273838697975092}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05167374363553887, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018551837511418816}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.1289972501536601, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00256911680281591}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.25811719223888474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034826602134800538}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.16252418590968234, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023298980440963524}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.12625677747003347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00264701935811548}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2516551815287747, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037032430110956454}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15869131179009913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024607270521408895}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.9335794206127328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05641229737061542}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..02d0fb49cab668ec74bb9ad9cd045dcb43da7844 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.1391935772840409, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00185790613309646}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.34411662043430347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004457125721936953}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19574368508124906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025064123942186048}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.034069340066578226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010923839167265616}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0883466846688517, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029190316207930176}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04850165996745252, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015420780039749062}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11239213670673616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001467031544921221}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.27953512474588155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003691970095518396}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.158271842003029, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001997766812063051}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.10893210361976352, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015446349102827366}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2719037060557822, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003926408821594589}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15356091204478078, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002118965655419107}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.8666040036646128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06772362697405075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..41d1a7ff1b6112b2cbc7958be09cd5e1b8cfacd2 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14553793099391538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018785878017142804}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3543654560324437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004391158800565359}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.203563620733602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002510639667265985}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.038371943502079627, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011745762777744093}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.09841236909288589, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003073775666446705}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0545069734074903, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016532341451475344}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11617466261457347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014951292876035514}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.28481399699460613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036998611942237183}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1627195580247563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002021374913395388}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11452597251830655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016212346273290488}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.28106149008733095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0039736918211797}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16046012151104688, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002197436651210038}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.12135549546371, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06496091187932158}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..71c7167044d53a04b7adeba38c4d4a25fea80fbe --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.15936520803829934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002502914065388867}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.36698072640704477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004683622882613499}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2162684407733824, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028487305378065864}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.044026641927011036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016092489576812487}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.10240085620355213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032524404949168598}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.059415288606804666, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019237063507349787}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1250497631451513, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002063727102434744}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2883013244617606, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", 
"prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003758676199438092}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16944484885441244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022672242410205668}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.12672592447230924, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021949367251175547}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2924636147885988, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004172229113513385}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17181025266219813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024925143856936887}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.4667492251754903, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10381088859974377}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..777eae06dd179b36b6caa9d9769188eeb8b58e26 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13872518272148576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019196010106841707}, {"task_name": "gem_xsum", 
"prompt_name": "article_DOC_summary", "rouge1_recall": 0.34230252867072486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004465857111154106}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19511361094013366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025883144750141985}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03163153944854582, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010782973297168107}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08173836317054556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028348136277117873}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04506092989376365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015273324009655352}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10692232891845452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001417359741210213}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26668023859416595, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035247154238071715}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15075736335997478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019327841457787809}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1091361614782161, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015647258593366256}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.27137465028647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003816794352052926}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15379184945424418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021360349555465917}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.758172461446717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07910003790624576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d52caafd167a4fdf18cc99ebfa12844e9cdd2e23 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13980890149209954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018977428372584247}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3402939621206596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00439029308436291}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19587821805854708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025521664288773132}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03376773335297692, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011277676246399322}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0855882084980882, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002933007569998225}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04784619134007039, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015903598069214155}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1126666102037684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014774766678267424}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2759357644277514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035546429323666893}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15806290558333977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001998316404733819}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10915831064990977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015695216746497264}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2680641977382496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003807444122985714}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15327775753143216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002141837328174842}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.929152474522292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06152527877922885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..752e4137932702866cd77af93a2ade8ae2b32093 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1546986054170722, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027155969874516615}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.34385746004492035, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0045343512683236305}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20512716732365058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002811760618953105}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.04048674160408378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018097291671313742}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08889402148710406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030181836014503615}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_DOC", "rouge2_fmeasure": 0.05233186334644149, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018371483058294063}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1195641111418401, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022640684013532537}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.26542116077776273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036521538304345952}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15803771397406507, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002246566183671595}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.12369530288790914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023542427491872885}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.27570867031120994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003999061615231806}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.1638620451208335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024263948798584254}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.0700685384125195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06506482367590657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4b84607dbab12e1ebd750975fcecdaf8f3d6d43a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1497360041541259, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019583143857350815}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.36642339054764944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004583540722534914}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.2101464481816641, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026373715286000547}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03473453491991313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011313039238557406}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.08859197419170174, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002966411473841889}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.049316100613614085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016010378254702274}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11131301420898317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014579758214458545}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2744297041783303, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035891934973145126}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15649367452361052, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001985780262960137}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11850262725207555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001642034902769166}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2925517605104228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004056463253514793}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.1667089554989451, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022501688088607166}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9587846581999537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09170521457372413}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1850dfdb4d2d3f8c8b80299f74debbb5e0d24eaf --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.154790186630409, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018694567281612956}, {"task_name": 
"gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.374668382001663, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004418688017830266}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.21636269162646565, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025004031143732804}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03883770533881618, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001182146819894263}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.09827865909009756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030936550153937525}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05494913652343897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001667424386002709}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11817351796015063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001428773921986939}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2879474437688573, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035604161567386007}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1654386218767027, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001935585571099022}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.12201516306191562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} 
Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015836560252758835}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.29808537515150224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004003333302645505}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17098539953460037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002170984636765844}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.2345054070007717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05615802080529323}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..331d2eb58ebfb5127970cc5ad62d27b677d262b8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.16292807064215686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024194702503072605}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.37125899666294615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004494682114366064}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.22006038313971912, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027315907690086526}, {"task_name": 
"gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.04460384667729313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001568216125121224}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.10384107175312106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031301013911463464}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0602200602063056, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018748349329553756}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.12676536773949365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020350190574563182}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2897271196200456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037239805309158074}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.17107849234270603, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002234222280629345}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.12930934118664014, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002140548472143606}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2966631408145694, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004112653472647035}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.17475995659766042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002424194865649916}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.4860410253783094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13032698389659206}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1fa31fc18f52f68a4d898d9d4c0fce646a6bead2 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1377737697205421, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001918577597610342}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.33444844365458937, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044702777649980804}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.19281203992284088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025879159231049723}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.030357367199789848, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010605636257399766}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 
0.07732093515185194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002809130137054114}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04304657136147545, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015026722425806724}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10751173681134886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014476076962821913}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.26339792038733567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035594110181837476}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15078520597524717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019701846993564385}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10716847669626124, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015343163798387418}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.26280225666402524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003792042588585284}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.15041271573959936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002105741194516134}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.709372091512763, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07003089206006699}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a3bb6f34b95d804f5c0a6a56c9c4d9fc29e7072e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.13594404907364932, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018873868032456109}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3217165994125383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004330682877525108}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.188822793597779, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025303316596448017}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.032957187238861776, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011262880908098065}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08138888326812603, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002864272559833109}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.046347749632318976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015815570739294248}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.11204662329061932, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015323186320801464}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2665767372342556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003639126727666291}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1558262462818579, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020680137397783953}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.10385634177850699, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015435899459486468}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.24825254653449977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003728036269680926}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14465796916720383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00210712558351804}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.821098322958732, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06452240599200931}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..62906606cad97cb8e6d5f4fb785cf6808a74b3a7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 35.61734472522983, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.9537810245026835}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.464684845944155, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005858144057381323}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7271378863199152, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006504954123944544}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.543575916103388, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006104898483973868}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.3702105052685584, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.006265422305938615}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rouge2_recall": 0.5873452845007291, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007693229199172635}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.4356172150035944, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00674569689069812}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.45328759213308434, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.005906582270824208}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7107375872624206, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006690670049916566}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.5306285880503983, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0061926362174269355}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.4577703365666644, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.005904827825133563}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7163396172728412, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a 
correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0066419818950900775}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.5354876397976078, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006176130553320618}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9070648cdcdbe02e30e07cd9b49cf877dc5443 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 14.876946234243762, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.6789391344335735}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.32711934744770815, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006006032300444084}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6971853152299208, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006708558994798512}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.39613005227745696, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it 
to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.00619071042023707}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.25040069744375015, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.005867687424247696}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5445186159492055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008140522499998817}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.30633527666094623, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00629102968697341}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.3179138525456248, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006008468626549582}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6816707364901432, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006948228129049167}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.3858027770673977, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006240254684879286}, {"task_name": "piqa", "prompt_name": "Correct 
the solution", "rougeLsum_precision": 0.32067669493513273, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006020699147284835}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.683075266312059, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006908323840034804}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.388545252477902, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0062447020220877105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40dd1ef90fbaac32de1018727f2dd266e01d6eb9 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- 
{{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9c89377b06995549edeb0c9d249504569f6cb031 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..69548325804c7939f91c18ad514b818c45587fe5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5103373231773667, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663330673075898}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5103373231773667, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663330673075898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ba1092181236ecca9df8db93b8bc2a45915c9489 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5195865070729053, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011656869979288458}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5195865070729053, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011656869979288458}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..54332bd041bb7c831a957119a44b1b1f8cb477c8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.14382150833937613, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.010529063400117741}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018356416472001407, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005521354880273494}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19546239190947554, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0038247859111534004}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03157680503247346, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008291906514774739}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0025044688702336895, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00013773434188164336}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03017313707041212, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0016830101887106172}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004379612267179601, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 
0.00023123994933486116}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01690631599627158, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.000459560056479906}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.18505711146460074, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003610768847019555}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029248358492157706, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007131377793194621}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.014654036427626792, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004353374039170972}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16485045526174213, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0033835169417924032}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025299871642617325, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006568016014138125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8386af3cf28245cd56340d3d107fe410b6041b0e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"piqa", "prompt_name": "no prompt needed", "bleu": 0.1648609749888846, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.01622044569980307}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01866628841872322, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0007473574991847542}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19449333311979644, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003899061501410445}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.030873993968869175, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008327468572271476}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0029967448137577595, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00026220893484564474}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03260572456531526, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0017463997230649506}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.004837227558655057, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00028908858497832816}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01780151075704589, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0007098586143396394}, {"task_name": "piqa", "prompt_name": 
"no prompt needed", "rougeL_recall": 0.18745438746572335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037410036769154925}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.029446893043948482, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007622138034132659}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015331028080827962, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.000691747836518428}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16437664916642197, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003422791511095359}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025145545929009763, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007026811345719036}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a70d69b509530bc3b216c5fb5b4461ea7f44a7a0 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49455930359085964, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", 
"prompt_original_task": true, "comment": "", "acc_stderr": 0.011665133500637059}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49455930359085964, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665133500637059}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f67d7fda1f58543bfce7e3ba1ec2705a31aea0aa --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4929270946681175, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664656918145945}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4929270946681175, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664656918145945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a3fdab3cbdc9de1fdc0b11c28df0e74fe43a0ec9 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.49836779107725787, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665762007194876}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49836779107725787, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665762007194876}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1a2b3b574d1e20ab06ad3ff236be4da39a9c4ed --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4923830250272035, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664470424044986}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4923830250272035, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664470424044986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc84cfe25fafea1959ac20bf318ce021c5aaae5f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5685527747551686, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555657298864612}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5527747551686616, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011600659443292933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b1822432b4a6e274b6f8b05167ff64f1fc431c74 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5767138193688792, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01152769947361448}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5696409140369967, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011552114834700509}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": 
"cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fc5b40d7c4ea1274d7579b8fa6dbcbfebe60c7fd --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5642002176278563, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011569259195486613}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5538628944504896, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011597936590301233}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5fe007d61e13ddf0bd53c5189635fc6bd4daa2c3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5505984766050055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011605936624156083}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5484221980413493, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011610989358814284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c5021187b3a9c6a524d0ee0226c556db736ebe1c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.471, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015792669451628896}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.452, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015746235865880677}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1fe82a512a385ef224a9b990fd242582ed1afed --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015499685165842592}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015356947477797575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.json 
b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a03354896aed6cacb1a37c1d5100565103a3e8bc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.364, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015222868840522019}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.355, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015139491543780532}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..041ced6385bd74b998fdfd957f63545ad1c4cc51 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.334, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01492201952373296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..79b214026ad324e729f69233929aaf1dfd48fecf --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_0.json @@ -0,0 
+1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.808, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012461592646659969}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.743, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013825416526895055}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0f368b817153ac8624c71c2517f605860e907963 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.877, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01039129342184988}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.841, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0115694793682713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4d5f383bcf402b394e48ddb622268274f5edbe --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.918, 
"fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008680515615523722}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.903, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009363689373248094}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05a6b23ed0bbd219e3af7dad85f23f08f7d90343 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.924, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00838416926679638}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008916866630745887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b99d51dd0293d345c4fc0b18d469f3578576788d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.399, "fixed_answer_choice_list": null, 
"dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015493193313162906}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.378, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015341165254026647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e05ed83ec95499b7f3feb3afce789a4516997594 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.34, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014987482264363933}, {"task_name": "sciq", "prompt_name": "Multiple Choice 
(Closed Book)", "acc_norm": 0.353, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015120172605483694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b52ceda79e7966a860440fa4a2912a074782c98 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.341, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01499813134840271}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.332, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 
0.014899597242811487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4a4032526a46717b53a4728f1ab7eb71fb1565fc --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.33, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01487687202745673}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.327, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014842213153411247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2b876bfff2702c5dea92555fb1d819ca838622e5 --- /dev/null +++ 
b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01581613575277321}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015790799515836763}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a0620335e4c910cf36e1eb5404bf75c831b4672 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.412, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 
1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015572363292015093}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.401, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015506109745498329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d91ac588e0d9e9a9c923acc927eba1761b70c79f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.425, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015640320317040105}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.416, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, 
"subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015594460144140605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b953f026849f4ef0299777e5450dbc7c29f133a2 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.436, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015689173023144067}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.443, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ 
answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015716169953204105}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e4820fc917cc5e8810f39ff4fc6eccf289eb2930 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.597, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01551875741906653}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.515, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015812179641814902}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3a136bd7c0024e73d427c25dbcdd23e179980c8c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.461, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015771104201283186}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.411, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015566673418599276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..21250d7438af2ef5b01c4bf8932f63bb2b369a64 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.427, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n 
[2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015649789644462217}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015615500115072957}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9130da71184970503ecdcee57342f0fdac13a5b8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.485, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0158121796418149}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.464, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", 
"prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015778243024904586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c91ea79192885372ff959e455e45bbb449684998 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.49812934259754144, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562351329083268}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5077498663816141, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561043278863545}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e0d2aad6087a3af94ffdb93bb338a8d261bf64b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011534056494505862}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.49545697487974344, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561954965856516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..170e481f58b0b4228ae0bedcca910e0c3d37d7e4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.45430251202565475, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011514040245583501}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539022035111228}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0d45d7b54e572819135a6cd8bede5db0d0d5ca8f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01152471548624066}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4655264564404062, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534917341355139}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ab094ed87f8158078e1c18ee93e7f9edeec1727b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.5280598610368786, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544210396951669}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5312667022982362, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539803085637727}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..606b3aa261330eaa0750c8007c50293125a73119 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.5066809192944949, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story 
:\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561400034509398}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5141635489043292, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557792331301676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4b3327f638756f335ce0babea5170e071f564b4f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47033671833244256, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011542066509767008}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4853019775521112, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557435464292914}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, 
"device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8179ef95f3b17ed1ce201a9e6a2b254edf5196f3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4569748797434527, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011519544865928062}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.467129877071085, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011537420054210297}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f968e2350725930430602baf9272da46c51f9afb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71ccc575fcea9dd801df3e099873af8bdfd7b521 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ 
+{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..88519d2fbf6d5c99edcd47da6731ddd0f08ebd3b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ab552fba77cca9fc9c15a2b5068c84fced468996 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7073ecc6b82d815774355c95738bb2409824c5b7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4906467129877071, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560409019420367}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5056119722073757, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156170392878433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f267adf41535fcc4f664dbc148945040430cd943 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011551049647290312}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4863709246392304, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011558135970599896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ea34f8a9efd048ad0f20f543d06c60405c1d9b13 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.44414751469802244, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011490067784518679}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011528611805439893}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e57df5859bba054a14ccc82e2e92cb82617387af --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4489577765900588, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011502027057558888}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5276d6f97d0c9fde13ac26eb2cae869b6b7ad53a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5008017103153394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562417388300206}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5195082843399251, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011553628196999318}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0083e695dc773247bcf5d9d6124e88f237ddd624 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4756814537680385, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011548748301487317}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5002672367717798, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562430600098487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..467dccc837c9ce71fcefb6e8763e3f2b0d636b19 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153229486915312}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4633885622661678, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011531394084549621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e855fb829534ec27c0950a2744c6a60d4eba16c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.45323356493853556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011511744771088354}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.45163014430785675, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011508201145928354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a7f5de9f1dbfc09f16a323ffd26b0953dfa5e37a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4368231046931408, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02985524739031494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2af04bc9d954f21625fa8b5b8ae9c6a76448e5ec --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.4657039711191336, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f2891a2a33a60efdd53d3b043b38bbf898080b53 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366422}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9884c2db0786dc331dfd2ef5ef6a2a71236267e4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c50fe801f49b8affaec40f588284659dd1ea089 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e13689923855e65ab50a7065876617a5d2709e9e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.44765342960288806, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8de551556693e9a687c635953b698f65383c7d81 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f8b15858339e49ee33cd8eb96a8c6657d74b003b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6be14814cc3bf5269859cdb86fed8a800df2bc85 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5e43b6fbee04a27596ccd742f205556f27d1d76f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f1da0011d728ce9f881ef281de62b2fe6e037fa7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a8f778f5052b683c43a0e9557284e6c275192396 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8f624589f71e194cbb2d148acfc670beed9437fb --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0300523034631437}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1edf25774fce12e0527c469e14abede7563a1cce --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_4.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6a560fed86132e771953be681cd09f7059c3ef83 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.44765342960288806, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0300727231673172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a03907c04923a5e09542d42d3926b1a85c379170 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0219aa75379c823e951fac1c6cc4566666937d90 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5581bbc52147d7b94959872b48078e2948dc677b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7082acf1a469df393d80499e14957920c52cc7a8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029953149241808943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8963a126a0c5f60e150b11493cdb29da883a6d9e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..24851c21996f3a158d6074a2696e81614ed4aa1a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eec983f994320f1f60d617a3bb5d92028ff9b7e4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..32603f7711ea86d34a7bd81ebb520bc505686a7e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02993107036293953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..de28ed632d3d4adce6591e6eb0df714f26da80b5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915853}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076906}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d1632ac678376c97ba727383f247e3c15a7c83c7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": 
"winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616453}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e0af05feb569463dafb8798c63dccf25b73e66fe --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616448}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8dc083082aed55eeca1551a4a823b6323e56563f --- /dev/null +++ 
b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529015}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32dc861cbc7ec86905e7ea8e820ab24510d29e4b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e88bfe8f45977392efec953c5c47270a680693ce --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4980268350434096, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052376259225632}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049749833367589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..77ca663161589458655d9ffb35d3b0d05a535b60 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4980268350434096, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052376259225636}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5098658247829518, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049749833367592}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2f10371302f2d3f950a1da30538e392588167d35 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5043409629044988, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076892}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052131146915848}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f90a87ca762d7afd34dac23011dc354f81e5bd7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.48382004735595896, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140451261309786}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.47908445146014206, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404018549421295}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..76ec294c7de62d95db63c8cf86ca46c4467634d5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.48382004735595896, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140451261309786}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48224151539068666, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014043619596174959}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e1902441a57bc289e6d24148bf46f52a39b0d2c4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4877663772691397, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405616}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b4247e44c96087b9e7e1d2bdf315a918331de50b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440415}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.494869771112865, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405174596179052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..10133e38ef4c152065ba04a9ff06a92a072ffe29 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c467ab60313d2cae8caa953a6eaa60eb0d737c4c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228577}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": 
"5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e61594d400190792885f23b4ad6c7594ee0c8df1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405621}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076892}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..64f875d252f5140d864add21b1fc89821f13a631 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048804199859325}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4972375690607735, 
"fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_0.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d9024aa0e076032c4454cfee78680a92a19ff96 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405150083848581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_1.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4a15f7a591485ab6ae5404cdc16a657802bfe9 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4696132596685083, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? 
{{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014026510839428743}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.47277032359905286, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014031631629827701}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_2.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3662bb2ba9e97796a17c86f1de22e1133b50aca4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405195606407689}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140519560640769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_3.json b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7d5d0faad772da5641f0814cfbdac03662130131 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049516}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e8c3d4a8c7eaed9ceb16b4a2428accf2f344f7d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f75f44876c0ad3b8399472c10e118390ca119490ef730b24943deb2453be5965 +size 4107341 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a04620e935ed04783d572bf0111953b8c6d40ec --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:486053346ed3aa5205fd095b7bafa6a4dd2030122189e6f9eeaeb0b152485f3b +size 5131468 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00e4d220cf37dcd2043a55513b7fffc87585ca06 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd7c2d24a7c927de98310e71c9ca788181c85061d0fcc53dff785bf1911a88b +size 4010840 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21a8e84d7740b8ae402dd57dc4c87c3f5cdc7b96 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bd04fca91c06e8e1803dc72903ddeab3e16ae03c66403dee963a994b043e70 +size 3864482 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9eb5167b0c309c201da432055d79c639277ece0e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2debcc7ee196f46ad1df56d27816f3a08eaa5098d9b3e5b69342c4a1c42ab84 +size 4722426 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e616f8cab03b920a76b806eec28679929df8a968 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a40a4dd84381809ea192e3957e4ac64edf50ff18ad4b51a321dbefeca47b35c +size 4792804 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb6f9f9370e59e7318017865951da92ecc1da8da --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:716931cad66a5433f7ea7c3ab557a0d8570561c6ebc10a30c1b50ef9db008eaa +size 4448039 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..533421cdfc5b7cffde0745869ea6b04f825e880d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aadc18be58d9f73a530b738c2685341efa04ca220302137348ff6f657da61b32 +size 4600502 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ecbb5930a39ef5a6623d336e204645169f11d8d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b30a55a4eae1a174cd7fcc218adf25ec92f407647dd49757eb5de5e149a8c8c3 +size 5716748 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a7faa6d2b6a6da541040253094f695e800c5bbae --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aac94f4c6c1b067e31f2c9c465c154a9d738eece57b0fa69c626500e33f38a84 +size 6449519 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..04bbe043b434f973250dab17fafd3e87078e670c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c203e56502cb6d8a75a8b9737b91f515c1941ac96842ab560baf310ae1131a +size 7938608 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f0f41c389b350691fd7e167ac977c295fd878484 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437fc88ef2e9413a4dc269175c15fb9780ddb8eef2607d53e2030beba5282919 +size 13568566 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e297a9c0a43c0c79c1dfc834b3d1a8eccbe0737c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338c1b05ae68fbf8ff3a72c5a7bba28b5eae56f3deac144a2acdad303a262bbf +size 7867670 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0ec60635f11680725a6fa035857dc5fcf7b6329 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff77123b4415a9877bb4d92b12e491b57d5754c908ca6ed94f03a6d0375d240e +size 13580952 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cea841c40c9ee29d53410a2623c9b693dccdd3e3 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a821b8b2300c8c89c8635de12e926266837116b94e4a0b6aacc056a58e7e731f +size 7924532 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93dc9233f14a3a746e1c10d02def9901579bf0c0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f48653286393b0d6a4068847652eea98bd634b85d4d4792bfa33e7af459dcb +size 13696647 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aecd3b1c6853cc2046f619723c5d27d6355f3325 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd67150fe9de4123a6da0c010fc686c2999b2a6ab507cd2c071ed746530c288 +size 7695336 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a551d1939943ac1accda42f9c80a020231bb943 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ccfd9cc0c47c2c991b4e50881fe3c24ad3615e348d1768c256a301c9232890 +size 13309998 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d69c8181787446034b0edcf6e588c16b82fdd859 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779b94956d43eacabba10d0ca555ef10ef628f5c66c70aae00e01acc5b209cd6 +size 8214949 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5b776136c2be1498bbb3b4ec5715293c0999698 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c63c6f84c7ba082877b98049bf62ab4e71777d5a72839ae305bff0cf09a4889e +size 14075592 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..655bc87fae1d27f24af0d527173376b905f0ac0b --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f49e076ed34390794ed9942d08ab212ea073ac4436903890224cb23c2324df6 +size 993131 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3faab75d2946c339e2d3ba826865124992221c08 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6adbc705ae7c7840eeeed2d1970a9764f4512543667901d95e5b86c039c2e07 +size 1452524 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d9ab978d40da2aef66930ad462d973acd602972 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a822ad03782a85e4de058be1e5f1e1d2a757eeae27557c2af7af6a4db6bb8161 +size 1911369 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c20b0dcfeb0643585a36d4f0d3d8256c5248044a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d50aa405acc8881f5b08d523a6fe1bb1a570a76ddec944d4a3f0509cb93ae9 +size 2368719 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6200f985f571e39995533c8a21359513decbd986 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c32853b11567d97dacc66f0f004435208457f0fddf51cbc6f2865d4839fe26 +size 2823105 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71793486e3e05bf3c03eb74c9046d878feecec5d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c22b4538598c74c739ef78623d39e7547e074a0d0e0f65ea5a5748db854c007 +size 1203117 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7707f04d8dd5850362fee71438a4226230dc82e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a207a4be37b0a3925dc1b0ae6a31b38f93a69870152049d7eb7efdb2f5fbadf0 +size 1754936 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26d52017306184d159120aff0d29f606fcd2503a --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28510039f6f165b2b7197bcc9b80998685e8a6b1bad6dfdd87ede935acc8cb31 +size 2304675 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1832ff980bdceb82167429fcd77027ffe3365ef6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8fcdc532f9fc15963be3e965ccc92cbef94c8a352fc786c1545f1a4101409cd +size 2852887 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1596ee02c2b013e06f4f6d9549cc89e1b9751ad --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8704e2710e6234803af7a123ea272f8226c0c5bd95a4c9cd7cefb0096a89f681 +size 3398408 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66f0eefcd65401c9caedcbcb702e59ee5388aaee --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6174c900c6bf95b4bd2ca1be5206a71f49f44b3bce6a79e0925415d49fbd737b +size 1008100 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..336d78f67629fd0955920750dfb39b348120a5c8 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45661b5d1db4996c2dc7cb47a86ebf80013037cf6ee804266fece992b71eaf80 +size 1478634 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..381b565546de1d809c4f4f1c935e9dec9647563a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b58973ce294a4530397190f9795217164039b74591a656dd67b8e8bec8526a28 +size 1949427 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56df484faeee70dcd019e6efa3aea62aff18c2fa --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01929791bf6894ca3782870ee7bbdb1b61bdfe3da3c204866c78c13c292efcd4 +size 2419126 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7eafe63840f66505669f1e4a729382c42559bf63 --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68371b9391b43c9218409f2c73ec410b5f9e922f5d7e4756958fc08d8762365 +size 2885563 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1f692c8db95820182444f7a5f52df69d35d973e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465a0717ab024564f76bf3d017d834bc61014cb351e2c2f5848ac4a24aba68be +size 1161132 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a2d0fea04ff6163e789fc4cce1bc0d32b2d56a4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bb7d884f85e5fcaba3c24dfcbfa55fd61427b95fa7dff5330b3e09fb7a850f +size 1668647 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1d1c00e4b63022d6a21590ddd930a4e141111e96 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eec3ae588d1e4ebb68acbea27c41e875f3fab07d29addbb4e624c091a9d61122 +size 2177065 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36eba9d89ae22c6ba6732ae0d4cbae02483ebc9a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33229bc9ae0e6c95e3959f745485b9f0a68cd6a2b805af8630a0eb4ed5a28d2f +size 2683587 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..387a2c5f8ae70e870b233034212bede8a5efd14c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de9097447a6b988c8c06fdf24f37b38def8b61d8ad15a747e1fbc8de591ce75b +size 3187394 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1abff7909424b937332fb9414e2b9aa321c5da56 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dcd230497db347a7f24eae1d93d66d87b859040ab63e11d55ff62632b0c22f0 +size 1026813 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e9c17fa6d493dba1a7514e9638ff7d2b26052038 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6871c6592bd7c77a18ecb3df9110cbc3d62fa78c8a6e315399af23c3f659e449 +size 1503616 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06d2758ad78f976dc60bf4a25359641a1c2c9175 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3c1704b624c777f19b6e5abd9f027431018cbab66f4c8a253565a0e6b8c08a +size 1980585 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eae6c516eb0927a9150a496f6f2636af8267b945 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f33062efd43bc74ebd189ff40032cafdfb3acd3a5041da902ca49c292fdf264 +size 2455973 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b474016556291ff95fdcb1f3ac28edab0d60b81d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bdbfceb109f737c4868f4abec20bc35b7b9f5693af9ccbfc8f547a3ebb3dab3 +size 2928593 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b64ff3649ddea000233ba532a281a7491dfd25e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1cc5a244d6df2f974fb1440ec76c0bb52fd342e1cf60295ea27d89361ed27eb +size 993543 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1e0d5d17ed5ef478344b74d3bbdae8b732dbdde --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1880ad208bb681dc9b985ad284141f4ea5ec351f2544918aa551d2b9fe6820bb +size 1447672 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79f2ee528d4c85ec14c06c820a851e32edd89673 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cb08a87afe0127294922d9103381d42b306768919bf7bdf895e43c806625a6 +size 1901298 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f17b06853aad09e4ba0ffaf4a1773f73b4a7334 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a39dac6656322795519c75824b52a74624aac7bb51feeb1099e67d0e501c5fe +size 2351222 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..584ad8c26552490459c20bd1abc60feac531d9c2 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,1000 @@ +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36011, 21367, 14233, 4569], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway.\nQuestion: It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1932 True, False, or Neither? False\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017.\nQuestion: Justin Tinucci plays a professional indoor skydiver in iCarly True, False, or Neither? Neither\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg.\nQuestion: Steven Spielberg directed in total 75 movies following the release of Duel. True, False, or Neither? Neither\n###\nShehzad Sheikh or Shahzad Sheikh is a Pakistani film and television actor and model, known for playing the lead role in the 2015 film \"Karachi Se Lahore\". He also starred in the series \"Annie Ki Ayegi Baraat\", \"Mi Raqsam\", and \"Mere Hamrahi\", and a TV film \"Main Kukkoo Aur woh\". He is the son of well-known actor Javed Sheikh.\nQuestion: Shahzad Sheikh is an Indian film actor True, False, or Neither? False\n###\nThe Benetton B188 was a Formula One racing car designed by Rory Byrne and raced by Benetton team in the 1988 Formula One season and in the first half of the 1989 Formula One season. Dating back to when the team started as Toleman in , the B188 was the first car produced by the team not to be powered by a turbocharged engine.\nQuestion: The Benetton team was in charge of driving The Benetton B188. 
True, False, or Neither?", "doc_id": 314, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42940, 37308, 36577, 22843], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "American Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company.\nQuestion: The Hudson Motor Car Company ceased to exist in 1954 True, False, or Neither? Neither\n###\nTaina is an American sitcom that aired on Nickelodeon and distributed by Nelvana Limited. It was one of the last live-action comedy shows taped at Nickelodeon Studios but later moved to the Nickelodeon on Sunset in Hollywood, for its second season. The show aired from January 14, 2001 to May 11, 2002.\nQuestion: Taina aired in 5 different countries, including America True, False, or Neither? Neither\n###\nLausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic.\nQuestion: Lausche is not the tallest peak of the Lusatian Mountains. True, False, or Neither? False\n###\nNeilson Hubbard is an American singer-songwriter, musician and producer. His first band was called This Living Hand formed with Clay Jones. They signed to Adam Duritz's label, E Pluribus Unum. After the band split up, Hubbard went on to record three solo albums, \"The Slide Project\", \"Why Men Fail\" and \"Sing Into Me\". He also collaborated with Matthew Ryan to form the band Strays Don't Sleep.\nQuestion: Neilson Hubbard is known for his work in america True, False, or Neither? True\n###\nFC Saturn-1991 Saint Petersburg (Russian: \u0424\u041a \u00ab\u0421\u0430\u0442\u0443\u0440\u043d\u20111991\u00bb \u0421\u0430\u043d\u043a\u0442\u2011\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433 ) was a Russian football team from Saint Petersburg. It played professionally from 1992 to 1995, including 3 seasons (1993\u20131995) in the second-highest Russian First Division. In 1996 it merged with FC Lokomotiv Saint Petersburg. Before 1995 it was called FC Smena-Saturn Saint Petersburg.\nQuestion: FC Saturn-1991 Saint Petersburg merged with FC Lokomotiv Saint Petersburg True, False, or Neither?", "doc_id": 767, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13152, 16774, 32068, 1553], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2008 Emerald Bowl, part of the 2008-09 NCAA football bowl games season, was played on December 27, 2008, at AT&T Park, the home field of the Giants in San Francisco, California. The Miami Hurricanes of the ACC were matched against the California Golden Bears (based in nearby Berkeley, California) of the Pac-10, the first appearance by either team in the seven-year history of the Emerald Bowl.\nQuestion: The 2008 Emerald Bowl was played after christmas True, False, or Neither? True\n###\nKinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California.\nQuestion: Kinsey Millhone is a real person True, False, or Neither? False\n###\nWonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character.\nQuestion: Jeffery Reiner would've directed more episodes. True, False, or Neither? Neither\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe.\nQuestion: The show was created for network 11 True, False, or Neither? False\n###\nThe Perfect Gift is a 2009 spinoff of the 2005 Christian drama movie \"The Perfect Stranger\", and its first sequel, \"Another Perfect Stranger\". It stars Christina Fougnie, Amy Hess, Matt Wallace, and Jefferson Moore once again as Jesus Christ. It was filmed almost entirely in Kentucky, where the first two movies in the series were not.\nQuestion: The Perfect Stranger was filmed in Missouri. True, False, or Neither?", "doc_id": 539, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9857, 40833, 57, 18621], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1924\u201325 WPI Engineers men's basketball team represented Worcester Polytechnic Institute during the 1924\u201325 NCAA men's basketball season. They were coached by Ivan Bigler. The Engineers played their home games at Alumni Gym in Worcester, Massachusetts. The team finished the season with 5 wins and 9 losses.\nQuestion: Ivan Bigler led the team to more losses than wins in 1924-25. True, False, or Neither? True\n###\nSalli Elise Richardson (born November 23, 1967) is an American television and film actress and director. 
Richardson is known for her role as Angela on the 1994 hit comedy/action film \"A Low Down Dirty Shame\" and for her role as Dr. Allison Blake on the Syfy comedy-drama series \"Eureka\" (2006\u20132012).\nQuestion: Salli Elise Richardson starred in the Syfy comedy-drama series \"Eureka\" (2006\u20132012) True, False, or Neither? True\n###\nThirteen Ghosts (also known as 13 Ghosts and stylized as THIR13EN Ghosts) is a 2001 Canadian-American supernatural horror film directed by Steve Beck. It is a remake of the 1960 film \"13 Ghosts\" by William Castle. It follows the remake of another one of Castle's films, \"House on Haunted Hill\", and was shot entirely around Lower Mainland, British Columbia.\nQuestion: Thirteen Ghosts was filmed primarily in Canada True, False, or Neither? True\n###\nThe interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH).\nQuestion: Jacques Vall\u00e9e is not proud of the interdimensional hypothesis. True, False, or Neither? Neither\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: The Plain is to the south of Marston Road True, False, or Neither?", "doc_id": 43, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15210, 6214, 43848, 32357], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi.\nQuestion: An Indian crime film inspired by \"The Godfather\" was re-released 5 years after its original release date, but dubbed in Tamil and had screen re-shots with different actors. True, False, or Neither? True\n###\nHow Green Was My Valley is a BBC Television serial based on the novel by Richard Llewellyn, and features one of the last performances by Stanley Baker. It was first shown in the UK from 29 December 1975 in six weekly parts, while producer Martin Lisemore also cast Si\u00e2n Phillips in his next production, \"I Claudius\" (1976).\nQuestion: Si\u00e2n Phillips is the writer of How Green Was My Valley True, False, or Neither? False\n###\nUpper Grosvenor Street is a historic street in Mayfair, London, United Kingdom. It runs from the Grosvenor House Hotel off Park Lane to the Embassy of the United States off Grosvenor Square. The Embassy of Monaco is located at number 7. 
Odey Asset Management, a hedge fund run by Crispin Odey, is located at number 12.\nQuestion: Grosvenor is not just the name of a street. True, False, or Neither? True\n###\nSouthpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company.\nQuestion: When it was released in 2015, the movie Southpaw was expected to be the top grossing movie in the box office. True, False, or Neither? Neither\n###\nThe Tragedy of Julius Caesar is a tragedy by William Shakespeare, believed to have been written in 1599. It is one of several plays written by Shakespeare based on true events from Roman history, which also include \"Coriolanus\" and \"Antony and Cleopatra\".\nQuestion: The Tragedy of William Shakespeare is a tragedy by Julius Caesar, believed to have been written in 1599. True, False, or Neither?", "doc_id": 558, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24442, 12585, 45164, 21773], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s.\nQuestion: Fred Wesley and the Horny Horns produced albums until the year 2000. True, False, or Neither? Neither\n###\nMenelik Watson (born December 22, 1988) is an English professional American football offensive tackle for the Denver Broncos of the National Football League (NFL). He was drafted by the Oakland Raiders in the second round of the 2013 NFL Draft. He played college football at Florida State.\nQuestion: Watson never completed his degree after being drafter into the NFL. True, False, or Neither? Neither\n###\nSt Kilda is an inner suburb (neighbourhood) of the metropolitan area of Melbourne, Victoria, Australia, 6 km south-east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, St Kilda had a population of 17,795.\nQuestion: According to the 2011 census, St Kilda has more than 20000 living there True, False, or Neither? False\n###\nThe Original Rude Girl is the second studio album by Puerto Rican reggaeton recording artist Ivy Queen released on December 15, 1998 by Sony Discos. It is the follow up studio album to Queen's debut effort \"En Mi Imperio\" released in 1997. The album includes Queen's debut single \"In The Zone\" featuring Wyclef Jean, which helped to increase the album and Queen's exposure to American audiences.\nQuestion: The Lord of the Rings: The Fellowship of the Ring starts with a T. True, False, or Neither? 
True\n###\n\"679\" is the second single by American rapper Fetty Wap from his self-titled debut album. The song features Remy Boyz member Monty and former Remy Boyz member P-Dice. \"679\" peaked at number 4 on the US \"Billboard\" Hot 100, becoming his second highest-charting single after \"Trap Queen\". The album version of the song omits P-Dice's verse, only featuring Monty.\nQuestion: 679 is an area code. True, False, or Neither?", "doc_id": 709, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32110, 19673, 16595, 35990], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County.\nQuestion: The population in year 2010 was 5 digits True, False, or Neither? True\n###\nRecently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct.\nQuestion: At least 79 species have become extinct. True, False, or Neither? True\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978.\nQuestion: Gary Sutherland played first base several times True, False, or Neither? Neither\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries.\nQuestion: The 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to me. True, False, or Neither? False\n###\nWhite Fang 2: Myth of the White Wolf is a 1994 American Northern adventure film directed by Ken Olin. A sequel to the 1991 \"White Fang\", it stars Scott Bairstow, Alfred Molina, and Geoffrey Lewis. Filming took place in Aspen, Colorado and Vancouver, British Columbia. Walt Disney Home Video released this movie on VHS October 19, 1994.\nQuestion: White Fang 2: Myth of the White Wolf is a short film True, False, or Neither?", "doc_id": 737, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38872, 39899, 25980, 28445], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Hyundai Xcent is an A-segment sedan by Hyundai Motor Company. Based on the Hyundai Grand i10, the Xcent is manufactured by Hyundai Motor India Limited in Chennai. It made its debut on February 4, 2014, three days ahead of its world premiere at the Auto Expo 2014.\nQuestion: The Xcent was first test driven in 2012. True, False, or Neither? Neither\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: Parsons and White were successful with The Byrds after they left Nashville West. True, False, or Neither? Neither\n###\n\"The Bear and the Maiden Fair\" is the seventh episode of the third season of HBO's fantasy television series \"Game of Thrones\", and the 27th episode of the series overall. The episode was written by George R. R. Martin, the author of the \"A Song of Ice and Fire\" novels on which the series is based, and was directed by Michelle MacLaren, her directorial debut for the series.\nQuestion: The Bear and the Maiden Fair was written after the book True, False, or Neither? Neither\n###\nThe Merdeka Palace (Indonesian: Istana Merdeka ; also known in Indonesian as Istana Gambir and during colonial times as Paleis te Koningsplein), is one of six presidential palaces in Indonesia. It is located on the north side of the Merdeka Square in Central Jakarta, Indonesia and is used as the official residence of the President of the Republic of Indonesia.\nQuestion: The Merdeka Palace was remodeled in 2006. True, False, or Neither? Neither\n###\nThe Harlem Globetrotters Popcorn Machine was a Saturday morning variety show featuring players from the basketball team the Harlem Globetrotters singing, dancing, and performing comedy sketches. Broadcast in 1974, it was produced by Funhouse Productions for Viacom Productions.\nQuestion: The Harlem Globetrotters Popcorn Machine was a series of tubes True, False, or Neither?", "doc_id": 67, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42578, 38878, 44318, 15933], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Year 493 BC was a year of the pre-Julian Roman calendar. At the time, it was known as the Year of the Consulship of Auruncus and Viscellinus (or, less frequently, year 261 \"Ab urbe condita\"). The denomination 493 BC for this year has been used since the early medieval period, when the Anno Domini calendar era became the prevalent method in Europe for naming years.\nQuestion: Year 493 BC was very recent. True, False, or Neither? 
False\n###\nKdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity.\nQuestion: Kdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with headquarters in both Irvine (US) and Changsha City (China). True, False, or Neither? False\n###\nTango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center.\nQuestion: Tango is a dance inspired by classical music True, False, or Neither? Neither\n###\nDavid Krakauer (born September 22, 1956) is an American clarinetist raised and based in New York, NY. He is known for his work in klezmer music as well as classical music and avant-garde improvisation. He is also considered an accomplished jazz player.\nQuestion: David Krakauer is the only musician in his family. True, False, or Neither? Neither\n###\nDavid Gibb (born 1 July 1990) is a children's musician and songwriter from Belper, Derbyshire. He was a finalist of the BBC Radio 2 Young Folk Award 2011, as well as winning the 'Highly Commended' prize at the Young Storyteller of the Year Awards the same year. In 2013, Gibb featured alongside musical collaborator Elly lucas in the advertising campaign for Gola trainers.\nQuestion: David Gibb ends with a B. True, False, or Neither?", "doc_id": 298, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27333, 24496, 447, 29923], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Innyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169.\nQuestion: Innyaly its famous for its food. True, False, or Neither? Neither\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle.\nQuestion: Boon Brewery is from Belgium True, False, or Neither? True\n###\n\"679\" is the second single by American rapper Fetty Wap from his self-titled debut album. The song features Remy Boyz member Monty and former Remy Boyz member P-Dice. 
\"679\" peaked at number 4 on the US \"Billboard\" Hot 100, becoming his second highest-charting single after \"Trap Queen\". The album version of the song omits P-Dice's verse, only featuring Monty.\nQuestion: 679 was Fetty Wap's highest charting song. True, False, or Neither? False\n###\nDwight Yoakam is an American country music singer-songwriter. Since his debut single, \"Honky Tonk Man\" in 1986, he has released 46 singles, including two \"Billboard\" Hot Country Songs number one singles, as well as 4 number ones in Canada. In addition to having two number one singles in the United States, Yoakam also has thirteen Top 10 singles on the country chart.\nQuestion: Dwight Yoakam has traveled to and played country music in every US state. True, False, or Neither? Neither\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997.\nQuestion: \"Day In, Day Out\" is by an American Band True, False, or Neither?", "doc_id": 240, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7274, 34279, 2696, 24182], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2015 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the first edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Happy Valley, Australia between 3\u201311 January 2015.\nQuestion: The 2015 City of Onkaparinga ATP Challenger had a ton of subsequent tournaments after this one was played. True, False, or Neither? Neither\n###\nMatthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season.\nQuestion: Matthew Mansfield was born more than one billion seconds ago. True, False, or Neither? False\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician.\nQuestion: Thalia has at least four albums. True, False, or Neither? True\n###\nStand-In is a 1937 American comedy film directed by Tay Garnett and starring Leslie Howard, Joan Blondell, and Humphrey Bogart. The picture was produced by the independent Walter Wanger, and released by United Artists. It is set in Hollywood and parodies many aspects of the film industry during the Classical Era.\nQuestion: Howard and Blondell had many scenes together. True, False, or Neither? 
Neither\n###\nThe Blackpool Gazette is an English evening newspaper based in Blackpool, Lancashire. Published every day except Sunday, it covers the towns and communities of the Fylde coast. It was founded as \"The West Lancashire Evening Gazette\" in 1929 before being renamed the \"Evening Gazette\", and then \"Blackpool Gazette\". The paper's history dates back to a weekly publication founded in 1873.\nQuestion: the paper is distributed to multiple towns True, False, or Neither?", "doc_id": 231, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8191, 8687, 2247, 6795], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Beyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John LaZar, Michael Blodgett and David Gurian. The film was directed by Russ Meyer and co-written by Meyer and Roger Ebert.\nQuestion: Beyond the Valley of the Dolls is a 1980 satirical melodrama True, False, or Neither? False\n###\nThe Communaut\u00e9 de communes des Trois Rivi\u00e8res (before January 2017: \"Communaut\u00e9 de communes du Pays des Trois Rivi\u00e8res\") is a federation of municipalities (\"communaut\u00e9 de communes\") in the Aisne \"d\u00e9partement\" and in the Hauts-de-France \"region\" of France.\nQuestion: The Communaut\u00e9 de communes des Trois Rivi\u00e8res is made up of mostly low class people True, False, or Neither? Neither\n###\nRear Admiral Kevin John Scarce {'1': \", '2': \", '3': \", '4': \"} (born 4 May 1952) is a retired Royal Australian Navy officer who was the 34th Governor of South Australia, serving from August 2007 to August 2014. He was succeeded by Hieu Van Le, who had previously been his lieutenant governor.\nQuestion: Kevin served over 20 years in government positons True, False, or Neither? Neither\n###\n\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child.\nQuestion: The short work actually began with a plotline True, False, or Neither? Neither\n###\n\"I'd Be Lost\" and \"Only One\" are two songs recorded by Australian singer-songwriter Sarah Blasko for her fifth studio album \"Eternal Return\". Both songs premiered on 13 September 2015 during Richard Kingsmill's new music segment on Triple J and were released as a double A-side on 18 September 2015.\nQuestion: \"I'd Be Lost\" and \"Only One\" are actually two names for the same song True, False, or Neither?", "doc_id": 978, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15905, 44510, 26867, 23647], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Newcomers is a 2000 American family drama film directed by James Allen Bradley and starring Christopher McCoy, Kate Bosworth, Paul Dano and Chris Evans. Christopher McCoy plays Sam Docherty, a boy who moves to Vermont with his family, hoping to make a fresh start away from the city. It was filmed in Vermont, and released by Artist View Entertainment and MTI Home Video.\nQuestion: The Newcomers was a box office success. True, False, or Neither? Neither\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart.\nQuestion: Early Mac collaborated with Chance the rapper True, False, or Neither? Neither\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups.\nQuestion: this corn disease is actually put inside of food in certain countries True, False, or Neither? True\n###\nKinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California.\nQuestion: Santa Teresa is in California. True, False, or Neither? True\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Despite being born in France, Jo\u00e3o later on moved to the US True, False, or Neither?", "doc_id": 729, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27663, 44926, 15394, 15173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "X X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). 
It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars.\nQuestion: X X X X is something you drink True, False, or Neither? True\n###\nThe Office is a British mockumentary sitcom, first broadcast in the United Kingdom on BBC Two on 9 July 2001. Created, written and directed by Ricky Gervais and Stephen Merchant, the programme is about the day-to-day lives of office employees in the Slough branch of the fictitious Wernham Hogg Paper Company. Gervais also stars in the series, playing the central character, David Brent.\nQuestion: Brent is not a fictitious character. True, False, or Neither? False\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: Will Smith wanted his own son to be in the movie. True, False, or Neither? Neither\n###\nDovyalis is a genus of shrubs and small trees. Recent genetic evidence has shown the genus to belong to the family Salicaceae; formerly it was classified in the family Flacourtiaceae. The 15 species are native to Africa (Ethiopia south to South Africa) and southern Asia (India, Sri Lanka). Some are cultivated for their fruit.\nQuestion: The Dovyalis genus includes fruit-bearing plants. True, False, or Neither? True\n###\nI Love Hong Kong is a 2011 Hong Kong comedy film produced and directed by Eric Tsang. Film stars Tsang, Tony Leung Ka-fai, Sandra Ng and a star-studded cast of Hong Kong stars. It was released in Chinese New Year Day. The sequel movies are I Love Hong Kong 2012 and I Love Hong Kong 2013.\nQuestion: Eric Tsang's I Love Hong Kong sequels were released on Chinese New Year Day. True, False, or Neither?", "doc_id": 354, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7353, 28461, 25988, 29299], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The foreign debt of the Socialist Republic of Romania were loans made by Socialist Republic of Romania under Nicolae Ceau\u0219escu from international creditors denominated in hard currencies. These loans were used to buy technology, equipment and raw materials needed for the industrialization of the country.\nQuestion: The Socialist Republic of Romania received loans from foreign creditors while Nicolae Ceau\u0219escu was in power True, False, or Neither? True\n###\nThe Merdeka Palace (Indonesian: Istana Merdeka ; also known in Indonesian as Istana Gambir and during colonial times as Paleis te Koningsplein), is one of six presidential palaces in Indonesia. It is located on the north side of the Merdeka Square in Central Jakarta, Indonesia and is used as the official residence of the President of the Republic of Indonesia.\nQuestion: The Merdeka Palace has a pink roof. True, False, or Neither? Neither\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. 
Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008.\nQuestion: He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2007. True, False, or Neither? False\n###\nRiver Raid is a scrolling shooter video game designed and developed by Carol Shaw, and published by Activision in 1982 for the Atari 2600 video game console. Over a million game cartridges were sold. Activision later ported the title to the Atari 5200, ColecoVision, and Intellivision game consoles, as well as to the Commodore 64, IBM PCjr, MSX, ZX Spectrum, and Atari 8-bit family home computers.\nQuestion: River Raid was one of the hardest games ever. True, False, or Neither? Neither\n###\n\"Vanlose Stairway\" is a song written by Northern Irish singer-songwriter Van Morrison and included on his 1982 album, \"Beautiful Vision\". It has remained a popular concert performance throughout Morrison's career and has become one of his most played songs.\nQuestion: Vanlose Stairway is a Van Morrison Song and on an abum True, False, or Neither?", "doc_id": 368, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39216, 33267, 32963, 22724], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge.\nQuestion: Kew Bridge should be a historical landmark. True, False, or Neither? Neither\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games.\nQuestion: The SuperSonics are not a basketball team. True, False, or Neither? False\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway.\nQuestion: Princess Ragnhild was born in MCMXXXI True, False, or Neither? Neither\n###\nThe 8th Race of Champions was a non-Championship motor race, run to Formula One rules, held on 18 March 1973 at Brands Hatch circuit in Kent, UK. The race included several entrants in Formula 5000 cars and was won by Peter Gethin in a Chevron-Chevrolet B24 '72-05'. 
This was the only race other than the poorly-attended 1969 Madrid Grand Prix in which a Formula 5000 car beat a Formula One car.\nQuestion: The 8th Race of Champions was one of the worst races True, False, or Neither? Neither\n###\nMichael Cassio, or simply Cassio, is a fictional character in William Shakespeare's \"Othello\". The source of the character is the 1565 tale \"Un Capitano Moro\" by Cinthio; Cassio is unnamed in Cinthio but referred to as \"the squadron leader\". In the play, Cassio is a young and handsome lieutenant under Othello's command who becomes one of Iago's several victims in a plot to ruin Othello.\nQuestion: Shakespeare used Cinthio's squadron leader as a model for Cassio. True, False, or Neither?", "doc_id": 914, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12389, 33946, 41076, 3126], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sound of Waves (\u6f6e\u9a12 , Shiosai ) is a 1954 novel by the Japanese author Yukio Mishima. It is a coming-of-age story of the protagonist Shinji and his romance with Hatsue, the beautiful daughter of the wealthy ship owner Terukichi. For this book Mishima was awarded the Shincho Prize from Shinchosha Publishing in 1954. It has been adapted for film five times.\nQuestion: The Sound of Waves is a 1960 novel by a Japanese person True, False, or Neither? False\n###\nMax & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes.\nQuestion: Ben McMillan did not create Max & Shred by himself. True, False, or Neither? True\n###\nThe second series of the British television sketch comedy premiered on BBC Two on 21 July 2005. This series included six episodes with the concluding episode broadcast on 25 August 2005. A Christmas Special followed the second series and was screened on BBC Two on 20 December 2005.\nQuestion: The British television sketch comedy premiered on BBC 2 on 21 July 2005 with a second series. True, False, or Neither? True\n###\nCurzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South .\nQuestion: The manager of Curzon Ashton Ladies Football Club is a woman. True, False, or Neither? Neither\n###\nThe Green Goblin's Last Stand is a 1992 fan film by Dan Poole, based on the comic book story \"The Night Gwen Stacy Died\", published by Marvel Comics in \"The Amazing Spider-Man\" #121\u2013122. Poole is the director, producer, creative editor, screenwriter, and star of the film. 
The film and its attendant documentary received showings and accolades at several small film festivals.\nQuestion: The Green Goblin has been entered into contests True, False, or Neither?", "doc_id": 335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40916, 9037, 36776, 37288], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fan and Mortar Geysers are two geysers in the Upper Geyser Basin in Yellowstone National Park. For the past several decades, they have erupted in concert with one another and are generally talked about together. The records detailing these geysers' known eruptive history shows that they have been infrequent and irregular performers.\nQuestion: Sanse plays home games at multiple stadiums. True, False, or Neither? Neither\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: The home depot primarily supplies construction companies True, False, or Neither? Neither\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title.\nQuestion: The 2010 ASB Classic was a mans tournament True, False, or Neither? False\n###\nThe 18th Critics' Choice Awards were presented on January 10, 2013 at the Barker Hangar at the Santa Monica Airport, honoring the finest achievements of 2012 filmmaking. The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2012.\nQuestion: The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2813. True, False, or Neither? False\n###\nThe 1998 NCAA Men's Volleyball Tournament was the 29th annual tournament to determine the national champion of NCAA men's collegiate volleyball. The single elimination tournament was played at the Stan Sheriff Center in Honolulu, Hawai\u02bbi during May 1998. With a total tournament attendance of 18,901, this remains this best attended men's volleyball championship.\nQuestion: The teams were made up of students. True, False, or Neither?", "doc_id": 883, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25570, 2174, 26910, 27522], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ron & Carol Cope Stadium at Foster Field, is a football stadium located in Kearney, Nebraska, on the University of Nebraska\u2013Kearney campus. In 2005, the university named the stadium after Ron & Carol Cope, who were long-time supporters of the University of Nebraska System. The field is named after Charlie Foster, a former coach and athletic director at Nebraska\u2013Kearney.\nQuestion: Foster Field is in need of repair. True, False, or Neither? Neither\n###\nThe Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States.\nQuestion: The Forum Shops at Caesars has the largest gross income. True, False, or Neither? True\n###\nBeilin District () is one of nine districts of Xi'an, the capital of Shanxi province, China. The well-known Small Wild Goose Pagoda is also located in the district. The smallest, but most densely populated, of Xi'an's county-level divisions, it borders the districts of Xincheng to the northeast, Yanta to the south, and Lianhu to the northwest.\nQuestion: Beilin District is popular amongst people with hair True, False, or Neither? Neither\n###\nDMOZ (from \"directory.mozilla.org\", an earlier domain name) was a multilingual open-content directory of World Wide Web links. The site and community who maintained it were also known as the Open Directory Project (ODP). It was owned by AOL (now a part of Verizon's Oath Inc.) but constructed and maintained by a community of volunteer editors.\nQuestion: DMOZ is no longer maintained. True, False, or Neither? True\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation.\nQuestion: The 89th medium tank battalion participated in no fewer than 10 campaigns but no more then 25. True, False, or Neither?", "doc_id": 804, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22615, 39956, 6823, 28758], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Svensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. 
The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord.\nQuestion: Linsborg, Kansas was ties with many of the festival familes. True, False, or Neither? Neither\n###\nDhanish Karthik (born 24 July 1989) is an Indian actor. He made his debut as Sanjeev Menon in the Malayalam film \"Ivide\" (2015) directed by Shyamaprasad. He recently finished filming for the Bollywood film Chef (2017 film) with Saif Ali Khan. The film, directed by Raja Krishna Menon, is slated to release in July 2017. This will be Karthik's debut in Bollywood.\nQuestion: Shyamaprasad directed Ivide. True, False, or Neither? True\n###\n\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child.\nQuestion: Ursula intended the work to be a major hit True, False, or Neither? Neither\n###\nSonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m).\nQuestion: sonette is a community is east central powder river county True, False, or Neither? False\n###\nJay Kahn is a Democratic member of the New Hampshire Senate representing the 10th district. The 10 district is located in the southwestern corner of the state and includes Alstead, Chesterfield, Gilsum, Harrisville, Hinsdale, Keene, Marlborough, Roxbury, Sullivan, Surry, Swanzey, Walpole, Westmoreland and Winchester, New Hampshire.\nQuestion: The 10th district includes 14 towns. True, False, or Neither?", "doc_id": 888, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38969, 42876, 4547, 5606], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Colin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter.\nQuestion: Colin Francis Weeber Isaacs helps with the Gallon Newsletter. True, False, or Neither? True\n###\nBremen ( ) is a small town in Lincoln County, Maine, United States. The population was 806 at the 2010 census. Located on Muscongus Bay and the Gulf of Maine, it includes the villages of Broad Cove, Turners Corner, Bremen, Medomak and Muscongus. Hog Island is a center and camp for the Maine chapter of the National Audubon Society.\nQuestion: Bremen ( ) is a small town in Lincoln County, Maine, United States. It has many villages in it. 
True, False, or Neither? True\n###\nRastafari, sometimes termed Rastafarianism, is an Abrahamic religion. Classified as a new religious movement, it developed in Jamaica during the 1930s. It lacks any centralised authority and there is much heterogeneity among practitioners, who are known as Rastafari, Rastafarians, or Rastas.\nQuestion: Rastafari is the newest Abrahamic religion True, False, or Neither? Neither\n###\nIn the mathematical field of topology, the Alexandroff extension is a way to extend a noncompact topological space by adjoining a single point in such a way that the resulting space is compact. It is named for the Russian mathematician Pavel Alexandrov.\nQuestion: Alexandroff extensions make compact space True, False, or Neither? True\n###\nWest Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records.\nQuestion: It started to dominate less than 100 years ago True, False, or Neither?", "doc_id": 795, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40789, 37317, 44760, 45119], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The PLDT Home TVolution Power Attackers (women's) and the PLDT Home Telpad-Air Force Turbo Boosters (men's) were professional volleyball teams owned by PLDT that played in the Philippine Super Liga (PSL) from 2013 to 2014. The club was first known as PLDT myDSL Speed Boosters.\nQuestion: the club known as pldt speed boosters played in philipine league True, False, or Neither? True\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro.\nQuestion: Nicola or Niccolo Massaro died in 1804 True, False, or Neither? False\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Krishan Kant voted for Sushil Kumar Shinde True, False, or Neither? False\n###\nSophie Tucker (January 13, 1887 \u2013 February 9, 1966) was a Ukrainian-born American singer, comedian, actress, and radio personality. Known for her stentorian delivery of comical and risqu\u00e9 songs, she was one of the most popular entertainers in America during the first half of the 20th century. She was widely known by the nickname \"The Last of the Red Hot Mamas\".\nQuestion: The Last of the Red Hot Mamas was a nickname given to the american-born singer sophie tucker True, False, or Neither? 
False\n###\nDiscover Financial Services, Inc. is an American financial services company, which issues the Discover Card and operates the Discover and Pulse networks, and owns Diners Club International. Discover Card is the third largest credit card brand in the United States, when measured by cards in force, with nearly 50 million cardholders.\nQuestion: More than 50 million people have Discover Card. True, False, or Neither?", "doc_id": 973, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43489, 32139, 15780, 14722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2012 SEC Women\u2019s Basketball Tournament took place at the Bridgestone Arena in Nashville, Tennessee from March 1-4, 2012. The Tennessee Lady Volunteers won the tournament and received the SEC\u2019s automatic bid to the 2012 NCAA Women\u2019s Basketball Tournament by defeating the LSU Lady Tigers 70-58 in the championship game.\nQuestion: The 2012 SEC Women's Basketball tournament was won by 12 points True, False, or Neither? True\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: Johan Martin Schr\u00f6der considers himself an entrepreneur. True, False, or Neither? Neither\n###\n\"Beez in the Trap\" is a song by rapper Nicki Minaj for her second studio album, \"\" (2012). It was written by Minaj, Maurice Jordan, and 2 Chainz, who contributed a guest verse to the song, while production was handled by Kenoe. The track was released as the album's third single on May 29, 2012 following \"Starships\" and \"Right by My Side\".\nQuestion: Nicki Minaj has released four albums since Beez in the Trap. True, False, or Neither? Neither\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards.\nQuestion: Deckard has won best actor in years other than 2008. True, False, or Neither? Neither\n###\nThe Raid on Le Havre was a two-day naval bombardment of the French port of Le Havre early in July 1759 by Royal Navy forces under Rear-Admiral George Rodney during the Seven Years' War, which succeeded in its aim of destroying many of the invasion barges being gathered there for the planned French invasion of Great Britain.\nQuestion: The Raid on Le Havre was known for the diseases which ravaged camps. True, False, or Neither?", "doc_id": 661, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22800, 29736, 7800, 21163], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ann Rae Rule (n\u00e9e Stackhouse; October 22, 1931 \u2013 July 26, 2015) was an American true crime author of \"The Stranger Beside Me\", about serial killer, and Rule's co-worker, Ted Bundy. Rule was also known for her book \"Small Sacrifices\", about Oregon child murderer Diane Downs. Many of Rule's books center on murder cases that occurred in the Pacific Northwest and her adopted home state of Washington.\nQuestion: \"The Stranger Beside Me\" is a waste of time. True, False, or Neither? Neither\n###\nBela George Lugosi (born January 5, 1938 in Los Angeles, California), also known as Bela Lugosi Jr., is an American attorney and the son of actor B\u00e9la Lugosi. His legal actions in \"Lugosi v. Universal Pictures\" led to the creation of the California Celebrities Rights Act.\nQuestion: Bela Lugosi Jr was also a divorce lawyer. True, False, or Neither? Neither\n###\nSing A to Z is the tenth album by popular children's entertainers Sharon, Lois & Bram, originally released in 1990. This album, like many other Sharon, Lois & Bram albums has been re-released many times. It is rumored that the idea for this album came from Lois when she and Sharon were window shopping and came across an alphabet quilt on display.\nQuestion: Sharon, Lois & Bram have released a new album. True, False, or Neither? Neither\n###\nIsmail Merchant (25 December 1936\u00a0\u2013 25 May 2005) was an Indian-born film producer and director. He worked for many years in collaboration with Merchant Ivory Productions which included director (and Merchant's longtime professional and domestic partner) James Ivory as well as screenwriter Ruth Prawer Jhabvala. Their films won six Academy Awards.\nQuestion: Merchant is a homosexual. True, False, or Neither? Neither\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX.\nQuestion: It was high noon when the cartoon characters went for an adult swim. True, False, or Neither?", "doc_id": 768, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36860, 33113, 18459, 32493], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. 
It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall has a lot of stores that sell jeans True, False, or Neither? Neither\n###\nMiss Europe 2002, the 57th Miss Europe pageant, was held at the Beirut International Exhibition & Leisure Center in Beirut, Lebanon on December 28, 2002. Svetlana Koroleva, Miss Russia, was crowned Miss Europe 2002 by outgoing titleholder Elodie Gossuin of France.\nQuestion: Svetlana Koroleva was born on December 29, 1981. True, False, or Neither? Neither\n###\nThe 2015 Auburn Tigers softball team is an American softball team, representing the Auburn University for the 2015 NCAA softball season. In 2014, the Auburn Tigers softball team went 42-19-1 during Clint Myers first season. The Auburn Tigers play their home games at Jane B. Moore Field.\nQuestion: The Tigers play their home games at Jane B. Moore Field.\n True, False, or Neither? True\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city.\nQuestion: The population was the sum 800 + 53 in 2010 True, False, or Neither? True\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\".\nQuestion: kidsty pike fell and Haweswater Reservoir are in the same district in England True, False, or Neither?", "doc_id": 878, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36872, 22127, 41005, 39566], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke.\nQuestion: \"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost in Space\" and 117th episode overall. True, False, or Neither? False\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. 
With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th.\nQuestion: Italy won 3rd against Vasili Vyacheslavovich Blagov and Irina Cherniaeva in the 1972 Winter Olympics. True, False, or Neither? Neither\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Chris McKendry is not her original name True, False, or Neither? True\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The gator bowl was played at the Gators home field. True, False, or Neither? Neither\n###\nThe City of Canada Bay is a local government area in the Inner West of Sydney, New South Wales, Australia. The city was formed on 1 December 2000, following the merger of Concord and Drummoyne councils. The city covers an area of 19.82 km2 and as at the 2016 census had a resident population of . The city is ultimately named after Canada Bay, a bay on the Parramatta River.\nQuestion: The City of Canada Bay covers more than 11 miles. True, False, or Neither?", "doc_id": 910, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12217, 9286, 25834, 34483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hedera helix (common ivy, English ivy, European ivy, or just ivy) is a species of flowering plant in the family Araliaceae, native to most of Europe and western Asia. A rampant, clinging evergreen vine, it is a familiar sight in gardens, waste spaces, on house walls, tree trunks and in wild areas across its native habitat.\nQuestion: Hedera helix can found in the EU True, False, or Neither? True\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: Keystone has special status with the government. True, False, or Neither? Neither\n###\nThe following is a list of female cabinet ministers of Thailand. Thailand is a country located at the centre of the Indochina peninsula in Southeast Asia. 
It is bordered to the north by Burma and Laos, to the east by Laos and Cambodia, to the south by the Gulf of Thailand and Malaysia, and to the west by the Andaman Sea and the southern extremity of Burma.\nQuestion: Thailand has female cabinet members True, False, or Neither? True\n###\nThe Exterminating Angel (Spanish: El \u00e1ngel exterminador ), is a 1962 surrealist film, written and directed by Luis Bu\u00f1uel, starring Silvia Pinal, and produced by her then-husband Gustavo Alatriste. Sharply satirical and allegorical, the film contains a view of human nature suggesting \"mankind harbors savage instincts and unspeakable secrets\".\nQuestion: The film contains an interesting view of human nature. True, False, or Neither? Neither\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games.\nQuestion: The SuperSonics lost to the Basketball team from Washington in the playoffs. True, False, or Neither?", "doc_id": 226, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31426, 3674, 33223, 8742], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: The mall has stores in it True, False, or Neither? True\n###\nGirls on Top is a British ITV sitcom, broadcast in 1985 and 1986, and made by Witzend for the ITV contractor Central Independent Television. It stars Dawn French, Jennifer Saunders, Ruby Wax and Tracey Ullman, and was written by French, Saunders, and Wax with additional material from Ullman. Despite a poor critical reception, the series was a ratings success.\nQuestion: Girls on Top received good ratings. True, False, or Neither? True\n###\nGwendoline See-Hian Yeo (; born July 10, 1977) is a Singaporean-born American actress, voice actress and musician, best known for her recurring guest-star role as Xiao-Mei in the hit television series \"Desperate Housewives\", and as Dr. Kelly Lee in \"General Hospital\".\nQuestion: Gwendoline See-Hian Yea is a decent musician. True, False, or Neither? Neither\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: Coates became prominently known after the film's release. True, False, or Neither? 
Neither\n###\nHenry II (18 April 1503 \u2013 25 May 1555), nicknamed \"Sang\u00fcesino\" because he was born at Sang\u00fcesa, was the King of Navarre from 1517, although his kingdom had been reduced to a small territory north of the Pyrenees by the Spanish conquest of 1512. Henry succeeded his mother, Queen Catherine, upon her death. His father was her husband and co-ruler, King John III, who died in 1516.\nQuestion: Queen Catherine was King Of Navarre True, False, or Neither?", "doc_id": 146, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42508, 20268, 31487, 35010], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Max & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes.\nQuestion: The series premiered on Nickelodeon in the UK on October 6, 2014. True, False, or Neither? False\n###\nChristmas Eve is the day before Christmas Day, the festival commemorating the birth of Jesus of Nazareth. Christmas Day is observed around the world, and Christmas Eve is widely observed as a full or partial holiday in anticipation of Christmas Day. Together, both days are considered one of the most culturally significant celebrations in Christendom and Western society.\nQuestion: Christmas Eve and day are the most important holidays in Western Society. True, False, or Neither? Neither\n###\nBest of 4Minute is the first Japanese compilation album by the South Korean girl group 4Minute. It is composed of all the Japanese tracks released by the group since their debut in Japan. It was released on September 26, 2012 in three different editions: 2 limited CD+DVD (Type A with a live event and Type B with all Japanese music videos) and a Regular edition.\nQuestion: Best of 4Minute was released in 20th century. True, False, or Neither? False\n###\nMoody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group.\nQuestion: James Moody only does Jazz music instrumental recordings for IPO Recordings. True, False, or Neither? Neither\n###\nThe Doberman Gang is a 1972 film about a talented animal trainer who uses a pack of Dobermans to commit a bank robbery. The six dogs were all named after famous bank robbers. Their names were Dillinger (John Dillinger), Bonnie (Bonnie Parker), Clyde (Clyde Barrow), Pretty Boy Floyd, Baby Face Nelson, and Ma Barker.\nQuestion: In the Doberman Gang movie, one of the dogs was named Clyde Parker. 
True, False, or Neither?", "doc_id": 57, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22821, 38128, 42079, 38123], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Live at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. It has only been made available at live Van Morrison concerts and at the Van Morrison Official website.\nQuestion: It was very popular True, False, or Neither? Neither\n###\nElizabeth Berridge (born May 2, 1962) is an American film and theatre actress. She is known for playing Constanze Mozart in the Academy Award-winning 1984 film \"Amadeus\", for the role of Officer Eve Eggers on \"The John Larroquette Show\" (1993-1996), and for her performances in the theater.\nQuestion: Berridge won awards for her theater performances. True, False, or Neither? Neither\n###\nPrincess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer.\nQuestion: The princess was related to the pretender of the throne. True, False, or Neither? True\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: Lloyd Cole was in the band called the Commodores. True, False, or Neither? False\n###\nPeter Murray Kapetan was an American Broadway actor, singer and dancer notable for playing numerous roles during a thirty-year career. He was notable for performing in the musical \"The Wedding Singer\" as a Ronald Reagan impersonator. He appeared in \"Titanic\", \"Sunset Boulevard\", \"Joseph and the Amazing Technicolor Dreamcoat\", and \"Got Tu Go Disco\".\nQuestion: Peter Murray Kapetan had a long career, not quite a 40 year career, since he retired ten years before he would be in the show business for the full 40 years True, False, or Neither?", "doc_id": 815, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23902, 21721, 9033, 22081], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state.\nQuestion: Sidalcea oregana is found in Texas. True, False, or Neither? False\n###\nMisty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue.\nQuestion: Misty Knight was read by George. True, False, or Neither? Neither\n###\n1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire.\nQuestion: The Australian version of the show is the second most popular version of the game show worldwide. True, False, or Neither? Neither\n###\nPenthouse is a 1933 American Pre-Code crime film starring Warner Baxter as a lawyer and Myrna Loy, as a call girl who helps him with a murder case. It was directed by W. S. Van Dyke and written by Frances Goodrich and Albert Hackett, based on a novel by Arthur Somers Roche. The film was later remade as the more sanitized \"Society Lawyer\" (1939), without the risqu\u00e9 pre-Code dialogue.\nQuestion: Penthouse is difficult to watch. True, False, or Neither? Neither\n###\nFranklin Martin Loew, DVM, PhD, (1939 in Syracuse, NY \u2013 2003 in Boston, MA) was president of Becker College, dean of the College of Veterinary Medicine at Cornell University and dean of Tufts University School of Veterinary Medicine (now Tufts Cummings School of Veterinary Medicine).\nQuestion: Franklin Martin Loew was born more than 1000 days ago. True, False, or Neither?", "doc_id": 705, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39480, 31948, 13651, 33172], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis.\nQuestion: Jose is from Columbus, OH, USA. True, False, or Neither? Neither\n###\nOrphan X is a 2016 thriller novel written by Gregg Hurwitz. 
It is the first in a five-book series of the same name from publisher Minotaur Books with the film rights belonging to Warner Bros. Bradley Cooper is likely to produce and possibly star the movie.\nQuestion: Bradley Cooper had intentions to direct the Orphan X thriller movie, but has since chosen to be a voice actor for it. True, False, or Neither? False\n###\nKlagenfurt am W\u00f6rthersee (] ; Slovene: \"Celovec ob Vrbskem jezeru\" , Italian: \"Clanforte\" , Friulian: \"Clanfurt\" ) is the capital of the federal state of Carinthia in Austria. With a population of 99,100, it is the sixth-largest city in the country. The city is the bishop's seat of the Roman Catholic Diocese of Gurk-Klagenfurt and home to the Alpen-Adria-Universit\u00e4t Klagenfurt.\nQuestion: Klagenfurt am W\u00f6rthersee has 99,100 tourists per year. True, False, or Neither? Neither\n###\nHaliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015.\nQuestion: Dantoro was not the first Emir True, False, or Neither? True\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\".\nQuestion: The Book of Job: A Journey into Parenthood was written by husband-and-wife writing duo Greg Fried and Lisa Lazarus from South Africa. They also co wrote \"Paradise\" and \"When in Broad Daylight I Open my Eyes\" True, False, or Neither?", "doc_id": 553, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28750, 21392, 5091, 45236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Doctor Neo Periwinkle Cortex (often referred to as Doctor Cortex, Neo Cortex, or simply Cortex) is a fictional character and the main antagonist of the \"Crash Bandicoot\" series. His name is a play on the term neocortex, an area of the brain. He has been the archenemy of Crash ever since his first appearance, in the game \"Crash Bandicoot\".\nQuestion: Doctor Neo Periwinkle Cortex will get his own video game on PC in 2020 True, False, or Neither? Neither\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017.\nQuestion: Justin Tinucci was born 25 days before Christmas True, False, or Neither? Neither\n###\nAlrifai is a Lebanese multinational nut retailing company headquartered in Beirut, Lebanon, and a wholly owned subsidiary of Alrifai International Holding Ltd. 
It is the largest nut retailing chain in the Middle East and the company with the biggest market share in Lebanon.\nQuestion: The company is a publicly traded company True, False, or Neither? Neither\n###\nPrince Louis Ferdinand Oskar Christian of Prussia (German: \"Louis Ferdinand Oskar Christian Prinz von Preu\u00dfen\"; 25 August 1944 \u2013 11 July 1977), also called Louis Ferdinand II or Louis Ferdinand Jr., nicknamed \"Lulu\", was a member of the House of Hohenzollern and the fifth of seven children of Prince Louis Ferdinand of Prussia and his wife, Grand Duchess Kira of Russia.\nQuestion: Louis Ferdinand II had seven siblings. True, False, or Neither? False\n###\nThe Tampa Bay Buccaneers season was the franchise's 39th season in the National Football League. It was also the first season under head coach Lovie Smith, replacing Greg Schiano, who was fired at the end of the 2013 season. It was also the first season under general manager Jason Licht, following the departure of Mark Dominik, after a disappointing 2013 season.\nQuestion: The Tampa Bay Buccaneers season has less than 30 seasons in the National Football League True, False, or Neither?", "doc_id": 766, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11156, 35306, 32881, 25417], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "William Lang Denholm \"Bill\" McCue OBE (1934\u20131999) was a Scottish singer known for his performances in opera, musical theatre and traditional Scottish folk music. In 1982 he was awarded an OBE for his contribution to Scottish music. In 1999 he died aged 65.\nQuestion: William Lang Denholm \"Bill\" McCue died in Scotland. True, False, or Neither? Neither\n###\nBarbro Martinsson (born 16 August 1935) is a former Swedish cross country skier who competed during the 1960s. Born in Valbo, she won two silver medals in the 3 x 5 km at the 1964 Winter Olympics and the 1968 Winter Olympics. Martinsson finished 4th in the 1968 Winter Olympics in both 5 km and 10 km.\nQuestion: She is now an American citizen. True, False, or Neither? Neither\n###\nRAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family.\nQuestion: The station was renamed in 1928 True, False, or Neither? True\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden.\nQuestion: The Kyrkog\u00e5rden Runestones are a French True, False, or Neither? 
False\n###\nMark Donovan (born 12 October 1968) is a Welsh character actor best known for his roles in productions such as \"Shaun of the Dead\", \"Black Books\", \"In Bruges\", and \"Murder Investigation Team\". He also played a brief scene of Hamlet in an episode of the David Renwick comedy-drama, \"Love Soup\". His stage roles include Gozark in \"Singin' in the Rain\" and Inspector Clay in \"Plan 9 from Outer Space\".\nQuestion: Donovan was born 20 years before he played Hamlet on stage. True, False, or Neither?", "doc_id": 397, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19267, 40448, 7200, 30094], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Man in a Hurry (French: \"L'Homme press\u00e9\" , Italian: \"L'ultimo giorno d'amore\" , released in UK as The Hurried Man) is a 1977 French-Italian drama film directed by \u00c9douard Molinaro and starring Alain Delon and Mireille Darc. It is based on the novel \"The Man in a Hurry\" by Paul Morand. It recorded admissions of 730,581 in France.\nQuestion: Man in a Hurry is a French-Italian drama film directed by \u00c9douard Molinaro was released in the nineteen seventies. True, False, or Neither? True\n###\nGeorge Joseph Maloof Jr. (born September 2, 1964) is an American entrepreneur and businessman. He is the former owner of the Sacramento Kings, the former owner of the now defunct Sacramento Monarchs, and minority owner of the Palms Casino Resort in Las Vegas with his brothers Gavin Maloof, Joe Maloof, Phil Maloof and sister Adrienne Maloof. He is part of the Maloof Family.\nQuestion: George Joseph Maloof Jr. was the head coach of the Sacramento Monarchs. True, False, or Neither? False\n###\nBeyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John LaZar, Michael Blodgett and David Gurian. The film was directed by Russ Meyer and co-written by Meyer and Roger Ebert.\nQuestion: 1970 is known for being the year that comes after 1969. True, False, or Neither? False\n###\nSanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\").\nQuestion: Sanation was created after the interwar period True, False, or Neither? False\n###\nLucas Franchoys the Younger or Lucas Franchoys II (alternative spellings of name: Lucas Franchois, Lucas Fran\u00e7ois, Louis Franchoys) (28 June 1616 in Mechelen \u2013 3 April 1681 in Mechelen) was a Flemish Baroque painter from Mechelen, who painted numerous altarpieces and portraits in a style reminiscent of Anthony van Dyck.\nQuestion: Francois lived over 60 years. 
True, False, or Neither?", "doc_id": 653, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20710, 22883, 23247, 30241], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph was born in 1821 and died in 1891 True, False, or Neither? True\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015.\nQuestion: The 1985 Nebraska Cornhuskers is a basketball team. True, False, or Neither? False\n###\nThe Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat.\nQuestion: The Mast\u00edn Espa\u00f1ol is not a tiny dog. True, False, or Neither? True\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: The San Nicolao Tunnel took 5 years to build True, False, or Neither? Neither\n###\nChandana Banerjee(born 1953) is an Indian actress, model and beauty queen. She was the winner of first edition of Femina Teen Princess. she represented India at International Teen Princess 1967 held in Chicago, Illinois on 1967 May 26 and was crowned 1st Runner Up there. After that she became a model in India. Prior to winning the pageant she was starred in Indian film \"Teen Kanya\".\nQuestion: Benerjee was 12 when she was at International Teen Princess. True, False, or Neither?", "doc_id": 464, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [45418, 5560, 7370, 3198], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bertrand Piccard (born 1 March 1958) is a Swiss psychiatrist and balloonist. 
Along with Brian Jones, he was the first to complete a non-stop balloon flight around the globe, in a balloon named Breitling Orbiter 3. He was the initiator, chairman, and co-pilot, with Andr\u00e9 Borschberg, of Solar Impulse, the first successful round-the-world solar powered flight.\nQuestion: Bertrand Piccard was born more than 1959 years ago. True, False, or Neither? False\n###\nGrowing Up is the first Korean-language studio album by South Korean singer-songwriter and actress IU. It was released on April 23, 2009, as a follow-up to her 2008 debut mini-album \"Lost and Found\". Two of the album's 16 tracks, \"Boo\" and \"You Know (\uc788\uc796\uc544) (Rock Ver.)\", were released as singles.\nQuestion: South Korean singer-songwriter and actress IU writes her own songs.\n True, False, or Neither? Neither\n###\nB&Q plc is a British multinational DIY and home improvement retailing company, headquartered in Eastleigh, England, United Kingdom and is a wholly owned subsidiary of Kingfisher plc. Founded by Richard Block and David Quayle in 1969 originally as Block & Quayle, the retail chain offers over 40,000 products across 300 stores and online.\nQuestion: B&Q plc is founded by Richard Block and Donald Trump True, False, or Neither? False\n###\nIdris Sultan (born January 1993) is a Tanzanian Actor and comedian, actor and radio host who won the Big Brother Africa-Hotshots in 2014. He hosts the biggest comedy news show called SIO HABARI, he also hosts a radio show called MWB(Mji wa burudani) on ChoiceFm Tanzania.\nQuestion: Idris Sultan is an avid Big Brother fan. True, False, or Neither? Neither\n###\nJohn Wellborn Root (January 10, 1850 \u2013 January 15, 1891) was an American architect who was based in Chicago with Daniel Burnham. He was one of the founders of the Chicago School style. Two of his buildings have been designated a National Historic Landmark; others have been designated Chicago landmarks and listed on the National Register of Historic Places. In 1958, he received the AIA Gold Medal.\nQuestion: John Wellborn Root won a medal in 1957. True, False, or Neither?", "doc_id": 778, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41741, 5960, 27544, 25422], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Barry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions.\nQuestion: Barry and Stuart are both BAFFTA nominated magicians and comedians. True, False, or Neither? True\n###\nColville Lake is the 20th largest lake in Canada's Northwest Territories. The lake is located 100\u00a0km (62\u00a0mi) northwest of Great Bear Lake in the Sahtu Region. 
The lake has a perimeter of 121\u00a0km (75\u00a0mi) and a net area of 416\u00a0km\u00b2 (161 sq mi) and a total area of 439\u00a0km\u00b2 (169 sq mi).\nQuestion: Colville Lake is a lake. True, False, or Neither? True\n###\nThe 2017 Macanese general election took place on 17 September 2017 according to the provisions of the Basic Law of Macau. Out of a total of 33 seats, 14 were directly elected by universal suffrage under the highest averages method, while 12 were voted on from the Functional constituency, and 7 from nomination by the Chief Executive.\nQuestion: The 2017 Macanese general election was viewed as a success True, False, or Neither? Neither\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden.\nQuestion: The Kyrkog\u00e5rden Runestones are a Canadian monument. True, False, or Neither? False\n###\nKnowledgeWare was a software company headquartered in Atlanta, Georgia co-founded by James Martin and run by Fran Tarkenton. It produced a Computer Aided Software Engineering (CASE) tool called IEW (Information Engineering Workbench). KnowledgeWare was sold to Sterling Software in 1994, which was in its turn acquired by Computer Associates.\nQuestion: KnowledgeWare is in the northern hemisphere True, False, or Neither?", "doc_id": 817, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25747, 43601, 18794, 6924], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "NBA 2K9 is a basketball simulation video game developed by Visual Concepts and published by 2K Sports. It is the tenth installment in the \"NBA 2K\" franchise and the successor to \"NBA 2K8\". It was released in 2008 for PlayStation 2, PlayStation 3, Xbox 360, and Microsoft Windows. Kevin Garnett is the cover athlete of the game. \"NBA 2K9\" is the predecessor to \"NBA 2K10\" in the \"NBA 2K\" series.\nQuestion: You were able to play NBA 2K9 on pc True, False, or Neither? True\n###\nMore of Tom Lehrer was the second studio album recorded by musical satirist Tom Lehrer. The LP contains the same songs (in the same sequence) as the live album \"An Evening Wasted with Tom Lehrer\", which was recorded and released earlier in the same year. The album was recorded and mixed in a single three-hour session at the RCA Studios in New York on July 8, 1959.\nQuestion: \"More of Tom Lehrer\" is the studio version of the live album, \"An evening wasted with Tom Lehrer\". True, False, or Neither? True\n###\nThe Mercedes-Benz W221 is a chassis code of S-Class, the successor of the Mercedes-Benz S-Class (W220) and the predecessor of the Mercedes-Benz S-Class (W222). The S-Class are the flagship vehicles of Mercedes-Benz and each generation typically introduces a range of technical innovations and developments that over time will find their way into smaller cars.\nQuestion: The Mercedes-Benz is a very good car True, False, or Neither? 
Neither\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label.\nQuestion: Lance King was in many bands True, False, or Neither? True\n###\nDaphniphyllum is the sole genus in the flowering plant family Daphniphyllaceae and was described as a genus in 1826. The genus includes evergreen shrubs and trees mainly native to east and southeast Asia, but also found in the Indian Subcontinent and New Guinea.\nQuestion: the plant is only is asia True, False, or Neither?", "doc_id": 283, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29862, 32201, 26705, 28523], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "West Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records.\nQuestion: West Coast hip hop has been played by iron maiden True, False, or Neither? Neither\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: There are over 25 different mint species to yet be given a name and identified. True, False, or Neither? Neither\n###\nVarun Sharma is an Indian actor who made his debut in Farhan Akhtar's 2013 film production \"Fukrey\", which was a surprise hit in Bollywood. Since his appearance in \"Fukrey\", he has appeared in other comedy films, such as \"Kis Kisko Pyaar Karoon\" and \"Dilwale\" etc\nQuestion: Varun Sharma is from the subcontinent True, False, or Neither? True\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title.\nQuestion: A woman won the single title. True, False, or Neither? True\n###\n\"I Never Picked Cotton\" is a song made famous by country music singer Roy Clark. Written by Bobby George and Charles Williams, the song was released in 1970 as the title track to the album released that same year. The song peaked at No. 
5 on the \"Billboard magazine\" Hot Country Singles chart that summer.\nQuestion: Clark, George, and Williams are all responsible for the song having become such a success True, False, or Neither?", "doc_id": 359, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44097, 33120, 36230, 30244], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dead to Rights II is a third-person action video game, developed by Widescreen Games, published by Namco, and released in 2005. Serving as a prequel to \"Dead to Rights\", it begins with the story of Jack Slate and Shadow before the events of the original game. A PSP prequel, \"\", released on June 28, 2005.\nQuestion: Dead to Rights II was released in the summer. True, False, or Neither? Neither\n###\nPillars of Eternity: The White March is a two-part expansion pack for the 2015 role-playing video game \"Pillars of Eternity\", developed by Obsidian Entertainment and published by Paradox Interactive. The first part was released on August 25, 2015, while the second was released on February 16, 2016.\nQuestion: There was less than a year between the Pillars of Eternity releases. True, False, or Neither? True\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games.\nQuestion: In their 9th season, the Seattle SuperSonics did not finish in first place in the Western Conference. True, False, or Neither? True\n###\nThe Castaways Hotel and Casino, formerly the Showboat Hotel and Casino was a hotel and casino located at the north end of the Boulder Strip in Las Vegas, Nevada. The hotel consisted of a 19 story tower containing 445 rooms, a casino and an adjacent RV park. The Castaways hotel was demolished on January 11, 2006 to make way for an unknown project.\nQuestion: The Castaways Hotel and Casino has been visited by Bush. True, False, or Neither? Neither\n###\nVan Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor.\nQuestion: Van Cleef & Arpels was favoured by royalty True, False, or Neither?", "doc_id": 940, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8351, 25093, 6314, 15646], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets.\nQuestion: Rodrequis La'Vant Stephens used to play baseball in high school. True, False, or Neither? Neither\n###\nNuestra Belleza Nuevo Le\u00f3n 2007, was held at Las Lomas Eventos in Monterrey, Nuevo Le\u00f3n on July 25, 2007. At the conclusion of the final night of competition, Anagabriela Espinoza of San Pedro Garza Garc\u00eda was crowned the winner. Espinoza was crowned by outgoing Nuestra Belleza Nuevo Le\u00f3n titleholder, Mariana Lombard. Eight contestants competed for the state title.\nQuestion: Nuestra Belleza Nuevo Le\u00f3n 2007 had 10 contestants True, False, or Neither? False\n###\nAmandil is a fictional character from J.R.R. Tolkien's Middle-earth legendarium. Amandil was a Lord of And\u00fani\u00eb, succeeding his father N\u00famendil upon his death. Amandil is most noted for being the father of Elendil, founder of the N\u00famen\u00f3rean Realms in Exile.\nQuestion: Amandil is the father of Numendil. True, False, or Neither? False\n###\nJesco White, also known as the \"Dancing Outlaw\" (born July 30, 1956) is an American folk dancer and entertainer. He is best known as the subject of three American documentary films that detail his desire to follow in his famous father's footsteps while dealing with depression, drug addiction, alcoholism, and the poverty that permeates much of rural Appalachia.\nQuestion: White's work is influenced by cowboy films. True, False, or Neither? Neither\n###\nJoseph Eppele (born August 12, 1987) is a professional Canadian football offensive lineman for the Ottawa Redblacks of the Canadian Football League. He was drafted second overall by the Toronto Argonauts in the 2010 CFL Draft, being the first offensive lineman taken while being ranked fifth overall by the CFL's Amateur Scouting Bureau. He played college football for the Washington State Cougars.\nQuestion: Joseph Eppele was born on a sunny day on august 12, 2010 True, False, or Neither?", "doc_id": 381, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4123, 39966, 11626, 19548], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Real Howard Spitz is a 1998 family comedy film directed by Vadim Jean, produced by Paul Brooks and written by Jurgen Wolff. Starring Kelsey Grammer, Amanda Donohoe and Genevieve Tessier, it is a Canadian and U.K co-production. A failed detective writer, Howard Spitz has hit rock bottom until an 8-year-old girl helps him write children's books.\nQuestion: Vadim Jean began directing films in 1998. True, False, or Neither? 
Neither\n###\n\"Uh Huh\" is the first single by R&B group B2K, from their self-titled debut album. The song was released in July 2001 and it peaked at number 37 on the \"Billboard\" Hot 100 and number 20 on the Hot R&B/Hip-Hop Songs. It also peaked at number 35 in the UK on its first entry and reached a new peak at number 31 on a re-release.\nQuestion: Released in 2001 the song Uh Huh was the only single to date released by B2K. True, False, or Neither? Neither\n###\nIslamic rule govenrned the southern part of the Iberian peninsula for seven hundred years. In medieval history, \"al-Andalus\" (Arabic: \u0627\u0644\u0623\u0646\u062f\u0644\u0633\u200e \u200e ) was the name given to the parts of the Iberian Peninsula and Septimania governed by Arab and North African Muslims (given the generic name of Moors), at various times in the period between 711 and 1492.\nQuestion: Islamic laws governed the southern portion of the Iberian peninsula for six hundred years. True, False, or Neither? False\n###\nJustin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL).\nQuestion: Justin Smith was born in 1983 True, False, or Neither? False\n###\nHighly Illogical is an album which contains a collection of songs performed by \"Star Trek\" actor Leonard Nimoy. Most of the songs were originally recorded in the 1960s. The collection includes \"The Ballad of Bilbo Baggins\", which tells the story of J.R.R. Tolkien's book \"The Hobbit\", and has been immortalized by being included on various novelty compilations over the years.\nQuestion: The songs are from famous movies and books. True, False, or Neither?", "doc_id": 669, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8560, 26019, 452, 18648], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Stillwater Cove Regional Park is maintained by a parks department. True, False, or Neither? True\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\"\nQuestion: People really like using Flatbush Avenue\n True, False, or Neither? Neither\n###\nH\u00e9ctor Canziani was an Argentine poet, screenwriter and film director who worked in Argentine cinema in the 1940s and 1950s. 
Although his work was most abundant in screenwriting and poetry after his brief film career, he is best known for his directorship and production of the 1950 tango dancing film Al Comp\u00e1s de tu Mentira based on a play by Oscar Wilde.\nQuestion: Canziani was of Argentine descent. True, False, or Neither? True\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day.\nQuestion: Tunnel Vision is a debut novel. True, False, or Neither? True\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar).\nQuestion: Mike D, MCA, and Ad-Rock were the only founders of the Beastie Boys. True, False, or Neither?", "doc_id": 627, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28008, 25945, 41195, 6193], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: Juan Cruz was 22 when he appeared at the 2017 US open. True, False, or Neither? True\n###\nFriday: The Animated Series was a short-lived animated television series based on the \"Friday\" film series. The show is directed by Kevin Lofton and is co-produced and co-distributed by New Line Television, a subsidiary of New Line Cinema (the distributors of the \"Friday\" movies), MTV2, and Ice Cube's Cubevision. The series only lasted for 8 episodes.\nQuestion: The series did not achieve hundreds of episodes. True, False, or Neither? True\n###\nThe Diawling National Park lies in south west Mauritania around the Senegal River delta. During the rainy season, much of the park consists of large lakes. It is known for having over 220 species of identified birds, including pelicans, black storks, and flamingos, and also for its fish.\nQuestion: Large lakes cover the park for the majority of the seasons. True, False, or Neither? Neither\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners.\nQuestion: Karan Johar's was in mumbai on february 20 1999 True, False, or Neither? 
Neither\n###\nBen Barzman (October 12, 1910 \u2013 December 15, 1989) was a Canadian journalist, screenwriter, and novelist, blacklisted during the McCarthy Era and best known for his screenplays for the films \"Back to Bataan\" (1945), \"El Cid\" (1961), and \"The Blue Max\" (1966).\nQuestion: Ben Barzman was born more than 1989 years ago. True, False, or Neither?", "doc_id": 848, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44575, 9070, 44098, 31914], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode.\nQuestion: \"Fight or Flight\" starred Hayden Panettiere True, False, or Neither? Neither\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central.\nQuestion: The Daily Show is a massive hit. True, False, or Neither? Neither\n###\nO'Donnell High School is a 1A high school located in O'Donnell, Texas (USA). It is part of the O'Donnell Independent School District located in southeast Lynn County. In 2011, the school was rated \"Academically Acceptable\" by the Texas Education Agency.\nQuestion: O'Donnell Independent School District was rated \"Academically Acceptable\" in 2011 True, False, or Neither? Neither\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949.\nQuestion: CUHK is a research university True, False, or Neither? True\n###\nThe Brandon Learning Centre is the first school in Hong Kong to offer public speaking classes based around English Speaking Board assessments. The English Speaking Board was founded in 1954 and the qualifications are regulated by the UK Office of Qualifications and Examinations Regulation \nQuestion: The Brandon Learning Centre wanted to institute Taco Tuesday at the center, but it was vetoed by the founders True, False, or Neither?", "doc_id": 710, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18085, 7920, 15516, 32566], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state.\nQuestion: Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a rare plant that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, US. The plant is on the Endangered species list and is the rarest known plant in Washington state. True, False, or Neither? True\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee.\nQuestion: Destiny was not nominated for an Oscar award. True, False, or Neither? True\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer.\nQuestion: Gotz was a German diplomat so he was well-liked. True, False, or Neither? Neither\n###\nResil B. Mojares is a Filipino ambassador, historian, and critic of Philippine literature. He has a Ph.D. in Literature from the University of the Philippines, Diliman. A retired Professor at the University of San Carlos (USC) in Cebu City, he was a founding director (1975\u201396) of USC's Cebuano Studies Center, a pioneering local studies center in the Philippines.\nQuestion: Resil B. Mojares will run for President in 2020 True, False, or Neither? Neither\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock.\nQuestion: Power chords are played in more than amplified guitars True, False, or Neither?", "doc_id": 507, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26619, 23127, 6452, 6096], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. 
It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming.\nQuestion: Cape Vakop was chartered over 36 years ago True, False, or Neither? True\n###\nThe 1999 Acura Classic \u2013 Doubles was the doubles event of the twentieth edition of the third tournament in the US Open Series. Martina Hingis and Natasha Zvereva were the defending champions but Hingis did not compete this year. Zvereva played with Mary Pierce, and they were defeated in the first time by Cara Black and Irina Selyutina.\nQuestion: Some participants cheated in the event True, False, or Neither? Neither\n###\nCon Stough is Professor of Cognitive Neuroscience and Psychology at Swinburne University of Technology, Australia, director of the Swinburne Centre for Neuropsychology and director of the newly formed National Institute of Complementary Medicine (NICM) Collaborative Centre for the study of herbal and natural medicines for neurocognition.\nQuestion: Stough is not a professor. True, False, or Neither? False\n###\nMaya & Marty was an American television variety show that premiered on NBC on May 31, 2016 and lasted one season. The series was co-hosted by comedians Maya Rudolph and Martin Short, and produced by Lorne Michaels. The show features various comedy sketches, musical performances, and celebrity guests.\nQuestion: The show was produced by Lorne Michaels. True, False, or Neither? True\n###\nDicksonia youngiae, common name bristly tree fern, is a fern that comes from cool, sheltered rainforests in New South Wales and Queensland, Australia. It is found north of the Bellinger River, in New South Wales, and can be seen in the wild at Nightcap National Park.\nQuestion: Dicksonia youngiae can only be seen in the wild at Nightcap National Park. True, False, or Neither?", "doc_id": 615, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14815, 20575, 38067, 43096], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Leberecht Maass (or Maa\u00df) (24 November 1863 \u2013 28 August 1914) was the \"Konteradmiral\" who commanded the German naval forces at the first Battle of Heligoland Bight. He lost his life when his flagship, the light cruiser SMS \"C\u00f6ln\" , was sunk by British battlecruisers commanded by Vice Admiral David Beatty.\nQuestion: Leberecht Maass passed away in the winter of 1914. True, False, or Neither? False\n###\nShould the World Fail to Fall Apart is the first album by the British solo artist Peter Murphy, formerly of the gothic rock band Bauhaus. The album contains Murphy's covers of Magazine's \"The Light Pours Out of Me\" and Pere Ubu's \"Final Solution.\" It was released in 1986.\nQuestion: Peter Murphy covered some songs by Magazine. True, False, or Neither? True\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. 
The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: Nashville West made girls go crazy. True, False, or Neither? Neither\n###\nBrookpark is a station on the RTA Red Line located on the borders of Brook Park and Cleveland, Ohio, USA. It is located along Brookpark Road (Ohio State Route 17), west of the intersection of Henry Ford Boulevard (Ohio State Route 291) and east of the intersection of the Berea Freeway (Ohio State Route 237).\nQuestion: Brookpark is in Cleveland True, False, or Neither? True\n###\nSan Francisco Bay Ferry is a passenger ferry service on the San Francisco Bay, administered by the San Francisco Bay Area Water Emergency Transportation Authority (WETA). San Francisco Bay Ferry is not affiliated with Golden Gate Ferry, which provides passenger ferry service to Marin County.\nQuestion: Newlyweds: Nick and Jessica ended because Nick and Jessica got divorced. True, False, or Neither?", "doc_id": 985, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9629, 5286, 15338, 8299], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968.\nQuestion: The Mets had a winning season every year between 1968 and 1974. True, False, or Neither? False\n###\nAlexander Vincent LoScialpo (born April 29, 1981) is an American actor. He is known for his role as Andy Barclay in the \"Child's Play\" franchise. He has appeared in \"Child's Play\" (1988), \"Child's Play 2\" (1990), \"Curse of Chucky\" (2013), and \"Cult of Chucky\" (2017).\nQuestion: Alexander Vincent LoScialpo is an actor. True, False, or Neither? True\n###\nCoraz\u00f3n Valiente (\"Fearless Heart\"), originally known as \"Ca\u00eddas del Cielo\", is a Spanish-language telenovela produced by United States-based television network Telemundo Studios, Miami, featuring an ensemble cast. Adriana Fonseca, Ximena Duque, Jos\u00e9 Luis Res\u00e9ndez and Fabi\u00e1n R\u00edos starred as the main protagonists, with Aylin Mujica and Manuel Landeta starred as the main antagonists.\nQuestion: Coraz\u00f3n Valiente is a top ten show True, False, or Neither? Neither\n###\nThree Preludes is a ballet made for Mikhail Baryshnikov by Mark Morris to eponymous music by George Gershwin for his own company and presented as a piece d'occasion by the New York City Ballet. The performance took place June 16, 1992, at the New York State Theater, Lincoln Center.\nQuestion: Three Preludes was performed at other theaters True, False, or Neither? 
Neither\n###\nLaura Ellen Ziskin (March 3, 1950 \u2013 June 12, 2011) was an American film producer, known as the executive producer of the 1990 romantic comedy \"Pretty Woman\", and as the first woman to produce the Academy Awards telecast alone, producing the 74th Academy Awards in 2002 and the 79th Academy Awards in 2007.\nQuestion: Laura Ellen Ziskin is the zodiac killer. True, False, or Neither?", "doc_id": 161, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6395, 44006, 1424, 5730], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "State Route 360 (SR 360) is a state highway in the southern portion of Mineral County, Nevada, United States. The route connects the former town of Basalt to the rest of Mineral County. A road has been in the place of SR 360 since 1919, and became State Route 10 by 1929.\nQuestion: SR 10 was widened in the 1920's True, False, or Neither? Neither\n###\nThe 1986\u201387 St. John's Redmen basketball team represented St. John's University during the 1986\u201387 NCAA Division I men's basketball season. The team was coached by Lou Carnesecca in his nineteenth year at the school. St. John's home games are played at Alumni Hall and Madison Square Garden and the team is a member of the Big East Conference.\nQuestion: Lou Carnesecc coached in the 60s True, False, or Neither? True\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006.\nQuestion: the 71st episode of \"The Sopranos\" was the last episode to be written by Terence Winter. True, False, or Neither? Neither\n###\nThe Communaut\u00e9 de communes des Trois Rivi\u00e8res (before January 2017: \"Communaut\u00e9 de communes du Pays des Trois Rivi\u00e8res\") is a federation of municipalities (\"communaut\u00e9 de communes\") in the Aisne \"d\u00e9partement\" and in the Hauts-de-France \"region\" of France.\nQuestion: Hauts de France is a small area in France True, False, or Neither? Neither\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr.\nQuestion: Frank Harvey Jnr. wrote Things Happen at Night . True, False, or Neither?", "doc_id": 4, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2502, 32136, 25239, 30195], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Giovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg.\nQuestion: Giovanni Ferrero's net worth is greater than that of Bill Gates. True, False, or Neither? Neither\n###\nStudies in Mutualist Political Economy is a book on political economy published on 2007 by American mutualist anarchist Kevin Carson. In its preface Carson describes this work as \"an attempt to revive individualist anarchist political economy, to incorporate the useful developments of the last hundred years, and to make it relevant to the problems of the twenty-first century.\"\nQuestion: Carson also relates this to the successes of the 21st century. True, False, or Neither? Neither\n###\nInferno (also released with the title, Operation Cobra) is a 1997 feature film directed by Fred Olen Ray starring Don Wilson, Deepti Bhatnagar and R. Madhavan. Evan Lurie, Michael Cavanaugh and Tan\u00e9 McClure appear in other pivotal roles. Wilson plays the role of Interpol agent Kyle Connors on a mission set in India.\nQuestion: Inferno was released before 1990 True, False, or Neither? False\n###\nSavoy Brown, originally known as the Savoy Brown Blues Band, are an English blues rock band formed in Battersea, south west London in 1965. Part of the late 1960s blues rock movement, Savoy Brown primarily achieved success in the United States, where they promoted their albums with non-stop touring.\nQuestion: Savoy Brown was created in the 1950s True, False, or Neither? False\n###\nA Day with Wilbur Robinson is a 1990 children's picture book (slightly expanded for a 2006 reissue) written and illustrated by William Joyce. A film adaptation called \"Meet the Robinsons\" was released by Walt Disney Pictures in 2007 in the United States.\nQuestion: Walt Disney Pictures releases children's movies True, False, or Neither?", "doc_id": 299, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9799, 43278, 24516, 5272], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robert Mills Delaney, sometimes incorrectly spelled Delany (1903-1956) was an American composer. He studied with Nadia Boulanger and Arthur Honegger in Paris, and was best known for his 1928 choral symphony, John Brown's Song, based on Stephen Benet's Pulitzer Prize winning poem \"John Brown's Body\".\nQuestion: Robert Delaney studied in Rome. True, False, or Neither? 
False\n###\nThe first season of Survival Audition K-pop Star (Korean: \uc11c\ubc14\uc774\ubc8c \uc624\ub514\uc158 K\ud31d \uc2a4\ud0c0 ) premiered on December 4, 2011, airing every Sunday evening at 6:30 pm under the \"Good Sunday\" programming block on SBS, until April 29, 2012. The first winner was Park Ji-min, who chose to sign with JYP Entertainment.\nQuestion: JYP Entertainment did not have any albums released by Park Ji-min in 2011. True, False, or Neither? True\n###\nThe Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s.\nQuestion: The Final Blow won many awards. True, False, or Neither? Neither\n###\nAdrienne Maloof (born September 4, 1961) is an American businesswoman, television personality, shoe designer and co-owner of the various business holdings of Maloof Companies, which include a 2% stake in the Palms Casino Resort in Las Vegas, Nevada; Maloof Productions, Maloof Music and the annual Maloof Money Cup skateboarding event.\nQuestion: Maloof is an American. True, False, or Neither? True\n###\nClub Deportivo Utiel is a football team based in Utiel in the autonomous community of Valencian Community. Founded in 1945, the team plays in Tercera Divisi\u00f3n \u2013 Group 6. The club's home ground is \"La Celadilla\", which has a capacity of 1,500 spectators.\nQuestion: A Tercera Divisi\u00f3n team has 1,500 spectators. True, False, or Neither?", "doc_id": 149, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15063, 20300, 26029, 6367], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel.\nQuestion: West Texas Wildcatters were established May 20, 1988. True, False, or Neither? Neither\n###\nCaroline Quentin (born Caroline Jones; 11 July 1960) is an English actress. Quentin became known for her television appearances: portraying Dorothy in \"Men Behaving Badly\" (1992\u20131998), Maddie Magellan in \"Jonathan Creek\" (1997\u20132000), and DCI Janine Lewis in \"Blue Murder\" (2003\u20132009).\nQuestion: Caroline Quentin lived at least 49 years. True, False, or Neither? True\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\"\nQuestion: People really like using Flatbush Avenue to get out of queens\n True, False, or Neither? 
Neither\n###\nSwinburne Online is the online arm of Swinburne University of Technology which is an Australian university based in Melbourne, Victoria. Swinburne Online was founded in 2011 after a 50-50 joint venture between Swinburne University of Technology and SEEK Learning seeking to capitalise on increasing demand for off-campus education.\nQuestion: Swineburne Online was not created in the USA. True, False, or Neither? True\n###\nTight is the debut album by the American rock band Mindless Self Indulgence. The album was originally released on April 20, 1999 through Uppity Cracker Recording Group. After having been out of print for many years, the album was reissued as Tighter on April 26, 2011 through The End Records. The reissue features updated artwork and packaging, 12 previously unreleased tracks, and a bonus DVD.\nQuestion: Tight was originally released more than 1999 months ago. True, False, or Neither?", "doc_id": 701, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21197, 42832, 116, 37133], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Justin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL).\nQuestion: Justin was always a utility player True, False, or Neither? Neither\n###\nThe Vorontsov Lighthouse (Ukrainian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u044c\u043a\u0438\u0439 \u043c\u0430\u044f\u043a , Russian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u043a\u0438\u0439 \u043c\u0430\u044f\u043a ) is a famous red-and-white, 27.2 metre landmark in the Black Sea port of Odessa, Ukraine. It is named after Prince Mikhail Semyonovich Vorontsov, one of the governors-general of the Odessa region.\nQuestion: Sailors on the Black Sea needed a light house to guide them. True, False, or Neither? Neither\n###\nGlaiza Herradura-Agullo (born February 24, 1978) is a Filipino former child actress. She was the first-ever grand winner of the Little Miss Philippines segment of \"Eat Bulaga!\" in 1984. She starred in RPN-9's television series \"Heredero\" with Manilyn Reynes and Richard Arellano. She won the 1988 FAMAS Best Child Actress award for her role in \"Batas Sa Aking Kamay\" starring Fernando Poe, Jr..\nQuestion: Herradura-Agullo was born in the 80's True, False, or Neither? False\n###\nThe following details notable events from the year 2005 in Northern Ireland. Northern Ireland is a part of the United Kingdom in the north-east of the island of Ireland. It is variously described as a country, province or region of the UK, amongst other terms. Northern Ireland shares a border with the Republic of Ireland to the south and west.\nQuestion: Northern Ireland is a great country were Derry Girsl is set True, False, or Neither? Neither\n###\nMichael Shane Hollis (born May 22, 1972) is a former professional American football placekicker in the National Football League. 
He spent most of his nine-year professional career with the Jacksonville Jaguars, kicking for the team from 1995\u20132001 and setting several team records. He then played for the Buffalo Bills and New York Giants before retiring after an injury in 2003.\nQuestion: Michael Shane Hollis was born with a completely different name. True, False, or Neither?", "doc_id": 58, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38520, 25980, 18663, 454], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic.\nQuestion: Lausche experiences a lot of snow on the mountain which makes it hard to climb. True, False, or Neither? Neither\n###\n\"The Bear and the Maiden Fair\" is the seventh episode of the third season of HBO's fantasy television series \"Game of Thrones\", and the 27th episode of the series overall. The episode was written by George R. R. Martin, the author of the \"A Song of Ice and Fire\" novels on which the series is based, and was directed by Michelle MacLaren, her directorial debut for the series.\nQuestion: The Bear and the Maiden Fair was written after the book True, False, or Neither? Neither\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day.\nQuestion: Tunnel Vision is an debut american novel by author Keith Lowe from the 21st century. True, False, or Neither? True\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart.\nQuestion: Look at my Dab was higher than 90 on the Billboard Top 100 True, False, or Neither? True\n###\nThe Prague Skate (sometimes titled Golden Skate; from 1994: Czech Skate) is an international figure skating competition. It was a senior event from the 1960s to 1997, usually held in November or December in Prague. Medals were awarded in the disciplines of men's singles, ladies' singles, and pair skating. Since 1999, it is organized in some years as part of the ISU Junior Grand Prix series.\nQuestion: The Prague Skate was a senior event for fifty years. 
True, False, or Neither?", "doc_id": 466, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28791, 28047, 9985, 41561], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The cosmolabe was an ancient astronomical instrument resembling the astrolabe, formerly used for measuring the angles between heavenly bodies. It is also called pantacosm. Jacques Besson also uses this name, or universal instrument, for his invention described in \"Le cosmolabe\" (1567), which could be used for astrometry, cartography, navigation, and surveying.\nQuestion: Le cosmolabe was published more than 100 years ago. True, False, or Neither? True\n###\nGreivis Josu\u00e9 V\u00e1squez Rodr\u00edguez (born January 16, 1987) is a Venezuelan professional basketball player who last played for the Brooklyn Nets of the National Basketball Association (NBA). He was drafted in 2010 after a U.S. college career with the University of Maryland men's basketball team. V\u00e1squez finished second on the Terrapins' all-time scoring list, with 2,171 career points.\nQuestion: Greivis only stays in the northern U.S.. True, False, or Neither? Neither\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide.\nQuestion: Mutual friends is a tv series that explores the lives of a group of friends dealing with bereavement. True, False, or Neither? True\n###\nSt Mary Magdalene's Church is a Roman Catholic Parish church in Bexhill-on-Sea, East Sussex, England. It was founded in 1893 and built in 1907 in the Gothic Revival style. It is situated on the corner of Sea Road and Magdalen Road opposite Station Road and Bexhill railway station in the centre of the town. It was designed by Arthur Young and is a Grade II listed building.\nQuestion: It was built over 10 years after being founded True, False, or Neither? True\n###\nKeith Martin (1969 or 1970 \u2013 December 5, 2014), one of the world heaviest lived people, was famous for being at one point the UK\u2019s heaviest man, weighing approximately 980 lbs at his peak. Keith Martin was given a gastric bypass operation by the NHS, and had lost over 50% of his body weight.\nQuestion: He was 500 pounds in 2002 True, False, or Neither?", "doc_id": 743, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34969, 27826, 42453, 13181], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\".\nQuestion: Dostluk Spor Kul\u00fcb\u00fc is funded by the government True, False, or Neither? Neither\n###\nThe Kingfisher Ultra Indian Derby, or simply the Indian Derby, is an Indian annual Thoroughbred horse race. It is a 2,400-metre race held on the first Sunday of February on the Mahalaxmi Racecourse in Mumbai and is one of the premier sporting activities in the city.\nQuestion: Over two dozen horses race in The Kingfisher Ultra Indian Derby True, False, or Neither? Neither\n###\nLaura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture.\nQuestion: Laura Warholic is also called Sex for Dummies True, False, or Neither? Neither\n###\nDonald Clark \"Donny\" Osmond (born December 9, 1957) is an American singer, actor, radio personality, and former teen idol. Osmond has also been a talk and game show host, record producer and author. In the mid-1960s, he and four of his elder brothers gained fame as the Osmonds. Osmond went solo in the early 1970s, covering such hits as \"Go Away Little Girl\" and \"Puppy Love\".\nQuestion: Donny's highest paying job was when he was a singer True, False, or Neither? Neither\n###\nAmerican Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company.\nQuestion: American Motors Incorporated (AMI) is a canadian company True, False, or Neither?", "doc_id": 730, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17011, 32957, 28919, 2064], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy.\nQuestion: The Toronto FC made the post season in 2012 True, False, or Neither? 
False\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate.\nQuestion: The bar will no longer be made. True, False, or Neither? Neither\n###\nFrederick Wiseman (born January 1, 1930) is an American filmmaker, documentarian, and theatre director. His work is \"devoted primarily to exploring American institutions\". He has been called \"one of the most important and original filmmakers working today\".\nQuestion: Frederick Wiseman started documeting before theatre True, False, or Neither? Neither\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird.\nQuestion: The largest living bird is found in Australia. True, False, or Neither? False\n###\nPolarbr\u00f6d is a Swedish bread company. Their head office is in \u00c4lvsbyn in northern Sweden. Polarbr\u00f6d is Sweden's third-largest bread company. Its typical product is a soft compact bread formed into round, flat shapes. It is also noted for ready-made sandwiches produced from such bread and reindeer meat, which was introduced as a product in the 1960s under the name \"renkl\u00e4mma\".\nQuestion: It is also noted for ready-made sandwiches produced from such meat and reindeer bread True, False, or Neither?", "doc_id": 843, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4349, 23980, 24756, 1993], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: The 2002 Indian vice presidential election was held in 2002 True, False, or Neither? True\n###\n...In Black and White is the 12th studio album by American country artist Barbara Mandrell. The album was released in April 1982 on MCA Records and was produced by Tom Collins. It was Barbara Mandrell's first studio album in two years since the release of \"Love Is Fair\".\nQuestion: All of Barbara Mandrell's albums are on MCA records True, False, or Neither? Neither\n###\nClaus Biederstaedt (born 28 June 1928 in Stargard, today Poland) is a German actor and voice actor. He studied in Hamburg and began his career working with Joseph Offenbach. Among the actors for whom he has dubbed have been Yves Montand, Peter Falk, Marlon Brando, Vittorio Gassman, and James Garner.\nQuestion: Claus Biederstaedt was born in a country that underwent a name change. True, False, or Neither? True\n###\nCurzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. 
The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South .\nQuestion: The club has always been known as Oldham Curzon Ladies Football Club. True, False, or Neither? False\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Nathan MacKinnon was only a forward True, False, or Neither?", "doc_id": 274, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17110, 22293, 26, 4812], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 198th Infantry Brigade, was first formed as part of the United States Army Reserve's 99th Division. It was active from 1967 through 1971 and has been active since 2007 as an Infantry Training Brigade as part of the US Army Infantry School at Fort Benning, Georgia.\nQuestion: The 198th Infantry Brigade is made up of mostly young people. True, False, or Neither? Neither\n###\nThe Puerto Rico Baseball Academy and High School (PRBAHS) is a non-profit organization combining academics and sports programs into one curriculum. Its goal is to prepare its students for higher education, competitive college scholarship opportunities, and the Major League Baseball Draft. The PRBAHS is the only high school in Puerto Rico or the United States with this type of learning environment.\nQuestion: The PRBAHS helps prepare students for competitive college scholarship opportunities. True, False, or Neither? True\n###\nCherry Tomato () is a 2008 South Korean film starring Shin Goo and Kim Hyang-gi. The family drama, a directorial debut by Jung Young-bae, depicts the poverty-stricken life of an old man and his granddaughter that evokes a strong sense of sympathy and helplessness. It was screened at the Busan Children\u2019s Film Festival in 2008.\nQuestion: Cherry Tomato starred mostly Korean actors True, False, or Neither? Neither\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances.\nQuestion: Spaceballs was the first comedy directed by Mel Brooks. True, False, or Neither? Neither\n###\nCon Stough is Professor of Cognitive Neuroscience and Psychology at Swinburne University of Technology, Australia, director of the Swinburne Centre for Neuropsychology and director of the newly formed National Institute of Complementary Medicine (NICM) Collaborative Centre for the study of herbal and natural medicines for neurocognition.\nQuestion: Stough was a professor in Austria. 
True, False, or Neither?", "doc_id": 946, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7002, 33462, 1787, 30347], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mosiula Mea'alofa \"Lofa\" Tatupu (born November 15, 1982) is a former American football linebacker who played six seasons in the National Football League (NFL). He was an assistant linebackers coach with the Seattle Seahawks. He played college football for the University of Southern California (USC). Tatupu was drafted by the Seattle Seahawks in the second round of the 2005 NFL Draft.\nQuestion: Tatupu played 6 consecutive years in the NFL. True, False, or Neither? Neither\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams.\nQuestion: The Gaming Control Act was passed at least 100 days ago True, False, or Neither? True\n###\nHidden City Entertainment was a game publisher founded in 2004 (as Hidden City Games, Inc.) by Jesper Myrfors and Paul Peterson to develop and market the chip-throwing game, \"Clout Fantasy.\" After Clout was developed the company recruited Peter Adkison as CEO.\nQuestion: Hidden City Entertainment makes games. True, False, or Neither? True\n###\nWonders of the Universe is a 2011 book by the theoretical physicists Brian Cox and Andrew Cohen. The book is about cosmology and the universe, and is explained in a way that is accessible to a general reader. The book is based on a series with the same name \"Wonders of the Universe\".\nQuestion: The book is about space True, False, or Neither? True\n###\nDatong () is a prefecture-level city in northern Shanxi province, People's Republic of China, located in a basin at an elevation of 1040 m and bordering Inner Mongolia to the north and west and Hebei to the east. It had a population of 3,318,057 at the 2010 census of whom 1,629,035 lived in the built up area made of 3 out of 4 urban districts, namely Chengqu, Kuangqu and Nanjiao District.\nQuestion: The fourth district of Datong contains a population of 3,149,029 people at the time of the 2010 census. True, False, or Neither?", "doc_id": 929, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36485, 39117, 37152, 19001], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lucas Franchoys the Younger or Lucas Franchoys II (alternative spellings of name: Lucas Franchois, Lucas Fran\u00e7ois, Louis Franchoys) (28 June 1616 in Mechelen \u2013 3 April 1681 in Mechelen) was a Flemish Baroque painter from Mechelen, who painted numerous altarpieces and portraits in a style reminiscent of Anthony van Dyck.\nQuestion: Lucas Franchoys likes to watch late night movies True, False, or Neither? Neither\n###\n\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione.\nQuestion: The Flirts are still a trio. True, False, or Neither? Neither\n###\nSteve Koren is an Emmy Award winning writer/producer and screenwriter. Most notably he\u2019s written for \"Saturday Night Live\", \"Seinfeld\", and \"Veep\". He also wrote or co-wrote the movies \"Bruce Almighty\", \"Click\", \"A Night at the Roxbury\" and \"Superstar\".\nQuestion: Steve Koren has written at least 7 shows and movies. True, False, or Neither? True\n###\nThe Prime Minister's XI or PM's XI (formerly Australian Prime Minister's Invitation XI) is an invitational cricket team picked by the Prime Minister of Australia for an annual match held at the Manuka Oval in Canberra against an overseas touring team. The Australian team usually includes up and coming players.\nQuestion: Canberra is known for it's great stadiums. True, False, or Neither? Neither\n###\nJason Ian Drucker (born \u20092005 ) is an American child actor. He starred as Greg Heffley in the 2017 film \"\". He also played Tommy Miller, the youngest of the Miller Family, in Nickelodeon's \"Every Witch Way\". In 2018, he will co-star in the \"Transformers\" spin-off \"Bumblebee\".\nQuestion: Bumblebee is the only by- product from The Transformers. True, False, or Neither?", "doc_id": 806, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3085, 26093, 24718, 31706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States.\nQuestion: The Forum Shops features several coffee shops. True, False, or Neither? Neither\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. 
For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar).\nQuestion: There was one guy called Adam in the Beastie Boys True, False, or Neither? False\n###\nKnightriders, also known as George A. Romero's Knightriders, is a 1981 American drama film written and directed by George A. Romero and starring Ed Harris, Gary Lahti, Tom Savini, Amy Ingersoll, Patricia Tallman and Ken Foree. It was filmed entirely on location in the Pittsburgh metro area, with major scenes in suburban Fawn Township and Natrona.\nQuestion: There is a sequal planned for Knightriders True, False, or Neither? Neither\n###\nFat Mattress were an English folk rock band that formed in Folkestone in 1968. Founded by guitarist and vocalist Noel Redding, during his time as bassist for The Jimi Hendrix Experience, and vocalist Neil Landon, the band was completed by multi-instrumentalist Jim Leverton and drummer Eric Dillon. The band released two albums \u2013 \"Fat Mattress\" and \"Fat Mattress II\" \u2013 before splitting up in 1970.\nQuestion: The album \"Fat Mattress\" was very popular. True, False, or Neither? Neither\n###\nDavid Thomas Bush (born November 9, 1979) is an American former professional baseball pitcher. He played in Major League Baseball (MLB) for the Toronto Blue Jays, Milwaukee Brewers, Texas Rangers, and Philadelphia Phillies. Bush also played for the SK Wyverns of the KBO League.\nQuestion: David Thomas Bush played in the MLB before the KBO League. True, False, or Neither?", "doc_id": 635, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40888, 17355, 10039, 24635], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "This is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists.\nQuestion: notable editorial cartoonists created wikipedia True, False, or Neither? Neither\n###\nMargarita la tornera (Margarita the Gatekeeper) is an opera in three acts composed by Ruperto Chap\u00ed to a libretto by Carlos Fern\u00e1ndez Shaw, based on a dramatic poem by Jos\u00e9 Zorrilla. It premiered on February 24, 1909 at the Teatro Real in Madrid in a performance conducted by the composer. An acclaimed recording of the opera came out in 1999 with Pl\u00e1cido Domingo and Elisabete Matos.\nQuestion: Margarita la tornera premiered in Spain in the early 1900s True, False, or Neither? True\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. 
His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer.\nQuestion: Gotz Freiherr von Houwald died on 7/16/2001 True, False, or Neither? False\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: \"I'm So Sorry\" deals with a taboo societal topic. True, False, or Neither? Neither\n###\nNational Bingo Night is an American game show hosted by Ed Sanders which premiered on ABC on May 18, 2007, with a six-episode order. Sanders is known for his work on another ABC show, \"\". The show was cancelled by ABC and was repackaged as \"Bingo America\" on GSN, first hosted by Patrick Duffy, and in October 2008 by Richard Karn.\nQuestion: National Bingo Night was hosted by Patrick Duffy. True, False, or Neither?", "doc_id": 971, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2228, 31184, 5539, 28804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996).\nQuestion: Kimberly Beck died in 1997. True, False, or Neither? Neither\n###\nLaura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture.\nQuestion: Laura Warholic; or, The Sexual Intellectual is a 2007 novel by Eugene Eyestones. True, False, or Neither? False\n###\nPhacelia coerulea is a species of phacelia known by the common name skyblue phacelia. It is native to the California and the Southwestern United States and northern Mexico, where it grows in desert and plateau habitat types, such as scrub and woodland.\nQuestion: Skyblue phacelia has become endangered. True, False, or Neither? Neither\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012.\nQuestion: A nine year old could have theoretically competed in the Melodi Grand Prix Junior 2012. 
True, False, or Neither? True\n###\n\"Snakes on a Plane (Bring It)\", also referred to as \"Bring It (Snakes on a Plane)\", is the debut single by Cobra Starship, released in 2006 from the soundtrack album \"\". The song features William Beckett of The Academy Is..., Travie McCoy of Gym Class Heroes, and Maja Ivarsson of The Sounds.\nQuestion: It was released the year after 2002 True, False, or Neither?", "doc_id": 906, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22496, 27373, 33544, 22928], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld (Coburg, 23 September 1781 \u2013 Elfenau, near Bern, Switzerland, 15 August 1860), also known as Grand Duchess Anna Feodorovna of Russia (Russian: \u0410\u043d\u043d\u0430 \u0424\u0451\u0434\u043e\u0440\u043e\u0432\u043d\u0430 ), was a German princess of the ducal house of Saxe-Coburg-Saalfeld (after 1826, the house of Saxe-Coburg-Gotha) who became the wife of Grand Duke Konstantin Pavlovich of Russia.\nQuestion: Princess Juliane Henriette Ulrike is female. True, False, or Neither? True\n###\nWJMF-LP is a low-power television station in Jackson, Mississippi. The station, which currently operates on Channel 6, is owned by Rainey Radio. The station currently acts as a radio station broadcasting a Oldies & Classic Hits format as \"EZ 87.7\", taking advantage of that station's audio signal on 87.75 MHz FM.\nQuestion: The radio station only broadcasts at night. True, False, or Neither? Neither\n###\nMargaret Munnerlyn Mitchell (November 8, 1900 \u2013 August 16, 1949) was an American author and journalist. One novel by Mitchell was published during her lifetime, the American Civil War-era novel, \"Gone with the Wind\", for which she won the National Book Award for Most Distinguished Novel of 1936\nQuestion: Margaret Munnerlyn Mitchell was not born in 1949 True, False, or Neither? False\n###\nCloverdale Depot is a bus station and future intermodal station in Cloverdale, California. It is served by Amtrak Thruway and Sonoma County Transit buses. Additional service to Sonoma County Airport station is provided by Sonoma County Transit under contract by Sonoma\u2013Marin Area Rail Transit.\nQuestion: Cloverdale Depot has buses. True, False, or Neither? True\n###\nThe St. Louis Cardinals 1984 season was the team's 103rd season in St. Louis, Missouri and the 93rd season in the National League. The Cardinals went 84-78 during the season and finished 3rd in the National League East, 12\u00bd games behind their arch-rivals, the Chicago Cubs. It was also the final season of the Columbia blue road uniforms for the Cardinals.\nQuestion: after 1984 the cardinals changed uniforms True, False, or Neither?", "doc_id": 799, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38642, 14987, 2180, 44804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: Joseph Smith was not an orphan True, False, or Neither? True\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer.\nQuestion: Loui Jover was in the Australian army during his childhood True, False, or Neither? False\n###\nRear Admiral Kevin John Scarce {'1': \", '2': \", '3': \", '4': \"} (born 4 May 1952) is a retired Royal Australian Navy officer who was the 34th Governor of South Australia, serving from August 2007 to August 2014. He was succeeded by Hieu Van Le, who had previously been his lieutenant governor.\nQuestion: Kevin was well respected while he was in service. True, False, or Neither? Neither\n###\nMarcin Gortat (] ; born February 17, 1984) is a Polish professional basketball player for the Washington Wizards of the National Basketball Association (NBA). The 6\u00a0ft 11 in, 240-pound center is the son of boxer Janusz Gortat. He was a second-round draft choice of the Phoenix Suns in the 2005 NBA draft and has also played for the Orlando Magic.\nQuestion: Marcin Gortat was born less than 5556 days ago. True, False, or Neither? False\n###\nStormRider was a simulator ride at Tokyo DisneySea. It simulated going into a weather storm in a futuristic airplane (a \"StormRider\") to dissipate the storm. The attraction opened on September 4, 2001, in the Port Discovery land of Tokyo DisneySea. The attraction closed on May 17, 2016 and replaced by a new Finding Nemo/Finding Dory simulator ride called Nemo & Friends SeaRider.\nQuestion: Tokyo DisneySea opened in 1999. True, False, or Neither?", "doc_id": 894, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1175, 18214, 7468, 25986], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: Bosch was produced by Netflix. True, False, or Neither? False\n###\nNosopsyllus fasciatus, the northern rat flea, is a species of flea found on domestic rats and house mice. 
Northern rat fleas are external parasites, living by hematophagy off the blood of rodents. It is the most widely spread of its genus, having originated in Europe, but has been transported to temperate regions all over the world.\nQuestion: Northern rat fleas are not external parasites always but it's hard to find these species. True, False, or Neither? Neither\n###\nMartin John Christopher Freeman (born 8 September 1971) is an English actor, who became known for portraying Tim Canterbury in the original UK version of sitcom mockumentary \"The Office\", Dr. John Watson in the British crime drama \"Sherlock\", Bilbo Baggins in Peter Jackson's \"The Hobbit\" film trilogy, and Lester Nygaard in the dark comedy-crime drama TV series \"Fargo\".\nQuestion: Freeman was born to English royalty. True, False, or Neither? Neither\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008.\nQuestion: He recorded, and solely mixed \"Random Access Memories\" by Daft Punk in 2013 True, False, or Neither? False\n###\nTodd Wash (born July 19, 1968) is an American football coach who is the defensive coordinator for the Jacksonville Jaguars of the National Football League (NFL). From 2013 to 2015 he was the defensive line coach and run game coordinator for the Jacksonville Jaguars.\nQuestion: Todd Wash was in his 60's when he was the defensive line coach for the Jacksonville Jaguars. True, False, or Neither?", "doc_id": 945, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [526, 14958, 30979, 25537], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Homicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run.\nQuestion: Homicide: The Movie featured the same characters as the TV show. True, False, or Neither? Neither\n###\nUS Organization, or Organization Us, is a Black nationalist group in the United States founded in 1965. It was established as a community organization by Maulana Karenga. It was a complementary organization of the Black Panther Party in California. One of the early slogans was, \"Wherever US is, We are.\" US stands for us Black people vs 'them' the oppressors.\nQuestion: US Organization was founded by a black man. True, False, or Neither? Neither\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. 
The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams.\nQuestion: The act was very hated True, False, or Neither? Neither\n###\nVanessa Alessandra Teixeira Porto (born March 16, 1984) is a Brazilian mixed martial artist and amateur boxer who competes in the Invicta Fighting Championships flyweight division. She is currently the #2-ranked 125-pound female fighter in the world according to the Unified Women's MMA Rankings.\nQuestion: Vanessa Alessandra Teixeira Porto was born over 25 years ago. True, False, or Neither? True\n###\nEastland Mall is an enclosed shopping mall in Columbus, Ohio. Opened in 1968, it no longer has any open anchor stores. Its four vacant anchors were originally occupied by Lazarus, Kaufmann's (later Macy's), Sears, and JC Penney. The mall is managed by Woodmont Management.\nQuestion: The last anchor store to close at Eastland Mall was Macy's. True, False, or Neither?", "doc_id": 797, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23561, 41144, 16562, 18565], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California.\nQuestion: Michelle Do has millions of fans. True, False, or Neither? Neither\n###\nWilliam Lewis Moody Jr. (January 25, 1865 \u2013 July 21, 1954) was an American financier and entrepreneur from Galveston, Texas, who founded a private bank, an insurance company, and one of the largest charitable foundations in the United States. Moody was active in the day-to-day operations of his companies until two days before his death.\nQuestion: Moody lived in Texas. True, False, or Neither? True\n###\nStand-In is a 1937 American comedy film directed by Tay Garnett and starring Leslie Howard, Joan Blondell, and Humphrey Bogart. The picture was produced by the independent Walter Wanger, and released by United Artists. It is set in Hollywood and parodies many aspects of the film industry during the Classical Era.\nQuestion: Stand-In is a Classical Era American comedy film. True, False, or Neither? False\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s.\nQuestion: The New Ulm Oil Company Service Station is where people used to get gas from True, False, or Neither? True\n###\n\"Beez in the Trap\" is a song by rapper Nicki Minaj for her second studio album, \"\" (2012). It was written by Minaj, Maurice Jordan, and 2 Chainz, who contributed a guest verse to the song, while production was handled by Kenoe. 
The track was released as the album's third single on May 29, 2012 following \"Starships\" and \"Right by My Side\".\nQuestion: The song was released on the last day of May, 2012 True, False, or Neither?", "doc_id": 6, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39438, 7316, 32587, 7444], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tadpoles is the third album by the Bonzo Dog Band. It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\".\nQuestion: The Bonzo Dog Band has at least two other albums. True, False, or Neither? True\n###\nChristelyn Karazin is an American writer, columnist, and blogger on the subject of interracial dating, particularly black women dating outside their race. She hosts the blog \"Beyond Black & White\" and has written for \"Woman's Day\", \"Ebony\", \"Jet\", and Reuters. Karazin attended Loyola Marymount University, where she wrote for \"The Los Angeles Loyolan\".\nQuestion: Christelyn Karazin has made money from writing. True, False, or Neither? True\n###\nThe Leslie Motor Car company was a motor car company located in Detroit, Michigan in 1916. This automobile company was most likely named for the city of Leslie, Michigan. It was in operation for only one year and produced an unknown number of cars. Most cars of this era, were sold or given by their owners for scrap metal drives during World War II.\nQuestion: The Leslie Motor Car company was a motor car company that was made by a family. True, False, or Neither? Neither\n###\nMartin John Christopher Freeman (born 8 September 1971) is an English actor, who became known for portraying Tim Canterbury in the original UK version of sitcom mockumentary \"The Office\", Dr. John Watson in the British crime drama \"Sherlock\", Bilbo Baggins in Peter Jackson's \"The Hobbit\" film trilogy, and Lester Nygaard in the dark comedy-crime drama TV series \"Fargo\".\nQuestion: Freeman was the lead actor in Fargo. True, False, or Neither? Neither\n###\n\"A Leela of Her Own\" is the sixteenth episode in the third season of the animated series \"Futurama\". The episode is an homage to \"A League of Their Own\". It originally aired on the Fox network in the United States on April 7, 2002. Bob Uecker provided the voice of himself, Tom Kenny provided the voice of Abner Doubledeal, and Hank Aaron guest starred as himself and Hank Aaron XXIV.\nQuestion: A Leela of Her Own is the 16th episode of the 3rd season True, False, or Neither?", "doc_id": 546, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31546, 23089, 25487, 6760], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Albert Levitt (March 14, 1887 \u2013 June 18, 1968) was a judge, law professor, attorney, and candidate for political office. While he was a memorable teacher at Washington and Lee University, and as judge of the United States District Court for the Virgin Islands ordered that woman voters must be registered, he later came to hold what some thought were eccentric views on religion.\nQuestion: Albert Levitt was not alive on March 14, 1807 True, False, or Neither? True\n###\nBaoquan () is a town in Kedong County, western Heilongjiang province, Northeast China, located on a tributary of the Nonni River more than 190 km east-northeast of the city of Qiqihar. China National Highway 202 (G202) passes through the town, which is down the road from the city of Bei'an and the county seat, which lies some 13 km to the south.\nQuestion: Baoquan is heavily populated. True, False, or Neither? Neither\n###\nUNI Air () is an airline based in Zhongshan, Taipei, Taiwan. It is a domestic and regional subsidiary of EVA Air. It was known as Makung Airlines (\u99ac\u516c\u822a\u7a7a) until 1996, when EVA Air took a majority share of the airline. In 1998, the airline merged with Great China Airlines (\u5927\u83ef\u822a\u7a7a) and Taiwan Airways (\u81fa\u7063\u822a\u7a7a), which EVA Air also had interests in, to form UNI Airways (UNI Air).\nQuestion: UNI Air has a terrible CEO True, False, or Neither? Neither\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners.\nQuestion: Kuch Kuch Hota Hai dominated the 44th filmfare awards. True, False, or Neither? True\n###\nAlong the Shadow is the third studio album by American rock band Saosin, released on May 20, 2016 through Epitaph Records. The album marks the end of a three-and-a-half-year hiatus for the group with the return of original lead vocalist Anthony Green. It also marks the subsequent departure of lead guitarist Justin Shekoski.\nQuestion: Anthony Green is a licensed funeral director. True, False, or Neither?", "doc_id": 981, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35588, 21553, 2132, 41768], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elizabeth Berridge (born May 2, 1962) is an American film and theatre actress. She is known for playing Constanze Mozart in the Academy Award-winning 1984 film \"Amadeus\", for the role of Officer Eve Eggers on \"The John Larroquette Show\" (1993-1996), and for her performances in the theater.\nQuestion: Elizabeth Berridge is not her real name True, False, or Neither? 
Neither\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex.\nQuestion: Croton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is a dull looking red plant. True, False, or Neither? Neither\n###\nWake Up, Ron Burgundy: The Lost Movie (also known as Anchorman: The Adventure Continues) is the 2004 counterpart film to the film \"\", which was also released in the same year. Directed by Adam McKay and written by McKay and Will Ferrell, it stars Ferrell, Christina Applegate, David Koechner, Steve Carell, and Paul Rudd.\nQuestion: Adam McKay chose Christina Applegate because she was blond True, False, or Neither? Neither\n###\n54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute.\nQuestion: 54-40 will win a Grammy in 2019 True, False, or Neither? Neither\n###\nRonald Reagan is a bronze sculpture depicting the American politician of the same name by Chas Fagan, installed at the United States Capitol's rotunda, in Washington, D.C., as part of the National Statuary Hall Collection. The statue was donated by the U.S. state of California in 2009, and replaced one depicting Thomas Starr King, which the state had gifted in 1931.\nQuestion: The new sculpture replaces one that stood for eighty years. True, False, or Neither?", "doc_id": 800, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4168, 9758, 24540, 6919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Leader of the Opposition of Singapore is usually the leader of the second largest political party represented in the Parliament of Singapore. During the 1955 Legislative Assembly election, the late Lee Kuan Yew was the \"de facto\" Leader of the Opposition, as the People's Action Party was then the second largest political party represented in the Legislative Assembly.\nQuestion: Lee Kuan Yew was a leader in a Opposition party in Singapore. True, False, or Neither? True\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress.\nQuestion: Tsewang Rigzin was president of the Tibetan Youth Congress in 2006 True, False, or Neither? False\n###\nJ\u00fcrgen Melzer (born 22 May 1981 in Vienna) is an Austrian tennis player. He reached a career-high singles ranking of world No. 
8 in April 2011, and a doubles ranking of world No. 6 in September 2010. He is a left-handed tennis player, but is right-handed in everyday life. He has a younger brother, Gerald Melzer, with whom he has played doubles in several tournaments.\nQuestion: J\u00fcrgen Melzer decided during his lifetime that he wanted to live in the United States. True, False, or Neither? Neither\n###\nPeter L. N. Padfield (born 1932) is a British author, biographer, historian, and journalist who specializes in naval history and in the Second World War period. His early journalism appeared under the name P. L. N. Padfield. As well as his non-fiction work, he has also published four novels.\nQuestion: Padfield was born after 1932. True, False, or Neither? False\n###\nEnrique Leff (born Mexico, 1946) is a Mexican economist, who defines himself today as an environmental sociologist and environmentalist. He has written 25 books and 180 articles on political ecology, environmental sociology, environmental economics, environmental epistemology and environmental education. He is regarded as one of the key environmental thinkers in Latin America.\nQuestion: Enrique Leff has published less than 200 articles. True, False, or Neither?", "doc_id": 418, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1139, 1361, 2359, 41073], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Prema Thapassu is a 1991 Telugu romance drama film, produced by Sri Sai Madhavi Productions and directed by Dr. N. Siva Prasad. The film stars Rajendra Prasad and Roja in the lead roles and music also composed by Rajendra Prasad . The film is first debut to actress Roja into film industry. The film was a \"flop\" at the box office.\nQuestion: Prema Thapassu had a sequel. True, False, or Neither? Neither\n###\nHakea preissii, commonly known as the Needle tree, Needle bush and Christmas hakea, is a shrub or tree of the genus \"Hakea\" native to an area in the Pilbara, Wheatbelt, Mid West and Goldfields-Esperance regions of Western Australia. The Noongar name for the plant is Tanjinn.\nQuestion: Christmas hakea would make a great alternative Christmas tree at holiday time! True, False, or Neither? Neither\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg.\nQuestion: Marie Hedwig Auguste von Sulzbach was a Countess Palatine of Sulzbach by birth and by marriage who died of influenza in 1681. True, False, or Neither? Neither\n###\nIn ancient Roman religion, Antevorta was a goddess of the future, also known as Porrima. She and her sister Postverta (or Postvorta) were described as companions or siblings of the goddess Carmenta, sometimes referred to as \"the Carmentae\". They may have originally been two aspects of Carmenta, namely those of her knowledge of the future and the past (compare the two-faced Janus).\nQuestion: Antevorta was a goddess of fate. 
True, False, or Neither? False\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar).\nQuestion: Members of the Beastie Boys generally stayed together most of their careers True, False, or Neither?", "doc_id": 694, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40289, 9618, 33352, 12705], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Seton Abele (born January 28, 1967) is an American businessman and Democratic Party politician. He is the current Milwaukee County Executive. Abele is the son of American businessman John Abele, the co-founder of Boston Scientific. Abele serves as a trustee of the Argosy Foundation, a charitable trust established with an endowment from his father.\nQuestion: Christopher Abele was in business for his father to earn the money to start Argosy Foundation. True, False, or Neither? Neither\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968.\nQuestion: The New York Mets placed fifth in the National League West in 1974. True, False, or Neither? False\n###\nThe 1941 Cabo San Lucas hurricane is considered one of the worst tropical cyclones on record to affect Cabo San Lucas. The hurricane was first reported on September\u00a08 off the coast of Mexico. It slowly moved northwestward while intensifying. After peaking in intensity, it entered the Gulf of California, and weakened rapidly. It dissipated on September\u00a013.\nQuestion: The 1941 Cabo San Lucas hurricane was downgraded to a topical cyclone on September 12 True, False, or Neither? Neither\n###\nAndrea M\u00f3nica Montenegro DeFreitas, known as Andrea Montenegro (born 4 March 1969 in Lima, Peru), is a Peruvian actress and model well known for her participation in various telenovelas such as Zorro, la Espada y la Rosa, Latin Lover (2001), La viuda de la Mafia (2004) and currently in Telemundo's El Clon. She has a daughter Muriel and a son Amaru.\nQuestion: Andrea has four children. True, False, or Neither? False\n###\nThe American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada.\nQuestion: The La Serie ACT was formerly known as Serie ACT Castrol Tour. 
True, False, or Neither?", "doc_id": 211, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19330, 31750, 8035, 33229], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Spy for a Day is a 1940 British comedy thriller film directed by Mario Zampi and starring Douglas Wakefield, Paddy Browne and Jack Allen. During the First World War a British farmer is abducted by the Germans to take the place of a spy about to be executed whom he closely resembles.\nQuestion: Spy for a Day is a British comedy film about a British farmer who is abducted by Germans due to his resemblance to a a spy they are about to execute and need a replacement for, and they execute him instead. True, False, or Neither? Neither\n###\nEthan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films.\nQuestion: Suplee prefers acting on tv rather than movies. True, False, or Neither? Neither\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Stillwater Cove Regional Park is in CA True, False, or Neither? True\n###\nMarcin Gortat (] ; born February 17, 1984) is a Polish professional basketball player for the Washington Wizards of the National Basketball Association (NBA). The 6\u00a0ft 11 in, 240-pound center is the son of boxer Janusz Gortat. He was a second-round draft choice of the Phoenix Suns in the 2005 NBA draft and has also played for the Orlando Magic.\nQuestion: Polish people make excellent athletes. True, False, or Neither? Neither\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008.\nQuestion: Huizar was born the year after 1983. True, False, or Neither?", "doc_id": 170, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19058, 15666, 30832, 29020], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zambian Breweries Plc is a Zambian brewing and beverage company listed on the Lusaka Stock Exchange. 
Its brews mainly pale lagers and a clear sorghum lager. It is also a major bottler of Coca-Cola. It has two breweries and three bottling plants. As of 2017 international brewing giant SABMiller owned 87% of Zambrew. Market capitalization was ZMW3,385,200,000 or about USD 372,000,000.\nQuestion: As of 2013 international brewing giant SABMiller owned 87% of Zambrew. Market capitalization was ZMW3,385,200,000 or about USD 372,000,000.\n True, False, or Neither? False\n###\nGlobacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide.\nQuestion: GLO will create a secondary headquarters in East Africa, to offer services there. True, False, or Neither? Neither\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films.\nQuestion: Ileana Carusio was not a popular pornstar True, False, or Neither? Neither\n###\nRBG Resources was a British public-limited firm based in London that was allegedly involved in a serious fraud worth close to \u00a3300 million (US$600 million). RBG Resources made $1.1 billion in sales in 2000. It was an affiliate of the United States based Allied Deals Inc., which was also involved in the fraud, and resulted in 14 people convicted or pleading guilty to related crimes.\nQuestion: RBG Resources has never been in trouble with the law. True, False, or Neither? False\n###\nNelson is an American rock band founded by singer/songwriters Matthew and Gunnar Nelson (twin sons of Ricky Nelson and Kristin Nelson). The band achieved success during the early 1990s with their double platinum debut album \"After the Rain\", which featured the number-one hit \"(Can't Live Without Your) Love and Affection\".\nQuestion: Nelson has an E. True, False, or Neither?", "doc_id": 249, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41019, 9308, 32144, 39271], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "What Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band.\nQuestion: The band formed in 1976. True, False, or Neither? Neither\n###\nOrange, Red, Yellow is a 1961 Color Field painting by Mark Rothko. It sold at Christie's for $86.882.500 on May 8, 2012. 
The seller was the estate of David Pincus and the sale price represents a record nominal price for Post-War / contemporary art at public auction and for Rothko works in general.\nQuestion: Orange, Red, Yellow sold for more than $86,882,499. True, False, or Neither? True\n###\nJunoon (Hindi: \u091c\u0941\u0928\u0942\u0928, translation: \"The Obsession\") is a 1978 Indian Hindi language film produced by Shashi Kapoor and directed by Shyam Benegal. The film is based on Ruskin Bond's fictional novella, \"A Flight of Pigeons\", set around the Indian Rebellion of 1857. The film's soundtrac was composed by Vanraj Bhatia, and cinematography by Govind Nihalani.\nQuestion: the soundtrack doesn't feature any lyrics True, False, or Neither? Neither\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996.\nQuestion: I am a tall man. True, False, or Neither? Neither\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: End of the Past was published in the year two thousand and sixteen. True, False, or Neither?", "doc_id": 575, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32136, 34446, 18257, 7308], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Studies in Mutualist Political Economy is a book on political economy published on 2007 by American mutualist anarchist Kevin Carson. In its preface Carson describes this work as \"an attempt to revive individualist anarchist political economy, to incorporate the useful developments of the last hundred years, and to make it relevant to the problems of the twenty-first century.\"\nQuestion: Carson also relates this to the successes of the 21st century. True, False, or Neither? Neither\n###\nEthan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films.\nQuestion: Ethan Suplee played Randy in My Name is Earl. True, False, or Neither? True\n###\nThe Carrier Sekani Tribal Council (familiarly known as CSTC) is a tribal council representing eight First Nations in the Central Interior of British Columbia. It was originally known as the \"Lakes District Tribal Council\". The CSTC was incorporated in 1979 and is a registered non-profit society.\nQuestion: The Carrier Sekani Tribal Council is also known as the CSTC True, False, or Neither? 
True\n###\nThe iHeartRadio Much Music Video Awards (also known as the MMVAs, and originally known as the Canadian Music Video Awards until 1995, and formerly and commonly known as the MuchMusic Video Awards) are annual awards presented by the Canadian television channel Much to honour the year's best music videos.\nQuestion: The iHeartRadio Much Music Video Awards once went by a different name. True, False, or Neither? True\n###\nOur Lady of Confidence, also known as La Madonna della Fiducia or Our Lady of Trust, is a venerated image depicting the Blessed Virgin Mary enshrined at the Lateran Basilica. The feast of Our Lady of Confidence falls on the last Saturday prior to Lent.\nQuestion: Our Lady of Confidence occurs on more than one day each year True, False, or Neither?", "doc_id": 628, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2137, 6464, 7744, 24950], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Natasha Choufani is a Lebanese actress. Born and raised in the UAE, she grew up in a multi-cultural society. Her ability to act in different dialects and languages had helped open many doors to playing diverse characters in theater, film and TV at home and abroad.\nQuestion: Natasha Choufani was not an actress. True, False, or Neither? False\n###\nThe Ghost and Mrs. Muir (1947) is a romantic-fantasy film starring Gene Tierney and Rex Harrison. It was directed by Joseph L. Mankiewicz, and is based on a 1945 novel written by Josephine Leslie under the pseudonym of R. A. Dick. In 1945, 20th Century Fox bought the film rights to the novel, which had been published only in the United Kingdom at that time. It was shot entirely in California.\nQuestion: The Ghost and Mrs. Muir film closely follows the plot of a book. True, False, or Neither? Neither\n###\nPeter Andreas Thiel ( ; born October 11, 1967) is an American entrepreneur, venture capitalist, philanthropist, political activist, and author. He was ranked No. 4 on the \"Forbes\" Midas List of 2014, with a net worth of $2.2 billion, and No. 246 on the \"Forbes\" 400 in 2016, with a net worth of $2.7 billion.\nQuestion: Thiel donates money to animal shelters. True, False, or Neither? Neither\n###\nCarol Hernandez is an American journalist from Miami Florida. She won a 1996 Goldsmith Prize for Investigative Reporting. She won the 1996 Pulitzer Prize for National Reporting. She currently resides in Long Island with her husband, and three children, (the oldest being the best and most funny and creative).\nQuestion: Carol Hernandez has the same birthday as her husband. True, False, or Neither? Neither\n###\nJoona Veteli (born 21 April 1995) is a Finnish football player currently playing for Norwegian OBOS-ligaen side Fredrikstad. 
Veteli plays in the position of centre midfielder but can also operate as an attacking midfielder, defensive midfielder, right-back and winger.\nQuestion: Joona Veteli is paid to play sports True, False, or Neither?", "doc_id": 747, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18736, 20289, 44392, 24090], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "End of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: Nadeem F. Paracha speaks English. True, False, or Neither? Neither\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: The Beatles were lazy and didn't want to release a new album with original material. True, False, or Neither? Neither\n###\nForest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier.\nQuestion: Timoon Animation was the sole company created by Philippe Mounier True, False, or Neither? Neither\n###\nThe Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue.\nQuestion: The Drake Hotel has at least two places where you can get food. True, False, or Neither? True\n###\nThe Louvin Brothers were an American musical duo composed of brothers Ira Lonnie Loudermilk (1924\u20131965) and Charlie Elzer Loudermilk (1927\u20132011), better known as Ira and Charlie Louvin. The brothers are cousins to John D. Loudermilk, a Nashville Songwriters Hall of Fame member.\nQuestion: Ira and Charlie Louvin are cousins. True, False, or Neither?", "doc_id": 909, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20453, 9872, 28298, 28335], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Going My Way is an American comedy-drama series starring dancer and actor Gene Kelly. Based on the 1944 film of the same name starring Bing Crosby, the series aired on ABC with new episodes from October 3, 1962 to April 24, 1963. The program was Kelly's first and only attempt at a weekly television series. The series was canceled after one season of thirty episodes.\nQuestion: Going my Way released new Episodes after the release of the original film. True, False, or Neither? True\n###\nUnited Spirits Limited, abbreviated to USL, is an Indian alcoholic beverages company, and the world's second-largest spirits company by volume. It is a subsidiary of Diageo, and headquartered at UB Tower in Bangalore, Karnataka. USL exports its products to over 37 countries.\nQuestion: The spirits that ESL makes tastes awful. True, False, or Neither? Neither\n###\nSongbook is an acoustic live album by American musician and Soundgarden vocalist Chris Cornell, released on November 21, 2011. The live album features songs recorded during Cornell's Songbook Tour, an acoustic solo tour which took place during March\u2013May 2011 in the US, and is his first live album as a solo artist.\nQuestion: Chris Cornell released his live album with his band. True, False, or Neither? False\n###\nThe 2007 Hertsmere Borough Council election took place on 3 May 2007 to elect members of Hertsmere Borough Council in Hertfordshire, England. One third of the council was up for election and the Conservative party stayed in overall control of the council.\nQuestion: The Conservative party had a large party to celebrate. True, False, or Neither? Neither\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee.\nQuestion: Despite being a French-Egyptian film, Destiny was filmed in neither France or Egypt. True, False, or Neither?", "doc_id": 85, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41992, 25643, 15190, 34021], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grant Taylor (Born October 30,1991) is an American professional skateboarder. He is the son of former professional skateboarder Thomas Taylor and won Thrasher Magazine's \"Skater of The Year\" in 2011. Grant\u2019s style of skateboarding is known to be fast and powerful. He is recognized for his unique versatile skateboarding.\nQuestion: Grant Taylor won Thrasher Magazine's \"Skater of the Year\" in 2011 because his style is fast and powerful. True, False, or Neither? 
True\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions.\nQuestion: Project Gasbuggy did not go as planned. True, False, or Neither? Neither\n###\nEdward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis.\nQuestion: E. Annis aka T. Hart wrestled for AAA. True, False, or Neither? True\n###\nDavid Gregory \"Dave\" Smith (born 24 July 1955) is a retired male race walker from Australia, who represented his native country at two consecutive Summer Olympics, starting in 1980 (Moscow). His best Olympic result was finishing in tenth place in the men's 20\u00a0km race at the 1984 Summer Olympics.\nQuestion: Dave Smith is a retired female race walker True, False, or Neither? False\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898.\nQuestion: Grimsby Town Football Club was a bottom tier football team True, False, or Neither?", "doc_id": 301, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20992, 44343, 23399, 11723], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide.\nQuestion: the film did not make more than 5 million dollars True, False, or Neither? False\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene.\nQuestion: \"Something from Nothing\" was inspired by pop music True, False, or Neither? Neither\n###\nMount Willey is a mountain located in Grafton County, New Hampshire. The mountain is named after Samuel Willey, Jr. (1766\u20131826) and his family, who in 1825 moved into a house in Crawford Notch. The family was killed a year later in August 1826 during a landslide.\nQuestion: Samuel Willey, Jr. was born less than 10000 days ago. 
True, False, or Neither? False\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley.\nQuestion: Colorz of Rage was the first 1999 film to have a female lead. True, False, or Neither? Neither\n###\nDerek Ervin Smith (November 1, 1961 \u2013 August 9, 1996) was an American professional basketball player. He won a national championship with the Louisville Cardinals in 1980, and spent nine years in the NBA in a career shortened by a knee injury. He would later become an assistant coach for the Washington Bullets from 1994 until his death.\nQuestion: Derek Ervin Smith officially retired before his death. True, False, or Neither?", "doc_id": 388, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14655, 23974, 36937, 34982], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Superman's Dead\" is a song by Canadian alternative rock group Our Lady Peace. It was released in December 1996 as the lead single from their second album \"Clumsy\". This has become one of Our Lady Peace's most popular songs in both Canada and the U.S., as well as many other parts of the world.\nQuestion: Superman's Dead was the third single. True, False, or Neither? False\n###\nGunfighters of Casa Grande (Spanish: \"Los Pistoleros de Casa Grande\" ) is a 1964 Eurowestern film, co-produced by American and Spanish producers. Based on a story by Borden and Patricia Chase, it was later developed into a screenplay with the assistance of screenwriter Clark Reynolds and directed by Roy Rowland, the last film he made for Metro-Goldwyn-Mayer.\nQuestion: Gunfighters of Casa Grande took place in Western Europe. True, False, or Neither? Neither\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label.\nQuestion: Bad Company has many hit songs. True, False, or Neither? Neither\n###\nG.I. Joe: Ninja Battles is a film that was released on DVD in late 2004, as part of the Ninja Battles set of figures. In it, the history of the Arashikage Clan, as well as the history of Snake Eyes and Storm Shadow's rivalry, are examined through a series of trials. Scenes from both \"\" and \"\" are used, with a brief period of new animation at the end of the movie.\nQuestion: G.I. Joe: Ninja Battles was a failure True, False, or Neither? Neither\n###\nMultnomah University (MU) is a non-denominational Christian university in Portland, Oregon, United States. Multnomah consists of a college, graduate school, seminary and Degree Completion Program, and the university offers bachelor's, master's and doctorate degrees in a wide range of fields.\nQuestion: Multnomah University is located on the West coast of the U.S. 
True, False, or Neither?", "doc_id": 523, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36850, 7821, 33970, 40526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University.\nQuestion: Shannon Kelley wants to become the head coach. True, False, or Neither? Neither\n###\nCorey Gibson, known professionally as Corey Chorus, is an American songwriter, record producer, vocal producer, sound engineer and publisher, known for having written songs such as Cheers (Drink to That) of Rihanna, Chica Bomb by Dan Balan, Made in the USA by Demi Lovato.\nQuestion: Corey Gibson is a famous singer. True, False, or Neither? False\n###\nJohns Creek is a city located in Fulton County in the U.S. state of Georgia. According to the 2010 U.S. Census, the population was 76,728. The city is an affluent northeastern suburb of Atlanta. In 2017 Johns Creek ranked third on the \"USA TODAY\" list of \"50 best cities to live in.\"\nQuestion: The population was more than 80,000 True, False, or Neither? False\n###\nNogiBingo! ( stylized as NOGIBINGO!) is a Japanese television variety show starring Japanese idol girl group Nogizaka46. Ijily Okada, who is known for many AKB48 related show such as \"AKB48 Nem\u014dsu TV\", hosted the program. The show firstly aired on July 3, 2013, as part of the variety show \"Nogizaka46 x HKT48 Kanbangumi Battle!\", and it became an independent show from the second season.\nQuestion: Ijily Okada knows of many Japanese variety shows. True, False, or Neither? True\n###\nFather Xmas is a 2001 short film from director Marie Rose and the American Film Institute's Directing Workshop for Women starring Dakota Fanning as six-year-old Clairee who learns from her older brother (Stephen Fanning) that Santa Claus is not real and that their father is fighting in the Vietnam War.\nQuestion: Father Xmas is a 2001 short film from director Marie Rose and the American Movie Institute's Directing Workshop for Women True, False, or Neither?", "doc_id": 620, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18308, 23165, 38200, 37216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Franklin Martin Loew, DVM, PhD, (1939 in Syracuse, NY \u2013 2003 in Boston, MA) was president of Becker College, dean of the College of Veterinary Medicine at Cornell University and dean of Tufts University School of Veterinary Medicine (now Tufts Cummings School of Veterinary Medicine).\nQuestion: Tufts University School of Veterinary medicine had its name changed to Tufts Cummings School of Veterinary Medicine. True, False, or Neither? True\n###\nIlse von Glatz (August 21, 1958 \u2013 May 2, 2014) was a Canadian actress who played an Advocate in the 1988 science fiction TV series \"War of the Worlds\". She also worked in \"The Mind of Simon Foster\" (episode of \"the 1985 version of The Twilight Zone\"). She also appeared in at least one episode of \"\" in 1989.\nQuestion: The show was science fiction True, False, or Neither? True\n###\nForest Hill Vineyard (also referred to as Forest Hill Wines) is an Australian winery business based in the Great Southern wine region of Western Australia. Its vineyard is west of Mount Barker, and its winery and cellar door are further south, at Denmark.\nQuestion: Forest Hill Vineyard (also referred to as Forest Hill Wines) is a successful Australian winery business based in the Great Southern wine region of Western Australia. True, False, or Neither? Neither\n###\nDostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\".\nQuestion: stluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1971 as a en's football club in Istanbul, Turkey. True, False, or Neither? False\n###\nJacques Tourneur (] ; November 12, 1904 \u2013 December 19, 1977) was a French film director known for the classic film noir \"Out of the Past\" and a series of low-budget horror films he made for RKO Studios, including \"Cat People\", \"I Walked with a Zombie\" and \"The Leopard Man\". He is also known for directing \"Night of the Demon\", that was released by Columbia Pictures.\nQuestion: Jacques Tourneur began his career as a film director in 1904. True, False, or Neither?", "doc_id": 32, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21426, 20028, 28096, 22967], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Finniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. 
It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla.\nQuestion: Kangaroo Island, the Fleurieu Peninsula, Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla are all within an area that is less than 8750 km. True, False, or Neither? True\n###\nDan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler.\nQuestion: Dan Deacon is a small man True, False, or Neither? Neither\n###\nHoodlum is a 1997 American crime drama film that gives a fictionalized account of the gang war between the Italian/Jewish mafia alliance and the Black gangsters of Harlem that took place in the late 1920s and early 1930s. The film concentrated on Ellsworth \"Bumpy\" Johnson (Laurence Fishburne), Dutch Schultz (Tim Roth), and Lucky Luciano (Andy Garc\u00eda).\nQuestion: Laurence Fishburne and Andy Garc\u00eda are 2 of the stars of Hoodlum True, False, or Neither? True\n###\nMisty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue.\nQuestion: Misty Knight has been read by Trump. True, False, or Neither? Neither\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex.\nQuestion: Muccan Station is on the radio True, False, or Neither?", "doc_id": 493, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6711, 16084, 24903, 32032], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alexander Grinberg (\u0410\u043b\u0435\u043a\u0441\u0430\u043d\u0434\u0440 \u0414\u0430\u043d\u0438\u043b\u043e\u0432\u0438\u0447 \u0413\u0440\u0438\u043d\u0431\u0435\u0440\u0433, Aleksandr Danilovich Grinberg) (1885\u20131979) was a Russian and Soviet photographer. n 1908 he was awarded the silver medal in the all-Russian photo exhibition in Moscow and the gold medal in the international photo-exhibition in Dresden.\nQuestion: Alexander Grinberg was a homosexual. True, False, or Neither? Neither\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. 
Gene Mayer won the singles title.\nQuestion: The 1982 Bavarian Tennis Championships was held in Cologne, Germany True, False, or Neither? False\n###\nPassion Play is a 2010 American drama film written and directed by Mitch Glazer, executive produced by Rebecca Wang and starring Mickey Rourke, Megan Fox, Rhys Ifans and Bill Murray. Filming for the production began in December 2009 and is presented by Rebecca Wang Entertainment. It premiered at the 2010 Toronto International Film Festival.\nQuestion: Passion Play had Mickey Rourke as the lead role True, False, or Neither? Neither\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: Johan Martin Schr\u00f6der has a large extended family. True, False, or Neither? Neither\n###\nAtiha Sen Gupta (born 1988) is a British playwright and screenwriter. She is writer-in-residence for 2016-2017 at Theatre Royal Stratford East in London, where her play \"Counting Stars\" was produced in 2016. In the same year she won the International Achievement Recognition Awards (IARA) Award for Best Playwright.\nQuestion: Atiha won IARA Award for Counting Stars. True, False, or Neither?", "doc_id": 303, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25184, 21202, 35474, 11996], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts.\nQuestion: David Lee Murphy likes driving True, False, or Neither? Neither\n###\nBullitt East High School is a high school located at 11450 Highway 44 East in the city of Mount Washington, Kentucky. It is part of the Bullitt County Public Schools district. Sports teams include: Archery, Swimming, Football, Soccer, Tennis, Track and Field, Baseball, Softball, Wrestling, Basketball, Volleyball and Cheerleading.\nQuestion: Bullitt High School has more female students than male students. True, False, or Neither? Neither\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km .\nQuestion: Jiaozhou Bay Bridge has a police station True, False, or Neither? Neither\n###\n\"Beyond This Earthly Realm\" is the eleventh episode of the fourth season of the American animated television series \"Adventure Time\". 
The episode was written and storyboarded by Ako Castuera and Jesse Moynihan, from a story by Patrick McHale, Kent Osborne, and Pendleton Ward. It originally aired on Cartoon Network on June 11, 2012.\nQuestion: Beyond This Earthly Realm is a cartoon episode. True, False, or Neither? True\n###\nGood is a 2008 drama film based on the stage play of the same name by C. P. Taylor. It stars Viggo Mortensen, Jason Isaacs, and Jodie Whittaker, and was directed by Vicente Amorim. The film premiered at the Toronto International Film Festival on 8 September 2008.\nQuestion: Good premiered more than 10 years ago True, False, or Neither?", "doc_id": 234, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21082, 9365, 44651, 2580], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\".\nQuestion: Dungeon Master's Guide has at least 4 editions. True, False, or Neither? True\n###\nSimon Corbell (born 21 November 1970) is a former Australian politician and Deputy Chief Minister of the Australian Capital Territory. He was also Attorney-General, Minister for Health, Minister for the Environment and Minister for the Capital Metro.\nQuestion: Simon Corbell was born less than 5000 hours ago. True, False, or Neither? False\n###\nNeil Sedaka: Italiano is a 1964 compilation album containing twelve of Neil Sedaka's Italian-language recordings. It was released in Italy by RCA Victor's Italiana studios. Of the twelve songs on the album, six were recorded by Sedaka in English. A seventh song on the album, \"A 16 Anni Tu Vuoi Amare\", is an Italian-language version of Andrea Carroll's 1963 hit, \"It Hurts To Be Sixteen\".\nQuestion: The compilation album Neil Sedaka: Italiano was released in nineteen hundred sixty five. True, False, or Neither? False\n###\nBrash Young Turks is a 2016 coming-of-age British crime film directed by Naeem Mahmood and co-directed by his brother Ash Mahmood that tells a fast paced struggle love, crime and power, against all odds. The film stars Melissa Latouche, Paul Chiedozie, Tom Bott, Richard Shelton and Julian Glover among a large ensemble cast.\nQuestion: Brash Young Turks is not a Russian film. True, False, or Neither? True\n###\nThe Program in Creative Writing, more commonly known as the Iowa Writers' Workshop, at the University of Iowa in Iowa City, Iowa, is a much-celebrated graduate-level creative writing program in the United States. Writer Lan Samantha Chang is its director. Graduates earn a Master of Fine Arts (MFA) degree in Creative Writing.\nQuestion: Students in this program must have a bachelor's degree to participate. 
True, False, or Neither?", "doc_id": 82, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18896, 15747, 45297, 22319], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House.\nQuestion: The film Punjabi was the winner of film awards. True, False, or Neither? Neither\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel.\nQuestion: Kasey Peters threw for over 10,000 yards in his career. True, False, or Neither? Neither\n###\nDame Nicola Mary Brewer DCMG is a British diplomat and university administrator. In May 2014 she was appointed Vice-Provost (International) at University College London. She is a non-executive director of Aggreko. Brewer was British High Commissioner to South Africa from 2009 to 2013.\nQuestion: Dame Nicola Mary Brewer is a British diplomat and university administrator who was appointed Vice-Provost at the University College London while at the same time also working as the High commissioner to South Africa. True, False, or Neither? False\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The man had serious mental problems. True, False, or Neither? Neither\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: University of Bologna is a former Roman Catholic Church True, False, or Neither?", "doc_id": 974, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40306, 39333, 43108, 21780], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The History Boys is a 2006 British comedy-drama film adapted by Alan Bennett from his play of the same name, which won the 2005 Olivier Award for Best New Play and the 2006 Tony Award for Best Play. It was directed by Nicholas Hytner, who directed the original production at the Royal National Theatre in London, and features the original cast of the play.\nQuestion: The History Boys was filmed in 2005 True, False, or Neither? Neither\n###\nThe East\u2013West Shrine Game is an annual postseason college football all-star game played each January since 1925. The game is sponsored by the fraternal group Shriners International, and the net proceeds are earmarked to some of the Shrine's charitable works, most notably the Shriners Hospitals for Children. The game's slogan is \"Strong Legs Run That Weak Legs May Walk\".\nQuestion: East-West Shrine Game is played every year. True, False, or Neither? True\n###\nSusan Lynch (born 5 June 1971) is a Northern Irish actress. A three-time IFTA Award winner, she also won the British Independent Film Award for Best Supporting Actress for the 2003 film, \"16 Years of Alcohol\". Her other film appearances include \"Waking Ned\" (1998), \"Nora\" (2000), \"Beautiful Creatures\" (2000), and \"From Hell\" (2001).\nQuestion: Susan Lynch has an A. True, False, or Neither? True\n###\nThe Timber Mountain Log Ride is a log flume water ride at Knott's Berry Farm in Buena Park, California, United States. The ride is one of the oldest log flumes in the United States and is the most popular ride at Knott's Berry Farm. The ride is one of the few log flumes that is themed in the world.\nQuestion: The Timber Mountain Log Ride has seen millions of rides. True, False, or Neither? Neither\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp.\nQuestion: The town had a census in the 21st century. True, False, or Neither?", "doc_id": 62, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31843, 39511, 35202, 14716], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Golden Fetter is a 1917 American romance silent film directed by Edward LeSaint and written by Charles Tenney Jackson and Charles Maigne. The film stars Wallace Reid, Anita King, Tully Marshall, Guy Oliver, Walter Long and Mrs. Lewis McCord. The film was released on January 25, 1917, by Paramount Pictures.\nQuestion: The Golden Fetter is a romance film. True, False, or Neither? 
True\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class.\nQuestion: Youth like football. True, False, or Neither? Neither\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter.\nQuestion: Colin Francis Weeber Isaacs was a journalist True, False, or Neither? True\n###\nSpeedway Field was the original name for the airfield that was to evolve into Minneapolis-St. Paul International Airport, the twelfth busiest airport in the United States; it was also the largest hub for Northwest Airlines and the third largest hub for Delta Air Lines, Northwest's successor.\nQuestion: Minneapolis-St. Paul International Airport is larger than Speedway Field. True, False, or Neither? True\n###\nThe 1980 British Grand Prix (formally the XXXIII Marlboro British Grand Prix) was a Formula One motor race held at Brands Hatch on 13 July 1980. It was the eighth round of the 1980 Formula One season. The race was held over 76 laps of the 4.207-km (2.614-mile) circuit for a total race distance of 319.73 km (198.67 miles).\nQuestion: The 1979 British Grand Prix took place on 13 July 1979 True, False, or Neither?", "doc_id": 118, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20570, 5503, 20913, 23502], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Should the World Fail to Fall Apart is the first album by the British solo artist Peter Murphy, formerly of the gothic rock band Bauhaus. The album contains Murphy's covers of Magazine's \"The Light Pours Out of Me\" and Pere Ubu's \"Final Solution.\" It was released in 1986.\nQuestion: Bauhaus is a gothic rock band. True, False, or Neither? True\n###\nThe European Democrat Union (EDU) is one of the three European wings of the International Democrat Union, along with the European People's Party (EPP) and the Alliance of European Conservatives and Reformists (AECR). Its members include Christian democratic, liberal conservative, and conservative political parties. It is only a nominal sub-entity of the IDU, since it ceased its activities in 2002.\nQuestion: The europea democrat union members does not include liberal conservatives. True, False, or Neither? 
False\n###\nThe S-99 (Russian: \u0421-99 ) experimental submarine was the only ship of the Soviet Project 617 submarine class (NATO reporting name: Whale class) that the Soviet Union built during the early Cold War and the only Soviet submarine which had a Walter engine fuelled by high test peroxide (HTP).\nQuestion: the cold war saw the most inventions by russia True, False, or Neither? Neither\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy.\nQuestion: the owner changed out players after the 6th loss True, False, or Neither? Neither\n###\n\"We Really Shouldn't Be Doing This\" is a song written by Jim Lauderdale, and recorded by American country music artist George Strait. It was released in September 1998 as the third and final single from his album \"One Step at a Time\". It peaked at number 4 in the United States, and number 2 in Canada.\nQuestion: Jim Lauderdale is an American country song writer. True, False, or Neither?", "doc_id": 297, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4739, 26165, 36054, 18483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Uni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats.\nQuestion: Uni\u00f3n Deportiva Vall de Ux\u00f3 is a baseball team. True, False, or Neither? False\n###\nweRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo.\nQuestion: weRead has more than two million but less than four million followers across multiple social media platforms. True, False, or Neither? True\n###\nHenry Pelham Fiennes Pelham-Clinton, 4th Duke of Newcastle-under-Lyne {'1': \", '2': \", '3': \", '4': \"} (31 January 1785 \u2013 12 January 1851) was a British nobleman and politician who played a leading part in British politics in the late 1820s and early 1830s. He was styled Lord Clinton from birth until 1794 and Earl of Lincoln between 1794 and 1795.\nQuestion: Henry Pelham Fiennes Pelham-Clinton was well educated True, False, or Neither? Neither\n###\nManila Calling is a 1942 American black-and-white World War II propaganda war film drama from 20th Century Fox, produced by Sol M. Wurtzel, directed by Herbert I. Leeds, that stars Lloyd Nolan, Carole Landis, Cornel Wilde, James Gleason, Lester Matthews, Louis Jean Heydt, and Ted North.\nQuestion: There was no color in Manila Calling. True, False, or Neither? True\n###\nGerard A. 
\"Gerry\" Salton (8 March 1927 in Nuremberg \u2013 28 August 1995), was a Professor of Computer Science at Cornell University. Salton was perhaps the leading computer scientist working in the field of information retrieval during his time, and \"the father of information retrieval\". His group at Cornell developed the SMART Information Retrieval System, which he initiated when he was at Harvard.\nQuestion: Gerry Salton was born in the late 1920s True, False, or Neither?", "doc_id": 898, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [738, 25509, 27992, 43567], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Attorney () is a 2013 South Korean courtroom drama film directed and co-written by Yang Woo-suk, in his directorial debut (Yang was previously a film producer and webtoon author). With 11,375,954 tickets sold and a revenue of \u20a982.9 billion , \"The Attorney\" became the 8th best-selling Korean film of all time, and the second highest-grossing Korean film of 2013.\nQuestion: The Attorney is based off of greek mythology. True, False, or Neither? Neither\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: Mentha diemenica is popular in Asian cuisines True, False, or Neither? Neither\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996.\nQuestion: Trainspotting was Kelly Macdonalds first movie. True, False, or Neither? True\n###\nMark Miravalle (born 1959) is a professor of theology at Franciscan University of Steubenville, specializing in Mariology. He is president of \"Vox Populi Mariae Mediatrici\", a Catholic movement promoting the concepts of the Blessed Virgin Mary as Mediatrix and Co-Redemptrix.\nQuestion: Mark Miravalle was born within the last 100 years. True, False, or Neither? True\n###\nPhil Lloyd is an Australian actor and scriptwriter and partner in the production company Jungleboys. He is best known for his acting role as Myles Barlow in the Australian TV series, \"Review with Myles Barlow\" and the comedy series \"At Home with Julia\", where he played Tim Mathieson, the partner of prime minister Julia Gillard.\nQuestion: \"At Home with Julia\" starred prime minister Julia Gillard True, False, or Neither?", "doc_id": 80, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2587, 39796, 37112, 10486], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Curzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South .\nQuestion: The Club will go through another name change. True, False, or Neither? Neither\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: Every member of Nashville West was happy the band broke up. True, False, or Neither? Neither\n###\nMaurice Anthony Foley (9 October 1925 \u2013 8 February 2002) was a British Labour Party politician. He was elected as Member of Parliament (MP) for West Bromwich at a by-election in 1963, and represented the constituency until his resignation from the House of Commons in 1973. His successor in the resulting by-election was the future Speaker of the British House of Commons, Betty Boothroyd.\nQuestion: Foley was born in West Bromwich. True, False, or Neither? Neither\n###\nA semi-automatic pistol is a type of pistol that is semiautomatic, meaning it uses the energy of the fired cartridge to cycle the action of the firearm and advance the next available cartridge into position for firing. One cartridge is fired each time the trigger of a semi-automatic pistol is pulled; the pistol's \"disconnector\" ensures this behavior.\nQuestion: The \"disconnector\" is not always efficient which leads to jamming True, False, or Neither? Neither\n###\nStansted Mountfitchet is an English village and civil parish in Uttlesford district, Essex, near the Hertfordshire border, 35 mi north of London. According to the 2001 census it had a population of 5,533, increasing to 6,011 at the 2011 census. The village is served by Stansted Mountfitchet railway station.\nQuestion: Stansted Mountfitchet is located in mainland europe True, False, or Neither?", "doc_id": 434, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3180, 35833, 8350, 8979], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. 
Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898.\nQuestion: The club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later after some debate and moved to its current stadium, Blundell Park, in 1898. True, False, or Neither? Neither\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997.\nQuestion: The song is more than 3 years old True, False, or Neither? True\n###\nGordon Hendrick (February 16, 1949) is a former Republican member of the Montana Legislature. He was elected to House District 14 which represents the Superior area. Due to Montana's term limits, he was ineligible to run for re-election in 2012. He was succeeded by Republican candidate Nicholas Schwaderer for the 2013 legislature cycle.\nQuestion: Republicans have the majority in the Montana Legislature True, False, or Neither? Neither\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: facts about film true-but it maynot be best film ever about a ketch True, False, or Neither? Neither\n###\nForest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier.\nQuestion: New episodes of Forest Friends do not air on TiJi. True, False, or Neither?", "doc_id": 95, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26524, 39258, 4226, 6531], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Smithereens was produced in 1998. True, False, or Neither? True\n###\nThe ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season.\nQuestion: The MVP is the best in the world True, False, or Neither? 
Neither\n###\nUdinese Calcio sensationally finished third in Serie A, much due to Oliver Bierhoff being in the form of his life, scoring 27 goals in a league season consisting of just 34 matches. Bierhoff, coach Alberto Zaccheroni and winger Thomas Helveg all left for Milan at the end of the season, ensuring Udinese had lots of work to do to stay at the level it was.\nQuestion: Bierhoff set a record for goals scored. True, False, or Neither? Neither\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label.\nQuestion: Lance King died in 1962 True, False, or Neither? False\n###\nAodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\".\nQuestion: Aodh Mac Cathmhaoil was born in 1570. True, False, or Neither?", "doc_id": 26, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20616, 19042, 11919, 11885], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Achilles Club is a track and field club formed in 1920 by and for past and present representatives of Oxford and Cambridge Universities. Members have won 19 Olympic Gold Medals (most recently Steph Cook in the pentathlon), and held 38 World Records. One of its founding members was Evelyn Aubrey Montague, who is immortalized in the 1981 film \"Chariots of Fire\".\nQuestion: Members of the Achilles Club have won 19 Olympic Gold Medals but no silver medals. True, False, or Neither? Neither\n###\nPrincess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld (Coburg, 23 September 1781 \u2013 Elfenau, near Bern, Switzerland, 15 August 1860), also known as Grand Duchess Anna Feodorovna of Russia (Russian: \u0410\u043d\u043d\u0430 \u0424\u0451\u0434\u043e\u0440\u043e\u0432\u043d\u0430 ), was a German princess of the ducal house of Saxe-Coburg-Saalfeld (after 1826, the house of Saxe-Coburg-Gotha) who became the wife of Grand Duke Konstantin Pavlovich of Russia.\nQuestion: Princess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld never ever loved her husband Grand Duke Konstantin Pavlovich of Russia. True, False, or Neither? Neither\n###\nMichael Shane Hollis (born May 22, 1972) is a former professional American football placekicker in the National Football League. He spent most of his nine-year professional career with the Jacksonville Jaguars, kicking for the team from 1995\u20132001 and setting several team records. He then played for the Buffalo Bills and New York Giants before retiring after an injury in 2003.\nQuestion: Michael Shane Hollis is a woman. True, False, or Neither? 
False\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award.\nQuestion: Rachel Brosnahan has never talked. True, False, or Neither? False\n###\nThe church St. Ulrich is a Roman Catholic parish church in Neubau, the 7th district of Vienna, Austria. The official name is \"Pfarrkirche hl. Ulrich und Maria Trost \" (Parish church of St. Ulrich and Mary's consolation), it is also known as Ulrichskirche . The Baroque hall church with two towers was built in 1721. It is consecrated to St. Ulrich and St. Mary.\nQuestion: Pfarrkirche hl. Ulrich und Maria Trost was built in the 18 century. True, False, or Neither?", "doc_id": 772, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41580, 29490, 1031, 32185], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler.\nQuestion: Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. He has released many albums under a lot of different labels. True, False, or Neither? True\n###\nA Bhumka is the term for a traditional herbal healer in the valley of Patalkot, India. The valley is mainly home to members of the Bharia and Gond tribes, with 2,000 residents scattered between various villages and hamlets. Tribes people traditionally use herbal medicine, under the auspices of a herbal expert and holy man known as a Bhumka.\nQuestion: The people do not trust the medicine True, False, or Neither? Neither\n###\nJon Garth Murray (November 16, 1954 \u2013 September 29, 1995) was the second son of late controversial activist Madalyn Murray O'Hair, the first president and founder of American Atheists, Inc., in 1963. He was also the half-brother of the reverend William \"Bill\" Murray.\nQuestion: Murray's biological brother was much older than him. True, False, or Neither? Neither\n###\nHelvering v. Horst, 311 U.S. 112 (1940) , is an opinion of the United States Supreme Court which further developed the \u201cfruit-and-tree\u201d metaphor established in \"Lucas v. Earl\", 281 U.S. 111 (1930) . \"Horst\" is the leading case that applies the assignment of income doctrine to income from property.\nQuestion: \"Horst\" has to do with properties. True, False, or Neither? True\n###\nHakea gibbosa, commonly known as hairy hakea or rock hakea, is a shrub of the family Proteaceae native to southeastern Australia. 
It has become an environmental weed in South Africa and New Zealand, where it had been introduced for use as a hedge plant.\nQuestion: People in America use Hakea gibbosa as a hedge plant. True, False, or Neither?", "doc_id": 552, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2075, 12940, 2870, 35128], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bernardo Provenzano (] ; 31 January 1933 \u2013 13 July 2016) was a member of the Sicilian Mafia (\"Cosa Nostra\") and was suspected of having been the head of the Corleonesi, a Mafia faction that originated in the town of Corleone, and de facto \"capo di tutti capi\" (boss of all bosses) of the entire Sicilian Mafia until his arrest in 2006.\nQuestion: Bernardo Provenzano lived for more than a century. True, False, or Neither? False\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley.\nQuestion: Bernard Taylor majored in English while in college True, False, or Neither? Neither\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart.\nQuestion: In 2011 Earlly Mac released an album True, False, or Neither? Neither\n###\nFinsbury Park TMD was a railway Traction Maintenance Depot situated in London, England. It was the first purpose built main line diesel locomotive depot opened in England and it was fully commissioned in April 1960. Finsbury Park was a steam shed under British Railways with the depot code 34G; the depot code of the diesel depot under BR was FP. The nearest railway station is Finsbury Park.\nQuestion: Finsbury Park TMD was a popular railway in London True, False, or Neither? Neither\n###\nNew Orleans Square is a themed land found at Disneyland Park in Anaheim, California. Based on 19th-century New Orleans, Louisiana, the roughly three-acre area was the first land to be added to Disneyland after the park's opening, at a cost of $18 million. It is exclusive to Disneyland, although a similarly themed area can be found within Adventureland at Tokyo Disneyland.\nQuestion: New Orleans Square has been closed for many years True, False, or Neither?", "doc_id": 502, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3014, 6303, 42953, 25216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Four Cs of 21st century learning, also known as the Four Cs or 4 Cs, are four skills that have been identified by the United States-based Partnership for 21st Century Skills (P21) as the most important skills required for 21st century education: critical thinking, communication, collaboration, and creativity.\nQuestion: Communication is the most important of the Four C's. True, False, or Neither? Neither\n###\nClub Deportivo D\u00e9nia is a Spanish football team based in D\u00e9nia, in the autonomous community of Valencia. Founded in 1927 it plays in Divisiones Regionales de F\u00fatbol in the Valencian Community, holding home games at \"Estadio Diego Mena Cuesta\", with a capacity of 3.000 seats.\nQuestion: Valenica has more football players than Madrid. True, False, or Neither? Neither\n###\nWanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001.\nQuestion: Wanker Records writes music other than punk rock. True, False, or Neither? False\n###\n\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\".\nQuestion: After the song's release, Murray thought about going disco. True, False, or Neither? Neither\n###\nBig Bad Voodoo Daddy is a contemporary swing revival band from Southern California. Their notable singles include \"Go Daddy-O\", \"You & Me & the Bottle Makes 3 Tonight (Baby)\", and \"Mr. Pinstripe Suit\". The band played at the Super Bowl XXXIII half-time show in 1999.\nQuestion: Big Bad Voodoo Daddy played at the 33rd Super Bowl. True, False, or Neither?", "doc_id": 40, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9608, 16828, 17073, 30874], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Coldwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep.\nQuestion: Coldwater fish live shorter lives than tropical fish. True, False, or Neither? 
False\n###\n\"Loose Talk\" was a 1954 song written by Freddie Hart (who also recorded it on Capitol, but didn't chart) and recorded by Carl Smith and was his last number one. It was at the top spot of the \"Billboard\" country and western chart for seven weeks and had a total of thirty-two weeks listed there. The B-side was \"More Than Anything Else in the World\": it peaked at number five in the same chart.\nQuestion: Loose Talk has been sung by Reagan True, False, or Neither? Neither\n###\nJara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa.\nQuestion: Less than 40000 people spoke Jara in the year 2000 True, False, or Neither? False\n###\nThe Vermont State Police (VSP) is the state police agency for the US state of Vermont. The force has jurisdiction throughout the entire state. The Vermont Public Safety Commission directs policy and selects the commander. The commander is Colonel Matthew Birmingham. The Vermont Public Safety Commissioner is Keith W. Flynn. There are 327 sworn state troopers.\nQuestion: The force does not have jurisdiction throughout the state True, False, or Neither? False\n###\nDavid Scott \"Dave\" Foley (born January 4, 1963) is a Canadian actor, stand-up comedian, director, producer and writer. He is known as a co-founder of the comedy group \"The Kids in the Hall\", responsible for their eponymous sketch show and the feature-length film \"\". He played Dave Nelson in the sitcom \"NewsRadio\", voiced Flik in \"A Bug's Life\" and hosted the game show \"Celebrity Poker Showdown\".\nQuestion: David Scott probably started his acting career before January 4,1963. True, False, or Neither?", "doc_id": 218, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20227, 24137, 40106, 906], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Finniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla.\nQuestion: Finnies is a district that includes at least eight towns. True, False, or Neither? True\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer.\nQuestion: Joseph Maurice Ravel is famous in France and America. True, False, or Neither? 
Neither\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee\nQuestion: Mystery won the 1994 WFA. True, False, or Neither? False\n###\nHenry Gabriel Murphy (1903\u20132001) was an American businessman, sportsman and Major League Baseball club owner. From June 1950 through April 1984, he was a minority stockholder in the Washington Senators/Minnesota Twins franchise of the American League.\nQuestion: Murphy was a Major League Baseball club player from 1950-1984. True, False, or Neither? False\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state.\nQuestion: Sidalcea oregana\" var. \"calva shows up in Spain. True, False, or Neither?", "doc_id": 481, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39630, 13284, 29439, 40509], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Foals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: Foals have been covered by hole. True, False, or Neither? Neither\n###\nSeven Little Monsters is a children's picture book by American author and illustrator Maurice Sendak. \"Seven Little Monsters\" was published by Harper & Row in 1977 and served as the basis for the Canadian-Chinese television production of the same name (2000-2007).\nQuestion: Before Harper & Row published the book in 1977, it had been overlooked by other publishing companies. True, False, or Neither? Neither\n###\nJosef Jan\u00ed\u010dek (born 28 December 1947 in Prague, Czechoslovakia, now Czech Republic) is a Czech rock keyboardist, singer, accordion and guitar player. He was a former guitarist of The Primitives Group; from 1969 he played with The Plastic People of the Universe. He was also a member of Milan Hlavsa's band called \"P\u016flnoc\". Since 1990, he is a member of The Velvet Underground Revival Band.\nQuestion: Josef Jan\u00ed\u010dek has no arms. True, False, or Neither? False\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. 
It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland.\nQuestion: Phacelia pedicellata means the plant that lives True, False, or Neither? Neither\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant.\nQuestion: View from the Top was filmed in 2001 True, False, or Neither?", "doc_id": 156, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11099, 7652, 23358, 40856], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mahalakshmi (Tamil: \u0bae\u0b95\u0bbe\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf ) is an 2017 Indian-Tamil Language Family soap opera starring Kavya Shastry, Vallab, Anjali Rav and Lokesh. It replaced Nijangal and it broadcast on Sun TV on Monday to Saturday from 6 March 2017 at 12:30PM (IST). It was produced by Vision Time India Pvt Ltd and directed by Shan Karthik and M.K.Arunthavaraja.\nQuestion: Lokesh is an actor. True, False, or Neither? True\n###\nVinar\u00f2s Club de F\u00fatbol is a football team based in Vinar\u00f2s, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1965, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"El Cervol\", which has a capacity of 9,600 seats.\nQuestion: Vinar\u00f2s Club de F\u00fatbol has large capacity seating True, False, or Neither? True\n###\nNo. 27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville.\nQuestion: No. 27 Squadron RAAF has gold. True, False, or Neither? Neither\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: A Roman Catholic Church in central Bologna is now used as a large lecture hall by the University of Bologna. True, False, or Neither? True\n###\nSabrina Le Beauf (born March 21, 1958) is an American actress best known for her portrayal of Sondra Huxtable on the NBC situation comedy \"The Cosby Show\". She has voiced the character Norma Bindlebeep on the Nick at Nite animated series \"Fatherhood\", a show based on Bill Cosby's book of the same name.\nQuestion: The character of Sondra Huxtable was developed for the show Fatherhood with Bill Cosby. 
True, False, or Neither?", "doc_id": 611, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [300, 41454, 10167, 18033], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Departure of a Grand Old Man (Russian: \u0423\u0445\u043e\u0434 \u0432\u0435\u043b\u0438\u043a\u043e\u0433\u043e \u0441\u0442\u0430\u0440\u0446\u0430 , translit.\u00a0Ukhod velikovo startza) is a 1912 Russian silent film about the last days of author Leo Tolstoy. The film was directed by Yakov Protazanov and Elizaveta Thiman, and was actress Olga Petrova's first film.\nQuestion: Olga performed in many films before This one True, False, or Neither? False\n###\nNew American Writing is a once-a-year American literary magazine emphasizing contemporary American poetry, including a range of innovative contemporary writing. The magazine is published in association with San Francisco State University. \"New American Writing\" is published by OINK! Press, a nonprofit organization. The magazine appears in early June each year. First published in 1986.\nQuestion: The magazine appears in early summer each year True, False, or Neither? True\n###\nWhat Is the What: The Autobiography of Valentino Achak Deng is a 2006 novel written by Dave Eggers. It is based on the life of Valentino Achak Deng, a Sudanese child refugee who immigrated to the United States under the Lost Boys of Sudan program. It was a finalist for the National Book Award.\nQuestion: The Autobiography of Valentino Achak Deng was a finalist and awarded the National Book Award True, False, or Neither? Neither\n###\nPostal codes in Brunei are alphanumeric, consisting of two letters followed by four digits in the format of YZ0000, where Y denotes the district code, Z denotes the mukim code, the first two digits denote the area or village code, and the last two digits denote the nearest post office code (e.g. the postal code for Pantai Mentiri Golf Club is BU2529).\nQuestion: Postal codes in Brunei are alphanumeric but never start with a letter. True, False, or Neither? False\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012.\nQuestion: Tooji has won multiple Norwegian Melodi Grand Prixs. True, False, or Neither?", "doc_id": 664, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17059, 36761, 33499, 3996], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa.\nQuestion: Jara is becoming increasingly popular as a language True, False, or Neither? False\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015.\nQuestion: Seven different people have served as governor of Abia State since the state was created on August 27, 1991. True, False, or Neither? False\n###\nThe Leslie Motor Car company was a motor car company located in Detroit, Michigan in 1916. This automobile company was most likely named for the city of Leslie, Michigan. It was in operation for only one year and produced an unknown number of cars. Most cars of this era, were sold or given by their owners for scrap metal drives during World War II.\nQuestion: Most cars from 1916 were sold or given away for scrap metal drives during World War II. True, False, or Neither? True\n###\nThe Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise).\nQuestion: The Magic Roundabout was known originally in French as \"Le Man\u00e8ge enchant\u00e9\" True, False, or Neither? True\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home.\nQuestion: Coriolano won some of Rome's greatest victories. True, False, or Neither?", "doc_id": 411, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3026, 1784, 41756, 21713], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Many science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. 
This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list.\nQuestion: Humanity is now in the 19th century. True, False, or Neither? False\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg.\nQuestion: The song was released on March 31, 2015. True, False, or Neither? False\n###\nJames Matthes Talent (born October 18, 1956) is an American politician and former U.S. Senator from Missouri. He is a Republican and resided in the St. Louis area while serving in elected office. He identifies with the conservative wing of the Republican party, being outspoken on judicial appointments, abortion, flag burning, and defense issues.\nQuestion: Republicans are outspoken about flag burning. True, False, or Neither? True\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000.\nQuestion: The 2012 Sun Life Financial Players' Championship was held in May in 2012 True, False, or Neither? False\n###\nJurassic Park is a 1993 video game based on the film and novel of the same name. It was developed and published by Ocean Software and released for the Nintendo Entertainment System (NES). Ocean also released \"Jurassic Park\" on the handheld Game Boy console. The Game Boy version is a port of the NES version.\nQuestion: Jurassic Park was a movie True, False, or Neither?", "doc_id": 955, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29023, 5717, 14143, 9418], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "RBG Resources was a British public-limited firm based in London that was allegedly involved in a serious fraud worth close to \u00a3300 million (US$600 million). RBG Resources made $1.1 billion in sales in 2000. It was an affiliate of the United States based Allied Deals Inc., which was also involved in the fraud, and resulted in 14 people convicted or pleading guilty to related crimes.\nQuestion: RBG Resources is a privately owned firm. True, False, or Neither? False\n###\nThe Hill Country Film Festival is a yearly film festival in Fredericksburg, Texas, United States. It was established in 2010. The executive director is Chad Matthews, and it is presented by the Hill Country Film Society, who hold free screenings at the festival and, afterward, monthly. In 2013, \"Texas Monthly\" selected it as a \"quirky, discerning\" pick.\nQuestion: The Hill Country Film Festival will cease in 2023. True, False, or Neither? 
Neither\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing.\nQuestion: The Charter Township of Lansing is in Russia. True, False, or Neither? False\n###\nTansu \u00c7iller (] ; born 24 May 1946) is a Turkish academician, economist, and politician who served as the 22nd Prime Minister of Turkey from 1993 to 1996. She is Turkey's first and only female prime minister to date. As the leader of the True Path Party, she went on to concurrently serve as Deputy Prime Minister of Turkey and as Minister of Foreign Affairs between 1996 and 1997.\nQuestion: As the leader of the True Path Party, she went on to concurrently serve as Deputy Prime Minister of Turkey and as Minister of Foreign Affairs between 1996 and 1997 and came third in the 2002 general election. True, False, or Neither? Neither\n###\nMartin Joseph O'Malley (born January 18, 1963) is an American politician and attorney who served as the 61st Governor of Maryland from 2007 to 2015. He previously served as the Mayor of Baltimore from 1999 to 2007, and was a councilman from the Third Councilmanic District in the northeast section of the city on the Baltimore City Council from 1991 to 1999.\nQuestion: Martin Joseph O'Malley did not live in baltimore when he was the mayor True, False, or Neither?", "doc_id": 610, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38248, 31688, 5111, 14660], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Craig Lahiff (April 23, 1947 \u2013 2 February 2014) was an Australian film director. He grew up in the Adelaide suburb of Somerton Park and studied science at Adelaide University, then trained as a systems consultant before studying arts in film at Flinders University. He began working in the film industry on crews for movies such as \"Sunday Too Far Away\" and \"The Fourth Wish\".\nQuestion: His first job was on film crews. True, False, or Neither? True\n###\nForever the Moment () is a 2008 South Korean film. It is a fictionalized account of the South Korea women's handball team which competed in the 2004 Summer Olympics. The Korean title translates as \"The Best Moment in Our Lives,\" and it is believed to be the first film that revolves around the sport of handball.\nQuestion: Forever the Moment would be considered poor grammar to English speakers. True, False, or Neither? Neither\n###\nSupervixens is a 1975 sexploitation film by American filmmaker Russ Meyer. The cast features Meyer regulars Charles Napier, Uschi Digard, and Haji. The film also features Shari Eubank (in a dual role) in one of her only two film roles ever and Christy Hartburg in her only film role ever.\nQuestion: Supervixens was created by an american. True, False, or Neither? True\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. 
She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award.\nQuestion: In 2005 when Rachel was 15 years old she knew that she could have a role in the film \"The Unborn\". True, False, or Neither? Neither\n###\nThe 1992 Nutri-Metics Bendon Classic was a women's tennis tournament played on outdoor hard courts at the ASB Tennis Centre in Auckland in New Zealand that was part of Tier V of the 1992 WTA Tour. It was the seventh edition of the tournament and was held from 27 January February through 2 February 1992. Unseeded Robin White won the singles title.\nQuestion: The 1992 Nutri-Metics Bendon Classic had no loser. True, False, or Neither?", "doc_id": 776, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35855, 16598, 4303, 16021], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forestville Commonwealth is an archaeological site and national historic district located at Earlton in Greene County, New York. The district contains seven contributing sites. It represents the remains of a utopian community built in 1826-1827 as one of three Owenite experiments in New York State.\nQuestion: Forestville Commonwealth is one of 2 Owenite experiments in New York. True, False, or Neither? False\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978.\nQuestion: Gary Sutherland will be inducted into the MLB Hall of Fame True, False, or Neither? Neither\n###\nThe Australia national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the South Africa national cricket team. The tournament was won by England. Australia were captained by Syd Gregory.\nQuestion: The Australia national cricket team was captained by syd gregory True, False, or Neither? True\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: End of the Past was published in the 20th century True, False, or Neither? False\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. 
He is currently the quarterback coach at Garinger High School in Charlotte, NC.\nQuestion: Tory Woodbury is over 40 years True, False, or Neither?", "doc_id": 238, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10064, 2582, 24828, 36251], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Gospel According to the Other Mary is an opera/oratorio by contemporary American composer John Adams. The world premiere took place on May 31, 2012, at the Walt Disney Concert Hall in Los Angeles with Gustavo Dudamel conducting the Los Angeles Philharmonic who also premiered the staged version on March 7, 2013, at the same venue.\nQuestion: John Adams was from Los Angeles True, False, or Neither? Neither\n###\nCurzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South .\nQuestion: The Club will win it's next match. True, False, or Neither? Neither\n###\nHumans Need Not Apply is a 2014 short Internet documentary film, directed, produced, written, and edited by CGP Grey. The film focuses on the future of the integration of automation into economics, as well as the impact of this integration to the worldwide workforce. It was released online as a YouTube video.\nQuestion: Humans Need Not Apply is an internet documentary True, False, or Neither? True\n###\nThe Krylov\u2013Bogolyubov averaging method (Krylov\u2013Bogolyubov method of averaging) is a mathematical method for approximate analysis of oscillating processes in non-linear mechanics. The method is based on the averaging principle when the exact differential equation of the motion is replaced by its averaged version. The method is named after Nikolay Krylov and Nikolay Bogoliubov.\nQuestion: The Krylov\u2013Bogolyubov averaging method is rarely used True, False, or Neither? Neither\n###\nNativity in Black is the name of two Black Sabbath tribute albums that came out in the 1990s and 2000s. The albums were recorded with various heavy metal bands paying tribute to Black Sabbath for their influence on the heavy metal genre of rock music.\nQuestion: Nativity in Black isn't a Black Sabbath album. True, False, or Neither?", "doc_id": 708, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40457, 12406, 1703, 3004], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1969 Indian vice-presidential election was held on 30 August 1969. Gopal Swarup Pathak won the election to become the fourth Vice-President of India. 
The election was occurred since the sitting VP, Varahagiri Venkata Giri resigned to contest the presidential election after the death of incumbent President Zakir Husain.\nQuestion: The 1969 Indian vice-presidential election was held in 20th of August True, False, or Neither? False\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel.\nQuestion: He spent 1 year with the Duke City Gladiators. True, False, or Neither? Neither\n###\n\"I'm Living in Two Worlds\" is a song written by Jan Crutchfield, which was recorded and released by American country artist Bonnie Guitar. The song reached number nine on the \"Billboard\" Hot Country Singles chart and number ninety-nine on the \"Billboard\" Hot 100 in early 1966. \"I'm Living in Two Worlds\" became Guitar's first Country top-ten single and her first charting single since 1959.\nQuestion: Bonnie Guitar wrote \"I'm Living in Two Worlds.\" True, False, or Neither? False\n###\nIn the middle of 1984 a Brazilian company called Prol\u00f3gica, which made its own versions of 8 bits US computers, brought to the Brazilian market a new equipment for its personal computer series called \"CP\" (shorten of Personal Computer in Portuguese).\nQuestion: A Brazilian company called Prologica made its own computers so they must also make their own laptops True, False, or Neither? Neither\n###\nPrince Karl Alfred of Liechtenstein (Karl Alfred Maria Johannes Baptista Heinrich Aloys Georg Hartmann Ignatius; 16 August 1910 \u2013 17 November 1985) was a Liechtensteiner prince and brother of Franz Joseph II. He was the third child and second son of Prince Aloys of Liechtenstein and Archduchess Elisabeth Amalie of Austria.\nQuestion: Prince Karl Alfred of Liechtenstein has ten names. True, False, or Neither?", "doc_id": 933, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41486, 18189, 9009, 10563], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shrek Forever After (also known as Shrek 4, and Shrek Forever After: The Final Chapter) is an action-adventure video game based on the film of the same name. It was released on May 18, 2010, in North America. It is the fourth and final video game based on the movie series of \"Shrek\". This was also the last Shrek game to be developed by Activison.\nQuestion: Before spring of 2010, there were more than 3 Shrek video games produced by Activision. True, False, or Neither? True\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues.\nQuestion: V.S. 
Naipaul is a site that offers original content and aggregated news and blogs. True, False, or Neither? False\n###\nStanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh.\nQuestion: Stanley Anthony Woods is a generous person True, False, or Neither? Neither\n###\nJacques Tourneur (] ; November 12, 1904 \u2013 December 19, 1977) was a French film director known for the classic film noir \"Out of the Past\" and a series of low-budget horror films he made for RKO Studios, including \"Cat People\", \"I Walked with a Zombie\" and \"The Leopard Man\". He is also known for directing \"Night of the Demon\", that was released by Columbia Pictures.\nQuestion: Jacques Tourneur didn't speak English. True, False, or Neither? Neither\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy.\nQuestion: Jacob Murphy is 24 years old. True, False, or Neither?", "doc_id": 757, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38327, 32286, 6796, 18962], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Art of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley.\nQuestion: Art of Dying plays rock music. True, False, or Neither? True\n###\nweRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo.\nQuestion: weRead started out after 2006 on Facebook True, False, or Neither? True\n###\nAn experience point (often abbreviated to exp or XP) is a unit of measurement used in tabletop role-playing games (RPGs) and role-playing video games to quantify a player character's progression through the game. Experience points are generally awarded for the completion of quests, overcoming obstacles and opponents, and for successful role-playing.\nQuestion: An experience point is used in music games True, False, or Neither? False\n###\nCaptain Scarlett is a 1953 American Technicolor Adventure film directed by Thomas Carr, that was shot in Mexico. 
The film is set in France following the fall of Napoleon I, and stars Richard Greene playing the title role, a Robin Hood type avenger, and Brazilian actress Leonora Amar in her final screen role.\nQuestion: The film was not made in France. True, False, or Neither? True\n###\nVan Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor.\nQuestion: Van Cleef & Arpels is a producer of animals True, False, or Neither?", "doc_id": 73, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35741, 6346, 5863, 34348], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2017\u201318 Puebla season is the 70th professional season of Mexico's top-flight football league. The season is split into two tournaments\u2014the Torneo Apertura and the Torneo Clausura\u2014each with identical formats and each contested by the same eighteen teams.The Club will also play Copa MX.Rafael Garc\u00eda Torres was named the club head coach on June 5, 2017, taking over for sacked coach Jos\u00e9 Cardozo.\nQuestion: The 2017\u201318 Puebla season has tournaments for 20 teams True, False, or Neither? False\n###\nAndrea Louise Riseborough (born 20 November 1981) is an English stage and film actress. Her film appearances include \"Birdman or (The Unexpected Virtue of Ignorance)\", \"Oblivion\", \"Welcome to the Punch\", \"Disconnect\", \"Shadow Dancer\", \"W.E.\", \"Brighton Rock\", \"Made in Dagenham\", \"Never Let Me Go\", \"Happy-Go-Lucky\", and \"Venus\".\nQuestion: Andrea Riseborough is the oldest of her siblings and the only actress. True, False, or Neither? Neither\n###\nRafael Cede\u00f1o Hern\u00e1ndez is an imprisoned Mexican drug trafficker who was a high-level leader of La Familia Michoacana, a drug cartel based in the Mexican state of Michoac\u00e1n. He was the successor of Alberto Espinoza Barr\u00f3n, a drug trafficker who was arrested on 31 December 2008 by the Mexican authorities.\nQuestion: Alberto Espinoza Barr\u00f3n was arrested more than 208 years ago. True, False, or Neither? False\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina.\nQuestion: He plans to play soccer for the U.S one day True, False, or Neither? Neither\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. 
He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999.\nQuestion: Daoud Abdel Sayed was born in Egypt in the 40's. True, False, or Neither?", "doc_id": 271, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21699, 29565, 33543, 44182], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH).\nQuestion: The interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" will likely never be proven in our lifetimes. True, False, or Neither? Neither\n###\n\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals.\nQuestion: We're an American Band has no sound. True, False, or Neither? False\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961.\nQuestion: Cuban imports to the United States were stopped in 1961, but have since resumed. True, False, or Neither? Neither\n###\nDwight Yoakam is an American country music singer-songwriter. Since his debut single, \"Honky Tonk Man\" in 1986, he has released 46 singles, including two \"Billboard\" Hot Country Songs number one singles, as well as 4 number ones in Canada. In addition to having two number one singles in the United States, Yoakam also has thirteen Top 10 singles on the country chart.\nQuestion: Dwight Yoakam is under 60 years old. True, False, or Neither? Neither\n###\n\"A Hard Day's Night\" is a song by the English rock band the Beatles. Credited to Lennon\u2013McCartney, it was written by John Lennon, with some collaboration from Paul McCartney. It was released on the film soundtrack of the same name in 1964. It was also released in the UK as a single, with \"Things We Said Today\" as its B-side.\nQuestion: A Hard Day's Night was written by both John Lennon and Paul McCartney. 
True, False, or Neither?", "doc_id": 276, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9608, 17347, 35737, 14903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Coldwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep.\nQuestion: Coldwater fish live shorter lives than tropical fish. True, False, or Neither? False\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris.\nQuestion: \"Yellow Ledbetter\" is the 2nd episode of the sixth season of the Candian TV show \"The Vampire Diaries\" True, False, or Neither? False\n###\nForest Hill Vineyard (also referred to as Forest Hill Wines) is an Australian winery business based in the Great Southern wine region of Western Australia. Its vineyard is west of Mount Barker, and its winery and cellar door are further south, at Denmark.\nQuestion: Forest Hill Vineyard is very expensive True, False, or Neither? Neither\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: Santa Lucia was commissioned by an ancient Roman Emperor. True, False, or Neither? Neither\n###\nAndrea M\u00f3nica Montenegro DeFreitas, known as Andrea Montenegro (born 4 March 1969 in Lima, Peru), is a Peruvian actress and model well known for her participation in various telenovelas such as Zorro, la Espada y la Rosa, Latin Lover (2001), La viuda de la Mafia (2004) and currently in Telemundo's El Clon. She has a daughter Muriel and a son Amaru.\nQuestion: Andrea M\u00f3nica Montenegro DeFreitas speaks spanish True, False, or Neither?", "doc_id": 25, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30632, 20405, 8909, 19878], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University Church of England Academy is a secondary school located in Ellesmere Port, Cheshire. 
It was formed in 2009 by the merger of Ellesmere Port Specialist School of Performing Arts (located at Woodchurch Lane) and Cheshire Oaks High School (located at Stanney Lane).\nQuestion: the academy was founded one year prior to originally scheduled True, False, or Neither? Neither\n###\nSpittal is a hamlet or small village in East Lothian, Scotland, UK, on the B1377, east of Longniddry, south-south-west of Aberlady and to the west of Garleton and north of Gladsmuir. It is close to both Redhouse Castle, Gosford House and Spittal House.\nQuestion: Spittal is not a large city True, False, or Neither? True\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: Three Little Sisters was released on the last day of July. True, False, or Neither? True\n###\nJuan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974.\nQuestion: Juan Domingo Per\u00f3n was serving from 8 October 1895 to 1 July 1974. True, False, or Neither? False\n###\nStudio One is an American radio\u2013television anthology drama series, created in 1947 by Canadian director Fletcher Markle, who came to CBS from the CBC. It aired under several variant titles: Studio One Summer Theatre, Studio One in Hollywood, Summer Theatre, Westinghouse Studio One and Westinghouse Summer Theatre.\nQuestion: Fletcher Markle came to the Columbia Broadcasting System from the Canadian Broadcasting Corporation. True, False, or Neither?", "doc_id": 245, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37607, 4943, 12633, 35604], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mads Wiel Nygaard's Endowment is an annually awarded literary prize from the publishing house Aschehoug. The prize is a recognition of superior literary work. The publisher's editorial management makes the award based on their collective judgement of merit. Applications are not accepted.\nQuestion: Mads Wiel Nygaard's Endowment the hardest award to win. True, False, or Neither? Neither\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75.\nQuestion: Interstate 29 has lots of trucks True, False, or Neither? Neither\n###\nMahalakshmi (Tamil: \u0bae\u0b95\u0bbe\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf ) is an 2017 Indian-Tamil Language Family soap opera starring Kavya Shastry, Vallab, Anjali Rav and Lokesh. 
It replaced Nijangal and it broadcast on Sun TV on Monday to Saturday from 6 March 2017 at 12:30PM (IST). It was produced by Vision Time India Pvt Ltd and directed by Shan Karthik and M.K.Arunthavaraja.\nQuestion: Mahalakshmi was broadcast first on 6 March 2017 True, False, or Neither? True\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night.\nQuestion: Night of Terror is a 1934 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. True, False, or Neither? False\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th).\nQuestion: Alice Sue Claeys continued to play professional figure skating past 1993. True, False, or Neither?", "doc_id": 140, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20134, 29715, 7715, 32593], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California.\nQuestion: Grafton writes stories. True, False, or Neither? True\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions.\nQuestion: Project Gasbuggy was located in Texas. True, False, or Neither? False\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006.\nQuestion: Oliver Francis O'Grady was born in the 19th century. True, False, or Neither? False\n###\nGrenzschutzgruppe 9 (GSG 9) (English: Border Protection Group 9 ) is the elite Police Tactical Unit of the German Federal Police (German: \"Bundespolizei\" ). GSG 9 counterparts on the state level are the Special Deployment Commandos (German: \"Spezialeinsatzkommandos (SEK)\" ).\nQuestion: GSG 9 consists of an ethnically diverse team of police members True, False, or Neither? Neither\n###\nThe following are lists of the most populous fully defined incorporated settlements in Nigeria by population. 
This page consists three different tables, with different kinds of settlements; a list for \"defined cities\", listing the population, strictly within the defined city limits, a list for \"urban area\" population, and another list for the population within metropolitan areas.\nQuestion: The lists are about settlements in all of Africa. True, False, or Neither?", "doc_id": 336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13159, 19152, 43253, 12039], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2008 Emerald Bowl, part of the 2008-09 NCAA football bowl games season, was played on December 27, 2008, at AT&T Park, the home field of the Giants in San Francisco, California. The Miami Hurricanes of the ACC were matched against the California Golden Bears (based in nearby Berkeley, California) of the Pac-10, the first appearance by either team in the seven-year history of the Emerald Bowl.\nQuestion: The 2008 Emerald Bowl was held in a stadium True, False, or Neither? True\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\".\nQuestion: Crawling was released in the early 21st century True, False, or Neither? True\n###\nBel Ami (; also known as \"Pretty Boy\", and \"'Pretty Man\", is a South Korean romantic comedy television series starring Jang Keun-suk, IU, Lee Jang-woo and Han Chae-young. Based on the same-titled 17-volume manhwa by Chon Kye-young, it aired on KBS2 from November 20, 2013 to January 9, 2014 on Wednesdays and Thursdays at 21:55 for 16 episodes.\nQuestion: Bel Ami was written by Chon Kye-Young. True, False, or Neither? True\n###\nThe Battle of Vauchamps (14 February 1814) was the final major engagement of the Six Days Campaign of the War of the Sixth Coalition. It resulted in a part of the Grande Arm\u00e9e under Napoleon I defeating a superior Prussian and Russian force of the Army of Silesia under Field-marshal Gebhard Leberecht von Bl\u00fccher.\nQuestion: The Battle of Vauchamps was a basketball game. True, False, or Neither? False\n###\nJean le F\u00e8vre de Saint-Remy or Jean Lefebvre de Saint-Remy (c. 1394 \u2013 June 16, 1468) born in Abbeville, was a Burgundian chronicler during the Hundred Years' War and lord (\"seigneur\") of Saint Remy, la Vacquerie, Avesnes and Morienne. He is also known by the formal title of authority \"Toison d'or\" (Golden Fleece) because he served as the King of Arms to the Order of the Golden Fleece.\nQuestion: He was born in 1300 + 52 True, False, or Neither?", "doc_id": 168, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38818, 43431, 44745, 11389], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "San Francisco Bay Ferry is a passenger ferry service on the San Francisco Bay, administered by the San Francisco Bay Area Water Emergency Transportation Authority (WETA). San Francisco Bay Ferry is not affiliated with Golden Gate Ferry, which provides passenger ferry service to Marin County.\nQuestion: People really enjoy having the Ferry as a means of transportation. True, False, or Neither? Neither\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th).\nQuestion: Claeys did not place 8th in the 92 Word Championships. True, False, or Neither? True\n###\nThe MAV-1 (Maneuvering Air Vehicle) is a low observable Unmanned Air Vehicle prototype developed between ST Aerospace and Defence Science and Technology Agency for its swarming unmanned air vehicle research programme. The prototype was unveiled in Asian Aerospace 2004 and the first test flight was reported in 2005.\nQuestion: The first test flight was reported less than 10 years ago. True, False, or Neither? False\n###\nThe roots of the Orton Ceramic Foundation date back to the establishment of the \"Standard Pyrometric Cone Company\" in 1896 by Edward J. Orton, Jr.. In 1894, he was appointed the first Chairman of the Ceramic Engineering Department at The Ohio State University, the first ceramic engineering school in the United States.\nQuestion: The roots of Orton was sprung forth before the 97th year of the 1800's True, False, or Neither? True\n###\nRiver Raid is a scrolling shooter video game designed and developed by Carol Shaw, and published by Activision in 1982 for the Atari 2600 video game console. Over a million game cartridges were sold. Activision later ported the title to the Atari 5200, ColecoVision, and Intellivision game consoles, as well as to the Commodore 64, IBM PCjr, MSX, ZX Spectrum, and Atari 8-bit family home computers.\nQuestion: Over one million, but less than 3 million cartridges of River Raid were sold. True, False, or Neither?", "doc_id": 108, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15676, 20231, 28558, 19479], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Edward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. 
Annis.\nQuestion: Edward Annis was born in the 20th century True, False, or Neither? True\n###\nPata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House.\nQuestion: Pata Nahi Rabb Kehdeyan Rangan Ch Raazi was released in 2011. True, False, or Neither? False\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality.\nQuestion: Marcellite Wall is most remembered as the voice of Minnie Mouse during her 12 years working at Walt Disney Productions. True, False, or Neither? Neither\n###\nA sodium bicarbonate rocket (sometimes called an Alka-Seltzer rocket) is a model rocket fashioned from a 35mm film canister and propelled by the pressure of a gas, often carbon dioxide, generated from the reaction of an acid with sodium bicarbonate. Sodium bicarbonate rockets are often used in science classes to demonstrate principles of chemistry and physics.\nQuestion: Sodium bicarbonate are also as model to show to students some principle. True, False, or Neither? True\n###\nTake Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\".\nQuestion: It Takes Two was Marvin Gaye's most successful song. True, False, or Neither?", "doc_id": 187, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27786, 17411, 36401, 20434], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "American Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company.\nQuestion: American Motors Incorporated is not the same company as American Motors Corporation. True, False, or Neither? True\n###\nThis is a list of United States Air Force test squadrons. It covers units considered to be part of the Air Force and serves as a break out of the comprehensive List of United States Air Force squadrons. Most units in this list are assigned to Air Force Materiel Command, however, a few reside in other Major Commands of the United States Air Force.\nQuestion: The list is classified. 
True, False, or Neither? Neither\n###\nMasquerade (Russian: \u041c\u0430\u0441\u043a\u0430\u0440\u0430\u0434 ) is a verse play written in 1835 by the Russian Romantic writer Mikhail Lermontov. The four-act play, set in 1830's St. Petersburg aristocratic society, highlights the rebellious spirit and noble mind of the protagonist, Yevgeny Arbenin. It is often compared with Shakespeare's \"Othello\" in its essential story line.\nQuestion: Mikhail Lermontov wrote Masquerade in the 1830s True, False, or Neither? True\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old.\nQuestion: John Cameron Urschel is a retired professional football player True, False, or Neither? True\n###\nWooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton).\nQuestion: The single \"Sick and Tired\" features Anthony Hamilton who was born in 2003 True, False, or Neither?", "doc_id": 927, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44386, 15384, 42919, 34986], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lady Pamela Carmen Louise Hicks (\"n\u00e9e\" Mountbatten; born 19 April 1929) is a British aristocrat. She is the younger daughter of the 1st Earl Mountbatten of Burma by his wife, Edwina Mountbatten. Through her father, Lady Pamela is a first cousin of Prince Philip, Duke of Edinburgh and a great niece of the last Empress of Russia, Alexandra Feodorovna.\nQuestion: Lady Pamela could not produce children. True, False, or Neither? Neither\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX.\nQuestion: One of the shows featured a dingo. True, False, or Neither? Neither\n###\nFriday: The Animated Series was a short-lived animated television series based on the \"Friday\" film series. The show is directed by Kevin Lofton and is co-produced and co-distributed by New Line Television, a subsidiary of New Line Cinema (the distributors of the \"Friday\" movies), MTV2, and Ice Cube's Cubevision. The series only lasted for 8 episodes.\nQuestion: Friday: The Animated Series appeared on MTV on Friday at 8. True, False, or Neither? Neither\n###\nMargaret Munnerlyn Mitchell (November 8, 1900 \u2013 August 16, 1949) was an American author and journalist. 
One novel by Mitchell was published during her lifetime, the American Civil War-era novel, \"Gone with the Wind\", for which she won the National Book Award for Most Distinguished Novel of 1936\nQuestion: Margaret's first, middle, and last name all start with M. True, False, or Neither? True\n###\nThe final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: India and West Indies both made three consecutive world cup final appearances True, False, or Neither?", "doc_id": 550, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25513, 42079, 20395, 21864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Waterloo Corner is a rural/urban suburb approximately 22 kilometres north of Adelaide, the capital city of South Australia. Most of the land is used for agricultural purposes, including wheat, olives, grapes and tomatoes. Port Wakefield Road, and thus a lot of heavy freight, traverses the suburb.\nQuestion: Waterloo Corner is east of Adelaide True, False, or Neither? False\n###\nPrincess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer.\nQuestion: The princess was related to the pretender of the throne. True, False, or Neither? True\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle.\nQuestion: Boon Brewery produces beer using techniques that are traditional and classic. True, False, or Neither? False\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant.\nQuestion: View from the Top was seen by Amy. True, False, or Neither? Neither\n###\nJeffrey B. Miller, AA, BS, MPA, was the former commissioner of the Pennsylvania State Police. Miller, a native of Harrisburg, Pennsylvania, served in that position from March 24, 2003, after being confirmed by the Pennsylvania State Senate, until August 8, 2008.\nQuestion: Jeffrey B. 
Miller was a state police commissioner from 2003 until march 24 2008 True, False, or Neither?", "doc_id": 258, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11282, 37960, 45240, 35032], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lois Cleveland Chiles (born April 15, 1947) is an American actress and former fashion model known for her roles as Dr. Holly Goodhead in the 1979 James Bond film \"Moonraker\", and as a hit and run driver in 1987's \"Creepshow 2\", as well as such films as \"The Great Gatsby\", \"The Way We Were\", \"Death on the Nile\" and \"Broadcast News\".\nQuestion: Lois will star in future films. True, False, or Neither? Neither\n###\nMoody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group.\nQuestion: He was the only performer on the album. True, False, or Neither? Neither\n###\nThe 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau.\nQuestion: The 3rd Macau International Movie Festival subsequently honored Chinese films on December 7, 2012. True, False, or Neither? False\n###\nJenni Falconer (born 12 February 1976) is a Scottish radio and television presenter best known for her roles on the ITV daytime show \"This Morning\", where she is a regular travel reporter. Falconer was a regular presenter of the National Lottery Draws on BBC One.\nQuestion: She works many national lottery draws True, False, or Neither? Neither\n###\nMineral County Airport (FAA LID: 9S4) is a county-owned public-use airport located two nautical miles (3.7 km) southeast of the central business district of Superior, a town in Mineral County, Montana, United States. According to the FAA's National Plan of Integrated Airport Systems for 2011-2015, it is categorized as a \"general aviation\" facility.\nQuestion: Mineral County Airport is in the southern hemisphere. True, False, or Neither?", "doc_id": 510, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4597, 37109, 23251, 2572], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise).\nQuestion: The magic roundabout was created in norway True, False, or Neither? False\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: New Day is a 1949 book by Jamaican author V. S. Reid. It is over 24 years old True, False, or Neither? True\n###\nJersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide.\nQuestion: Jersey Boys was released in 67 countries. True, False, or Neither? Neither\n###\nSherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983.\nQuestion: Sherwood Stewart ranked high in doubles tennis in 1983. True, False, or Neither? True\n###\nKate Kendall (born 27 July 1973) is an Australian actress best known for her role in the long-running Nine Network Australian drama \"Stingers\". She joined the cast of long-running television soap opera \"Neighbours\" in 2013 as the established character Lauren Turner.\nQuestion: Kate Kendall's name starts with A. True, False, or Neither?", "doc_id": 282, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18877, 5161, 2528, 4170], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. 
The hoard is often known as the Bactrian gold.\nQuestion: Viktor Sarianidi led the Soviet invasion of Afghanistan. True, False, or Neither? False\n###\nNogiBingo! ( stylized as NOGIBINGO!) is a Japanese television variety show starring Japanese idol girl group Nogizaka46. Ijily Okada, who is known for many AKB48 related show such as \"AKB48 Nem\u014dsu TV\", hosted the program. The show firstly aired on July 3, 2013, as part of the variety show \"Nogizaka46 x HKT48 Kanbangumi Battle!\", and it became an independent show from the second season.\nQuestion: NogiBingo! became an independent show in the third season True, False, or Neither? False\n###\nValentine is a 2001 American slasher film directed by Jamie Blanks, and starring Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. Loosely based on the novel of the same name by Tom Savage, the film follows a group of women in San Francisco who are stalked by a man whom they tormented during their childhood.\nQuestion: Valentine is an American slasher film directed by Tom Savage. True, False, or Neither? False\n###\nThe Chullachaki (Quechua, \"one-footed\", from \"chulla\" or \"ch'ulla\" = single, odd, unpaired, asymmetric, \"chaki\" = foot; spelling sometimes also used in Spanish) or Chullachaqui (hispanicized spelling), also known as the Shapishico, is a mythical forest creature of the Peruvian and Brazilian Amazonian jungle.\nQuestion: There is no such thing as mythical beasts in Amazon culture. True, False, or Neither? False\n###\nAziyad\u00e9 (1879; also known as Constantinople) is a novel by French author Pierre Loti. Originally published anonymously, it was his first book, and along with \"Le Mariage de Loti\" (1880, also published anonymously), would introduce the author to the French public and quickly propel him to fame; his anonymous persona did not last long.\nQuestion: \"Le Mariage de Loti\" is the sequel to Aziyad\u00e9. True, False, or Neither?", "doc_id": 158, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30189, 11014, 9265, 29828], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Savoy Brown, originally known as the Savoy Brown Blues Band, are an English blues rock band formed in Battersea, south west London in 1965. Part of the late 1960s blues rock movement, Savoy Brown primarily achieved success in the United States, where they promoted their albums with non-stop touring.\nQuestion: Savoy Brown never crossed an ocean True, False, or Neither? False\n###\nArgonotes, the unofficial band of the Toronto Argonauts is an all volunteer organization committed to bringing quality musical entertainment and a \"traditional football atmosphere\" to all Argonauts home football games. Comprising more than 50 musicians on most game days, Argonotes is the largest musical organization associated with the CFL.\nQuestion: Argonotes is an official band. True, False, or Neither? False\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. 
The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: The film was released on the last day of July. True, False, or Neither? True\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class.\nQuestion: Youth in Guatemala are blue eyed. True, False, or Neither? Neither\n###\nRed Earth, White Lies: Native Americans and the Myth of Scientific Fact is a book by Native American author Vine Deloria, originally published in 1995. The book's central theme is to criticize the scientific consensus which has, in his words, created \"a largely fictional scenario describing prehistoric North America\".\nQuestion: Vine Deloria wrote a fictional book about prehistoric North America. True, False, or Neither?", "doc_id": 311, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36572, 35043, 30769, 10331], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson.\nQuestion: \nLathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. He also owns a boat. True, False, or Neither? Neither\n###\nJenni Falconer (born 12 February 1976) is a Scottish radio and television presenter best known for her roles on the ITV daytime show \"This Morning\", where she is a regular travel reporter. Falconer was a regular presenter of the National Lottery Draws on BBC One.\nQuestion: She is an expert at presenting the lottery True, False, or Neither? Neither\n###\nSelma Diamond (August 6, 1920 \u2013 May 13, 1985) was an American comedic actress and radio and television writer, known for her high-range, raspy voice, and her portrayal of Selma Hacker on the first two seasons of the NBC television comedy series \"Night Court\".\nQuestion: Selma Diamond was married once True, False, or Neither? Neither\n###\nJames Brown (born February 25, 1951), commonly called \"J.B.\", is an American sportscaster known for being the host of \"The NFL Today\" on CBS Sports and \"Thursday Night Football\" on CBS Sports and NFL Network. He is also the Special Correspondent for CBS' news division. Also, he is best known as the former host of the FOX network's NFL pregame show, \"Fox NFL Sunday\" for 11 years.\nQuestion: James Brown was born in the same month as Valentine's Day. True, False, or Neither? 
True\n###\n\"In Due Time\" is the lead single from Killswitch Engage's sixth studio album, \"Disarm the Descent\". The song is the band's first single to feature vocalist Jesse Leach since 2003's \"The Element of One\". The song charted at no. 23 on the Active rock chart and no. 26 on the Mainstream Rock chart.\nQuestion: \"The Element of One\" did far better than \"In Due Time\" True, False, or Neither?", "doc_id": 625, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41325, 12038, 22495, 2579], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Douglas Daniel Weight (born January 21, 1971) is an American professional ice hockey coach, executive and former player. He is currently the head coach and assistant general manager for the New York Islanders. During his 19-year National Hockey League career, he played for the New York Rangers, Edmonton Oilers, Carolina Hurricanes, Anaheim Ducks, St. Louis Blues and the New York Islanders.\nQuestion: Douglas Daniel Weight is on television coaching. True, False, or Neither? Neither\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968.\nQuestion: 1974 New York Mets season is famous True, False, or Neither? Neither\n###\nWKKF \"(102.3 FM)\" - branded as Kiss 102.3 - is a Top 40 (CHR) station licensed to Ballston Spa, New York and serving the Capital District and Adirondacks. The station is owned by iHeartMedia and broadcasts at 102.3 FM at 4,100 watts ERP from a transmitter in Clifton Park, New York on a tower shared with WDCD-FM and WTMM-FM.\nQuestion: iHeart Media has over 100 stations True, False, or Neither? Neither\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall.\nQuestion: There are many national museums in Beijing. True, False, or Neither? Neither\n###\nThe San Pablo Reservoir is an open cut terminal water storage reservoir owned and operated by the East Bay Municipal Utility District (EBMUD). It is located in the valley of San Pablo Creek, north of Orinda, California and south of El Sobrante and Richmond, east of the Berkeley Hills between San Pablo Ridge and Sobrante Ridge.\nQuestion: The word \"ridge\" appears four times in this context. True, False, or Neither?", "doc_id": 774, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27922, 2862, 8661, 35656], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title.\nQuestion: The 2011 Sudirman Cup was held more than 6666 days ago. True, False, or Neither? False\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song.\nQuestion: The word \"round\" appears multiple times in the lyrics of one of the songs on \"Shake it Up\" by The Cars. True, False, or Neither? True\n###\nChristmas Bounty is a 2013 television film directed by Gil Junger. It was produced by WWE Studios and stars Francia Raisa, Mike \"The Miz\" Mizanin and Will Greenberg. It premiered on ABC Family during their 25 Days of Christmas block on November 26, 2013.\nQuestion: Junger also directed Christmas Bounty 2. True, False, or Neither? Neither\n###\nAnime Speed is a megamix compilation album of \"Dancemania\"'s \"Speed\" series, released by EMI Music Japan in 2005. The album features uptempo cover remixes of popular theme songs for various anime works such as \"Dragon Ball Z\", \"Slam Dunk\" and \"Neon Genesis Evangelion\". The successor, \"Anime Speed Newtype Edition\", was released in 2006.\nQuestion: Anime Speed and Anime Speed Newtype Edition are the only two albums to have featured anime music in 2005 and 2006. True, False, or Neither? Neither\n###\nHerv\u00e9 Le Tellier (born 21 April 1957) is a French writer and linguist, and a member of the international literary group Oulipo (Ouvroir de Litt\u00e9rature Potentielle, which translates roughly as \"workshop of potential literature\"). Other notable members have included Raymond Queneau, Georges Perec, Italo Calvino, Jacques Roubaud, Jean Lescure and Harry Mathews.\nQuestion: Oulipo joined writers and other literary enthusiasts on the international state. True, False, or Neither?", "doc_id": 367, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30540, 34695, 15985, 44453], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brown University is a private Ivy League research university in Providence, Rhode Island, United States. 
Founded in 1764 as the College in the English Colony of Rhode Island and Providence Plantations, Brown is the seventh-oldest institution of higher education in the United States and one of the nine colonial colleges chartered before the American Revolution.\nQuestion: most students only get 2 year degrees True, False, or Neither? Neither\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Colorado Avalanche forward is Canadian True, False, or Neither? True\n###\nLik\u00ebng\u00eb are pork sausages flavored with salt, pepper and seed of Fennel (far\u00eb mbrai), made in Piana degli Albanesi and Santa Cristina Gela. \"Lik\u00ebng\u00eb\" is the Undefinite Singular, \"Lik\u00ebnga\" is the Definite Singular and is cognate with the Italian Lucanica and the Greek Loukaniko.\nQuestion: Likenge is cognate with the Italian Lucanica. True, False, or Neither? True\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6.\nQuestion: The Nydala Abbey was still in operation in 1500. True, False, or Neither? Neither\n###\nTadpoles is the third album by the Bonzo Dog Band. It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\".\nQuestion: Bonzo Dog Band's second album was released two years before Tadpoles. True, False, or Neither?", "doc_id": 127, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30123, 29092, 2998, 16051], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dobbs Ferry is a village in Westchester County, New York. The population was 11,093 at the 2016 census. The Village of Dobbs Ferry is located in, and is a part of, the town of Greenburgh. The village ZIP code is 10522. Most of the Village falls into the boundaries of the Dobbs Ferry Union Free School District.\nQuestion: Dobbs Ferry is named after a ferry True, False, or Neither? Neither\n###\nMajid (born 1975) is a Danish rapper of Moroccan-Berber origin. Residing in Aved\u00f8re near Copenhagen, Denmark he was a contributor to Danish act Outlandish, which also hails from Br\u00f8ndby Strand. Majid contributed to their tours and performed as a special guest in the warm-up for their acts.\nQuestion: Majid is a main act along with Danish act Outlandish on their mutual tour. True, False, or Neither? 
False\n###\nThe Nigeria U-20 men's national soccer team, also known as the Nigeria Under-20s or nicknamed the \"Flying Eagles\", is the youth team for national soccer in Nigeria. It plays a large role in the development of Nigerian soccer, and is considered to be the feeder team for the Nigeria men's national soccer team and is controlled by the Nigeria Football Federation.\nQuestion: nigerian national socce team under 20's is also known as the flying eagles True, False, or Neither? True\n###\nMate Pavi\u0107 (born 4 July 1993) is a Croatian professional tennis player specialising in doubles. Mate won the 2016 US Open mixed doubles title in partnership with Laura Siegemund, and reached the 2017 Wimbledon Championships men's doubles finals partnering Oliver Marach.\nQuestion: Mate Pavi\u0107 is an American tennis player True, False, or Neither? False\n###\nVinar\u00f2s Club de F\u00fatbol is a football team based in Vinar\u00f2s, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1965, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"El Cervol\", which has a capacity of 9,600 seats.\nQuestion: The football stadium \"El Cervol\" was built in 1965 B.C. True, False, or Neither?", "doc_id": 642, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [135, 5318, 14429, 9134], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Emmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\".\nQuestion: The 5th Canadian Screen Awards were held four years after the 1st awards were given. True, False, or Neither? True\n###\nGeorge Edward Foreman (born January 10, 1949) is an American former professional boxer who competed from 1969 to 1977, and from 1987 to 1997. Nicknamed \"Big George\", he is a two-time world heavyweight champion and an Olympic gold medalist. Outside the sport he is an ordained minister, author, and entrepreneur.\nQuestion: George Edward Foreman was an American boxer who won 2 gold medals at the olympics. True, False, or Neither? True\n###\nPolarbr\u00f6d is a Swedish bread company. Their head office is in \u00c4lvsbyn in northern Sweden. Polarbr\u00f6d is Sweden's third-largest bread company. Its typical product is a soft compact bread formed into round, flat shapes. It is also noted for ready-made sandwiches produced from such bread and reindeer meat, which was introduced as a product in the 1960s under the name \"renkl\u00e4mma\".\nQuestion: Northern Sweden contains the headquarters of the company. True, False, or Neither? True\n###\nMosiula Mea'alofa \"Lofa\" Tatupu (born November 15, 1982) is a former American football linebacker who played six seasons in the National Football League (NFL). He was an assistant linebackers coach with the Seattle Seahawks. He played college football for the University of Southern California (USC). 
Tatupu was drafted by the Seattle Seahawks in the second round of the 2005 NFL Draft.\nQuestion: Mosiula played football in Highschool True, False, or Neither? Neither\n###\nHercules and Love Affair is the debut studio album by American electronic music band Hercules and Love Affair, released on March 10, 2008 by DFA Records. The album was produced by Andrew Butler and Tim Goldsworthy. Andrew Raposo (of fellow DFA band Automato) and Tyler Pope (of !!!) contributed bass to the album, while Antony Hegarty co-wrote and performed vocals on select songs.\nQuestion: Hercules and Love Affair is the debut studio album by American electronic music band Hercules and Love Affair, released on in the third month of the year that equals 50.2 multiplied by 40 by DFA Records. True, False, or Neither?", "doc_id": 994, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34586, 6334, 17764, 36971], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2015\u201316 Dartmouth Big Green men's basketball team represented Dartmouth College during the 2015\u201316 NCAA Division I men's basketball season. The Big Green, led by sixth-year head coach Paul Cormier, played their home games at Leede Arena in Hanover, New Hampshire and were members of the Ivy League. The Big Green finished the season 10\u201318, 4\u201310 in Ivy League play to finish in sixth place.\nQuestion: Paul Cormier has led several other college basketball teams throughout his career. True, False, or Neither? Neither\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes.\nQuestion: Hook, Line and Sinker is an American fishing television program produced in Hobart, Tasmania True, False, or Neither? False\n###\nThe Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat.\nQuestion: Spanish Mastiff is a good guard dog True, False, or Neither? Neither\n###\nChristopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio.\nQuestion: Christopher Lawrence (born 24 December 1926) is a classical musician, author, and conductor. True, False, or Neither? False\n###\nWriting Degree Zero (French: \"Le degr\u00e9 z\u00e9ro de l'\u00e9criture\" ) is a book of literary criticism by Roland Barthes. 
First published in 1953, it was Barthes' first full-length book and was intended, as Barthes writes in the introduction, as \"no more than an Introduction to what a History of Writing might be.\"\nQuestion: Writing Degree Zero was the first piece of literary criticism Barthes wrote. True, False, or Neither?", "doc_id": 861, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1433, 12862, 34190, 30860], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "High Bridge is a census-designated place (CDP) in Snohomish County, Washington, United States. The population was 2,994 at the 2010 census. High Bridge includes the Echo Lake community and the former Echo Lake CDP, which was superseded by the larger High Bridge CDP in 2010.\nQuestion: The population of High Bridge is now over 3,000. True, False, or Neither? Neither\n###\nThe 2000 Singapore Challenge, also known as the 2000 Godrej Singapore Challenge for sponsorship reasons, was a One Day International cricket tournament which took place between 20\u201327 August 2000. The tournament was held in Singapore. The tournament was won by South Africa who defeated Pakistan by 93 runs by the Duckworth\u2013Lewis method.\nQuestion: The 2000 Singapore Challenge was a highly anticipated tournament True, False, or Neither? Neither\n###\n\"Close Every Door\" is a song from the musical \"Joseph and the Amazing Technicolor Dreamcoat\" by Tim Rice and Andrew Lloyd Webber. It is the penultimate song of the first act of the musical, sung by Joseph while imprisoned for his supposed relationship with Potiphar's wife. Along with \"Any Dream Will Do\", it is one of the most popular songs from the musical.\nQuestion: \"Close Every Door\" is a song from the musical \"Joseph and the Amazing Technicolor Dreamcoat\" by Tim Rice and Andrew Lloyd Webber. There was no music in it. True, False, or Neither? False\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films.\nQuestion: Ileana Carusio visited the USA this year True, False, or Neither? Neither\n###\nAttitude City is the third studio album by the American comedy duo Ninja Sex Party. The album was released on July 17, 2015. Six tracks from the album, \"Party of Three,\" \"Dragon Slayer,\" \"Attitude City,\" \"Why I Cry,\" \"Peppermint Creams,\" and \"Road Trip\" were all released as singles on their YouTube channel prior to its release.\nQuestion: Ninja Sex Party released many videos before July 17 2015 True, False, or Neither?", "doc_id": 807, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19692, 39200, 28806, 41580], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Latin American Boom was a flourishing of literature, poetry and criticism in Latin America during the 1960s and 1970s, when writers from this region explored new ideas and came to international renown in a way that had not happened previously. Major figures of the boom include Julio Cort\u00e1zar, Gabriel Garc\u00eda M\u00e1rquez, Carlos Fuentes, Jorge Luis Borges, and Mario Vargas Llosa.\nQuestion: The Latin American Boom happened in 2002 True, False, or Neither? False\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Sheree was born in a public hospital. True, False, or Neither? Neither\n###\nFifth Harmony is the self-titled third studio album by American girl group Fifth Harmony, released on August 25, 2017, through Syco Music and Epic Records. Its lead single, \"Down\", which features rapper Gucci Mane, was released on June 2, 2017. It is the group's first album following the departure of Camila Cabello in December 2016.\nQuestion: Before September of 2017, the group Fifth Harmony made at three albums. True, False, or Neither? True\n###\nDan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler.\nQuestion: Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. He has released many albums under a lot of different labels. True, False, or Neither? True\n###\nHave a Little Faith is a Hallmark Hall of Fame television movie. The film debuted on ABC on November 27, 2011, as the first \"Hallmark Hall of Fame\" film broadcast since CBS cancelled the series earlier in 2011. It was the first \"Hallmark Hall of Fame\" film broadcast on ABC since 1995.\nQuestion: ABC did not broadcast any Hallmark films for 20 years over the turn of the last century. True, False, or Neither?", "doc_id": 366, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18237, 42702, 22846, 9056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Girl from Jones Beach is a 1949 American comedy film directed by Peter Godfrey and written by I. A. L. Diamond. The film stars Ronald Reagan, Virginia Mayo, Eddie Bracken, Dona Drake, Henry Travers and Lois Wilson. The film was released by Warner Bros. 
on July 16, 1949.\nQuestion: The Girl from Jones Beach is an american film True, False, or Neither? True\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups.\nQuestion: Quesadillas are often eaten in Mexico. True, False, or Neither? True\n###\nErnest Guiraud (] ; 26 June 1837 \u2013 6 May 1892) was a French composer and music teacher born in New Orleans, Louisiana. He is best known for writing the traditional orchestral recitatives used for Bizet's opera \"Carmen\" and for Offenbach's opera \"Les contes d'Hoffmann\" (\"The Tales of Hoffmann\").\nQuestion: His music was not popular True, False, or Neither? Neither\n###\nZina Lynna Garrison (born November 16, 1963) is a former professional tennis player from the United States. During her career, she was a women's singles runner-up at Wimbledon in 1990, a three-time Grand Slam mixed doubles champion, and a women's doubles gold medalist and singles bronze medalist at the 1988 Olympic Games. She is currently coaching Taylor Townsend.\nQuestion: Taylor Townsend is Garrison's first student. True, False, or Neither? Neither\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\".\nQuestion: He lived for over 90 years True, False, or Neither?", "doc_id": 68, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13891, 10366, 34538, 20647], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Benny Bell (born Benjamin Samberg or Benjamin Zamberg, March 21, 1906 \u2013 July 6, 1999) was an American singer-songwriter who reached popularity in the 1940s, with a comeback in the 1970s. He is particularly remembered for his risqu\u00e9 but cheerfully optimistic songs.\nQuestion: Benny Bell released more songs in the 1940s than the 1970s. True, False, or Neither? Neither\n###\nHeresy is a comedy talk show on BBC Radio 4, created and originally hosted by David Baddiel, now hosted by Victoria Coren Mitchell. In the show, the presenter and a panel of guests commit \"heresy\" by challenging people's most deeply received opinions on a subject, in front of a studio audience.\nQuestion: Victoria Coren MItchell is the host of Heresy True, False, or Neither? True\n###\nFifth Harmony is the self-titled third studio album by American girl group Fifth Harmony, released on August 25, 2017, through Syco Music and Epic Records. Its lead single, \"Down\", which features rapper Gucci Mane, was released on June 2, 2017. 
It is the group's first album following the departure of Camila Cabello in December 2016.\nQuestion: Fifth Harmony released two albums before the departure of Camila Cabello. True, False, or Neither? True\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry.\nQuestion: A man whose name rhymes with \"luck hairy\" made rock music over six decades ago. True, False, or Neither? True\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\".\nQuestion: Zombie Juice is only known for being a rapper and a director True, False, or Neither?", "doc_id": 208, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42196, 16287, 33612, 35564], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality.\nQuestion: Marcellite Wall was also the voice of Maggie Simpson. True, False, or Neither? Neither\n###\nDual-role transvestism is the formal diagnosis used by psychologists and physicians to describe people who wear clothes of the opposite sex to experience being the opposite sex temporarily, but don't have a sexual motive or want gender reassignment surgery. The International Classification of Diseases (ICD-10) list three diagnostic criteria for \"Dual-role transvestism\" (F64.1):\nQuestion: Dual-role transvestism is what Tom has. True, False, or Neither? Neither\n###\nSeton Catholic Preparatory High School is a college preparatory, co-educational Catholic high school in Chandler, Arizona, United States. Named after St. Elizabeth Ann Seton, the school was established in 1954 and is staffed by the Sisters of Charity of Seton Hill.\nQuestion: Seton Catholic Preparatory High School costs thousands True, False, or Neither? Neither\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks.\nQuestion: Cocaine costs under 10 usd True, False, or Neither? Neither\n###\nEl\u00ednr\u00f3s L\u00edndal is an entrepreneur in Fashion design. She established ELLA fashion label in 2008, one of the first Slow Fashion brands in the world. Elinr\u00f3s was the brands creative director and CEO. 
ELLA launched] it\u00b4s first fashion line in April 2011.\nQuestion: She does fashion design in the 2000's True, False, or Neither?", "doc_id": 684, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11037, 8163, 11916, 25069], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gorgo is a 1961 British-American science fiction monster film directed by Eug\u00e8ne Louri\u00e9. The film focuses on Gorgo, an ancient large sea monster brought back to London for exploitation, and Ogra, his even larger mother, who rampages across London to search for him. The film was featured in an episode of the cult movie-mocking television show \"Mystery Science Theater 3000\".\nQuestion: The director also made another movie featuring large sea monster True, False, or Neither? Neither\n###\nMaris Soule (born June 19, 1939) is an American author of romance and romantic suspense novels, mysteries, and short stories. Her latest book, \"Eat Crow and Die\", is a mystery novel. Her books feature a variety of settings and situations, including the Iditarod Trail Sled Dog Race, Search and Rescue dogs, barrel racing, dressage, and a Rhodesian Ridgeback puppy.\nQuestion: Maris Soule died in 1939. True, False, or Neither? False\n###\nA store-within-a-store, also referred to as shop-in-shop, is an agreement in which a retailer rents a part of the retail space to be used by a different company to run another, independent store. This concept was originally an idea proposed by the great philosopher and multi millionaire entrepreneur \"Joseph Westbrook\" of East Sussex, England.\nQuestion: Joseph Westbrook is from Sussex. True, False, or Neither? True\n###\nThe Golden Fetter is a 1917 American romance silent film directed by Edward LeSaint and written by Charles Tenney Jackson and Charles Maigne. The film stars Wallace Reid, Anita King, Tully Marshall, Guy Oliver, Walter Long and Mrs. Lewis McCord. The film was released on January 25, 1917, by Paramount Pictures.\nQuestion: The Golden Fetter was based on a true story. True, False, or Neither? Neither\n###\nStormRider was a simulator ride at Tokyo DisneySea. It simulated going into a weather storm in a futuristic airplane (a \"StormRider\") to dissipate the storm. The attraction opened on September 4, 2001, in the Port Discovery land of Tokyo DisneySea. The attraction closed on May 17, 2016 and replaced by a new Finding Nemo/Finding Dory simulator ride called Nemo & Friends SeaRider.\nQuestion: The attraction opened on the 17th day of the fourth month True, False, or Neither?", "doc_id": 199, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27263, 33185, 45029, 21707], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Requiem\" is the seventh episode in the fifth season, and the 101st overall episode, of the American crime drama television series \"NCIS\". It first aired on CBS in the United States on November 6, 2007. The episode was written by Shane Brennan and directed by Tony Wharmby.\nQuestion: \"Requiem\" was the 7th episode in the 5th season making it the first episode after the 100th episode and was watched by more than 20007 viewers. True, False, or Neither? Neither\n###\nThis Is England '86 is a 2010 British drama miniseries written by Shane Meadows and Jack Thorne, a spin-off from the 2006 film \"This Is England\". Set three years later, it focuses on the mod revival scene rather than the skinhead subculture, with the gang variously adopting an eclectic mix of clothing styles.\nQuestion: This Is England '86 had more than one writer True, False, or Neither? True\n###\nFenella Kernebone is an Australian radio and television presenter, MC and interviewer, based in Sydney, with a long record working across the arts, film, music, design, architecture and culture. Her most recent hosting roles include the presenter of By Design on Radio National and The Sound Lab on Triple J. In June 2016, she was appointed Head of Curation for TEDxSydney.\nQuestion: Sydney is Fenella Kernebones daughter. True, False, or Neither? False\n###\nMisty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue.\nQuestion: Misty Knight sold billions. True, False, or Neither? Neither\n###\nTim Witherspoon (born December 27, 1957) is an American former professional boxer who competed from 1979 to 2003. He is a two-time world heavyweight champion, having held the WBC title in 1984, and the WBA title in 1986. Upon winning his second world title, Witherspoon joined Floyd Patterson and Muhammad Ali as the only boxers to win multiple world heavyweight championships.\nQuestion: Tim Witherspoon was born before November True, False, or Neither?", "doc_id": 988, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11413, 17875, 4751, 10447], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "There are 26 states of Brazil, (Portuguese: \"estados\" ) which are the federal states of Brazil, plus the Federal District which contains the capital city, Bras\u00edlia. Below are these states listed in order of the area, from Amazonas with the greatest area, to the Federal District with the least. Brazil has a total area of 8,515,767\u00a0km, and is ranked 5th in the world.\nQuestion: Brazil has a total area less than 0.8 km. True, False, or Neither? 
False\n###\nHistorical period drama is a film genre in which stories are based on historical events and famous persons. Some historical dramas attempt to accurately portray a historical event or biography, to the degree that the available historical research will allow. Other historical dramas are fictionalised tales that are based on an actual person and their deeds.\nQuestion: HIstorical period dramas are expensive to make True, False, or Neither? Neither\n###\nLawrence Henry Johnson (1861 in Germany \u2013 1947) was a Minnesota Republican politician and a Speaker of the Minnesota House of Representatives. Johnson, a bridge contractor and engineer, came to Minnesota in 1884, and was elected to the Minnesota House of Representatives in 1900. He served five terms, serving as speaker from 1907 to 1909. Johnson died in 1947.\nQuestion: Lawrence Johnson was an engineer True, False, or Neither? True\n###\nHideki Kamiya (\u795e\u8c37 \u82f1\u6a39 , Kamiya Hideki , born December 19, 1970) is a video game designer and director working for PlatinumGames. Kamiya was formerly employed by Capcom and Clover Studio, and founded PlatinumGames in 2006, along with other former Capcom staff.\nQuestion: Hideki Kamiya was born less than 52 weeks ago. True, False, or Neither? False\n###\nAmdoxovir is a nucleoside reverse transcriptase inhibitor (NRTI) undergoing research for the treatment of HIV/AIDS. It was discovered by Raymond F. Schinazi (Emory University) and C.K. Chu (University of Georgia). It is being developed by RFS Pharma. Currently, it is in Phase II clinical studies.\nQuestion: Amdoxovir is available with a prescription. True, False, or Neither?", "doc_id": 340, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15777, 3620, 12118, 34441], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II.\nQuestion: The Spanish Army would have carried the Campo-Giro before the year 1917, True, False, or Neither? True\n###\nAmericana Deluxe is the second studio album by Big Bad Voodoo Daddy. This album is also sometimes called \"Big Bad Voodoo Daddy\", as the album cover prominently displays a stylized \"Big Bad Voodoo Daddy\" logo and does not feature the phrase \"Americana Deluxe\" on it. However, the liner notes and the band's website clearly show that the true title is indeed \"Americana Deluxe\".\nQuestion: Big Bad Voodoo Daddy is a heavy metal band. True, False, or Neither? Neither\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. 
Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock.\nQuestion: a power chord and the fifth cord are the same True, False, or Neither? True\n###\nUnlike a charitable foundation, a private foundation does not generally solicit funds from the public. And a private foundation does not have the legal requirements and reporting responsibilities of a registered, non-profit or charitable foundation. Not all foundations engage in philanthropy: some private foundations are used for estate planning purposes.\nQuestion: private foundations are never used for estate planning purposes. True, False, or Neither? False\n###\nThe Pistol model 2000 (also \"md. 2000\") is a semi-automatic pistol designed and manufactured by RomArm via the Cugir Arsenal of Romania. The pistol, chambered in 9\u00d719mm Luger is an almost-identical copy of the Magnum Research Jericho 941 (Baby Eagle). The pistol is the standard sidearm of the Romanian Army.\nQuestion: The Romanian Army does not use the Pistol model 2000 very often. True, False, or Neither?", "doc_id": 760, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11893, 36887, 2009, 35242], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton).\nQuestion: Wooden Leather ends with r. True, False, or Neither? True\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain.\nQuestion: The Santa Cova Funicular is a cheap railway True, False, or Neither? Neither\n###\nPacific Novelty was a developer of coin-operated arcade video games. \"Deep Death\" was their first title, which was later licensed by Game Plan and re-released as \"Shark attack\" (1981). \"Thief\", a \"Pac-Man\" styled maze chase, was their greatest success.\nQuestion: \"Thief\", a styled maze chase, was the greatest game released that year. True, False, or Neither? Neither\n###\n\"Are You Sitting Comfortably?\" is a 1969 song by the progressive rock band The Moody Blues. It was written jointly by band members Justin Hayward and Ray Thomas. It was recorded and released in 1969 on the Moody Blues Album \"On the Threshold of a Dream\".\nQuestion: Ray Thomas came up with the song title \"Are you Sitting Comfortably?\" True, False, or Neither? Neither\n###\nJohns Creek is a city located in Fulton County in the U.S. state of Georgia. According to the 2010 U.S. Census, the population was 76,728. The city is an affluent northeastern suburb of Atlanta. 
In 2017 Johns Creek ranked third on the \"USA TODAY\" list of \"50 best cities to live in.\"\nQuestion: Johns Creek has a population of 92,000. True, False, or Neither?", "doc_id": 600, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13657, 522, 24300, 35667], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi.\nQuestion: The movie The Godfather inspired the 1995 Indian crime film Aatank Hi Aatank directed by Dilip Shankar. True, False, or Neither? True\n###\nThe 1902\u201303 Ottawa Hockey Club season was the club's 18th season of play. The club would win the CAHL championship in a playoff with the Montreal Victorias to win the Club's first Stanley Cup. For their win, the players would each be given a silver nugget. From that day forward, the club was nicknamed the \"Silver Seven.\"\nQuestion: Winners of the Stanley Cup in in 1903 were give a piece of a metal. True, False, or Neither? True\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards.\nQuestion: Many universities in South Florida rely on this technology to teach their students. True, False, or Neither? Neither\n###\nArt of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley.\nQuestion: Art of Dying is a country band True, False, or Neither? False\n###\nThe 2007 North Indian Ocean cyclone season was an event in the annual cycle of tropical cyclone formation. The North Indian Ocean cyclone season has no official bounds, but cyclones tend to form between April and December, with peaks in May and November. These dates conventionally delimit the period of each year when most tropical cyclones form in the northern Indian Ocean.\nQuestion: The 2007 North Indian Ocean had at least one cyclone. True, False, or Neither?", "doc_id": 541, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25866, 8400, 3753, 39365], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The Most Valuable Player was given to more than one player. True, False, or Neither? True\n###\nCommon Law Cabin (original title \"How Much Lovin' Does a Normal Couple Need?\") is a 1967 exploitation film directed by Russ Meyer. The movie features Alaina Capri and Meyer regulars Babette Bardot and Jack Moran. It was co-written by Russ Meyer and Jack Moran, and filmed on location on the Colorado River in Arizona. Other portions of the film were shot in the Coachella Valley, California.\nQuestion: Common Law was filmed in Canada True, False, or Neither? False\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan.\nQuestion: Ed Cebula is a famous pinball machine designer. True, False, or Neither? Neither\n###\nThe McLaren MP4/1 (initially known as the MP4) was a Formula One racing car produced by the McLaren team. It was used during the 1981, 1982 and 1983 seasons. It was the first Formula One car to use a carbon fibre composite monocoque, a concept which is now ubiquitous.\nQuestion: A McLauren MP4/1 is very rare True, False, or Neither? Neither\n###\nThe Game Plan is a 2007 American family comedy film directed by Andy Fickman and written by Nichole Millard, Kathryn Price and Audrey Wells and starring Dwayne \"The Rock\" Johnson (marking the last film in which Johnson uses his ring name \"The Rock\" in billing). It follows an NFL quarterback who finds out he has an 8-year-old daughter from a previous relationship.\nQuestion: Nichole Millard once wrote a movie that starred Audrey Wells, which had a plot point involving a football player. True, False, or Neither?", "doc_id": 619, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27846, 18198, 29033, 2466], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"For a Minute\" is a song by English recording trio M.O. It was co-written by Jess Glynne and produced by Bless Beats and Loadstar. 
The song was released by Operator Records as a digital download on 13 April 2014 in the United Kingdom, marking the trio's debut single after buzz tracks \"On Ya\", \"Hot\", \"Wait Your Turn\", \"Come Let Me Show You\", and \"Ain't Got Time\".\nQuestion: \"For a Minute\" was written by 3 people. True, False, or Neither? Neither\n###\nFrancis Gary Powers (August 17, 1929 \u2013 August 1, 1977) \u2013 often referred to as simply Gary Powers \u2013 was an American pilot whose Central Intelligence Agency (CIA) U-2 spy plane was shot down while flying a reconnaissance mission in Soviet Union airspace, causing the 1960 U-2 incident.\nQuestion: Gary Powers was an American pilot who was shot down True, False, or Neither? True\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season.\nQuestion: The team did not work hard enough. True, False, or Neither? Neither\n###\nHuevos a la mexicana is a popular breakfast dish in Mexican cuisine. Finely chopped tomato, green chili pepper and onion is lightly fried in a hot skillet. Eggs are added and stirred until set. The heat is turned off and the coriander leaves are mixed in the eggs, adding salt. Refried beans is a common accompaniment.\nQuestion: A skillet is used in the recipe. True, False, or Neither? True\n###\nErnest Guiraud (] ; 26 June 1837 \u2013 6 May 1892) was a French composer and music teacher born in New Orleans, Louisiana. He is best known for writing the traditional orchestral recitatives used for Bizet's opera \"Carmen\" and for Offenbach's opera \"Les contes d'Hoffmann\" (\"The Tales of Hoffmann\").\nQuestion: Ernest Guiraud wrote his final piece in 1892. True, False, or Neither?", "doc_id": 840, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39208, 25171, 16126, 14333], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione.\nQuestion: Tricia Wygal married one of The Flirts. True, False, or Neither? Neither\n###\nThe 1976 European Cup Winners' Cup Final was a football match between West Ham United of England and Anderlecht of Belgium. The final was held at Heysel Stadium in Brussels on 5 May 1976. It was the final match of the 1975\u201376 European Cup Winners' Cup tournament and the 16th European Cup Winners' Cup Final.\nQuestion: West Ham United of England has never played a Belgium team True, False, or Neither? 
False\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels.\nQuestion: Louis Glenn Marson will join the New York Yankees in 2020 True, False, or Neither? Neither\n###\nSpanglish is a 2004 American romantic comedy-drama film written and directed by James L. Brooks and starring Adam Sandler, T\u00e9a Leoni, Paz Vega, and Cloris Leachman. It was released in the United States on December 17, 2004 by Columbia Pictures. The film grossed $55 million worldwide on an $80 million production budget, and received mixed reviews from critics.\nQuestion: Spanglish received Best Movie of the Year. True, False, or Neither? Neither\n###\nMount Weeks, formerly Round Mountain, is a mountain located in Coos County, New Hampshire. Mt. Weeks is the northeasternmost of the Pliny Range of the White Mountains and the highest point within the city limits of Berlin, New Hampshire. Mount Weeks is flanked to the southwest by South Weeks, and faces Terrace Mountain to the northwest across Willard Notch.\nQuestion: Mount Weeks is not located in Berlin, Germany. True, False, or Neither?", "doc_id": 153, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4397, 19662, 26241, 20329], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Eric Black is an American journalist. He was a longtime reporter for the Minnesota Star Tribune newspaper, and has also been a Twin Cities blogger. He is a columnist for online newspaper MinnPost, primarily writing about politics and the historical background of current issues.\nQuestion: eric black is from missouri True, False, or Neither? Neither\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign.\nQuestion: The 23rd Infantry are still in service. True, False, or Neither? False\n###\nLudovic (Ludo) Coeck (25 September 1955 \u2013 9 October 1985) was a Flemish-Belgian footballer who played as left winger or central midfielder. His clubs included Berchem Sport, Anderlecht, Internazionale and Ascoli Calcio. He was capped for the Belgian national team 46 times.\nQuestion: . He was capped for the Belgian national team 42 times.\n True, False, or Neither? False\n###\nPersuasion was the planned fifth studio solo album by Adam Ant, planned for 1992-3 but never released. The album has however surfaced as bootlegs, and nowadays circulates on file sharing networks. This album is one of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy, a book that revealed information on the lost recordings of many famous musicians.\nQuestion: Adam Ant was touring during 1992. True, False, or Neither? 
Neither\n###\nSverre Peak ( ) is a small peak 0.5\u00a0nautical miles (0.9\u00a0km) off the north end of Pettersen Ridge in the Conrad Mountains of Queen Maud Land. Discovered and photographed by the German Antarctic Expedition, 1938-39. Mapped by Norway from air photos and surveys by the Norwegian Antarctic Expedition, 1956\u201360, and named for Sverre Pettersen, steward with Norwegian Antarctic Expedition, 1957-58.\nQuestion: A nautical mile is 1.8 kilometers. True, False, or Neither?", "doc_id": 18, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33532, 32688, 37024, 22140], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Piazza Colonna is a piazza at the center of the Rione of Colonna in the historic heart of Rome, Italy. It is named for the marble Column of Marcus Aurelius, which has stood there since AD 193. The bronze statue of Saint Paul that crowns the column was placed in 1589, by order of Pope Sixtus V. The Roman Via Lata (now the Via del Corso) runs through the piazza's eastern end, from south to north.\nQuestion: The Piazza Colonna is named after the Column of Marcus Aurelius. True, False, or Neither? True\n###\nLee Webb was the anchor of \"The 700 Club\", the flagship program of The Christian Broadcasting Network (CBN), and \"Newswatch\", a half-hour daily news program also on CBN. He was born in Pompano Beach, Florida. Since September 2013, he has served as the vice-president of broadcasting for Ligonier Ministries in Sanford, Florida.\nQuestion: Lee Webb has been in broadcasting his entire working career. True, False, or Neither? Neither\n###\nThe Sauber C33 is a Formula One racing car designed by Sauber to compete in the 2014 Formula One season. It was driven by Esteban Guti\u00e9rrez and Adrian Sutil, who joined the team after Nico H\u00fclkenberg returned to Force India. The C33 was designed to use Ferrari's new 1.6-litre V6 turbocharged engine, the 059/3.\nQuestion: The Sauber C33 was designed to use Ferrari's new 1.6-litre V6 True, False, or Neither? True\n###\nThe Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s.\nQuestion: The album released 1994 True, False, or Neither? True\n###\nTanya McQueen is an American reality television personality and interior designer on TV's . She made her debut on \"Extreme Makeover\" in an October 2005 episode titled, \"The Teas Family\". On August 2, 2011, McQueen and fellow Extreme Makeover personality Tracy Hutson debuted the show \"Picker Sisters\" on Lifetime.\nQuestion: Tanya McQueen made her debut in Extreme Takeover. 
True, False, or Neither?", "doc_id": 405, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10418, 33739, 28189, 16583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel.\nQuestion: Kasey Peters retired from American football before he turned 35 years old. True, False, or Neither? True\n###\nRon & Carol Cope Stadium at Foster Field, is a football stadium located in Kearney, Nebraska, on the University of Nebraska\u2013Kearney campus. In 2005, the university named the stadium after Ron & Carol Cope, who were long-time supporters of the University of Nebraska System. The field is named after Charlie Foster, a former coach and athletic director at Nebraska\u2013Kearney.\nQuestion: Ron and Carol Cope don't like football. True, False, or Neither? False\n###\nEdward Gibbon FRS ( ; 8 May 173716 January 1794) was an English historian, writer and Member of Parliament. His most important work, \"The History of the Decline and Fall of the Roman Empire\", was published in six volumes between 1776 and 1788 and is known for the quality and irony of its prose, its use of primary sources, and its open criticism of organised religion.\nQuestion: Edward Gibbon FRS had work that was more important than \"The History of the Decline and Fall of the Roman Empire.\" True, False, or Neither? False\n###\nLeft Hand Spring was a well-known watering stop on the old Chisholm Trail in present-day Blaine County, Oklahoma. The spring was named for \"Left Hand\", an Arapaho chief. Jesse Chisholm died there in 1868 and is buried nearby. His grave is marked with a granite historical marker.\nQuestion: Chisholm was a man. True, False, or Neither? True\n###\nRichard Church Thompson (October 8, 1957 \u2013 July 27, 2016) was an American illustrator and cartoonist best known for his syndicated comic strip \"Cul de Sac\" and the illustrated poem \"Make the Pie Higher\". He was given the Reuben Award for Outstanding Cartoonist of the Year for 2010.\nQuestion: Richard Church Thompson starts with an A. True, False, or Neither?", "doc_id": 147, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30340, 30629, 4448, 56], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jeffrey B. Miller, AA, BS, MPA, was the former commissioner of the Pennsylvania State Police. 
Miller, a native of Harrisburg, Pennsylvania, served in that position from March 24, 2003, after being confirmed by the Pennsylvania State Senate, until August 8, 2008.\nQuestion: Jeffrey B. Miller tried to obtain a PhD True, False, or Neither? Neither\n###\nHannah Kate Whelan (born 1 July 1992) is a retired British artistic gymnast who competed at the 2008 Summer Olympics and the 2012 Summer Olympics. Whelan won three European Championships medals and four British senior national titles, and was the bronze medallist in the all-around at the 2014 Commonwealth Games.\nQuestion: Hannah won four British senior national titles. True, False, or Neither? True\n###\nThe Peoria Rivermen was a professional ice hockey team in the American Hockey League. They played in Peoria, Illinois, USA at the Carver Arena. On June 14, 2013, it was announced that the team would relocate to Utica, New York after the 2012\u201313 AHL season, and be known as the Utica Comets.\nQuestion: The Peoria Rivermen had a total of 23 hockey players on it. True, False, or Neither? Neither\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898.\nQuestion: Grimsby Pelham Football Club is a professional football club in Cleethorpes, Lincolnshire, England. True, False, or Neither? False\n###\nVarun Sharma is an Indian actor who made his debut in Farhan Akhtar's 2013 film production \"Fukrey\", which was a surprise hit in Bollywood. Since his appearance in \"Fukrey\", he has appeared in other comedy films, such as \"Kis Kisko Pyaar Karoon\" and \"Dilwale\" etc\nQuestion: Indian people flocked to see the movie Fukrey when it came out at the cinema True, False, or Neither?", "doc_id": 137, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31009, 39, 532, 36864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement.\nQuestion: Princess Caroline of Gloucester had no siblings True, False, or Neither? Neither\n###\nThe Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\".\nQuestion: The Last of Us was released in the 21st century. True, False, or Neither? 
True\n###\nHarry Spencer Davis (born 24 September 1991) is an English professional footballer, who plays as a defender for Scottish Championship side St Mirren. Davis previously played with Crewe Alexandra. Early in his career, he was loaned by Crewe to Nantwich Town, Stafford Rangers and Curzon Ashton.\nQuestion: harry davis was not an athletic person True, False, or Neither? False\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall has an upper level True, False, or Neither? Neither\n###\nThe Great American Conference (GAC) is a collegiate athletic conference of eleven schools, with headquarters located in Russellville, Arkansas. It is affiliated in the National Collegiate Athletic Association (NCAA)'s Division II level. Athletic competition began play during the 2011\u201312 school year. Member schools are located in Arkansas and Oklahoma in the South Central United States.\nQuestion: The Great American Conference is a conference containing over 11 schools True, False, or Neither?", "doc_id": 936, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32774, 16763, 30652, 27808], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Adriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Adriano Correia Claro is the most popular player True, False, or Neither? Neither\n###\n\"Outro\" is a song by French electronic music artist M83, released as the final track on the group's sixth studio album, \"Hurry Up, We're Dreaming\" (2011). It is a dramatic, symphonic rock song which has evoked \"heartbreak, nostalgia, anticipation, jubilation and triumph\".\nQuestion: Outro was sung by Clinton. True, False, or Neither? Neither\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Sheree Victoria Murphy played Tricia Dingle, Eva Strong, and Dakota Davies. True, False, or Neither? True\n###\nLakeshore Technical College (or LTC) is a technical and community college located in Cleveland, Wisconsin, which is centrally located between the lakeshore cities of Sheboygan and Manitowoc. It is a member of the 16 schools in the Wisconsin Technical College System. The college was originally named Lakeshore Technical Institute (LTI).\nQuestion: Even though Sheboygan and Manitowoc are in Wisconsin, Madison is not. True, False, or Neither? 
Neither\n###\nThe Korea Aerospace Industries KF-X/Indonesian Aerospace IF-X is a South Korean and Indonesian program to develop an advanced multirole fighter for the Republic of Korea Air Force (ROKAF) and Indonesian Air Force (TNI-AU), spearheaded by South Korea with Indonesia as the primary partner. It is South Korea's second fighter development program following the FA-50.\nQuestion: Prior to the KF-X/IF-X program, South Korea had not taken initiative to develop fighters True, False, or Neither?", "doc_id": 155, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40459, 24874, 1942, 14543], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Yi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910.\nQuestion: Yi Bangja and Crown Prince Euimin were born in 1901. True, False, or Neither? True\n###\nThe 2017 Congolese police decapitation attacks occurred on 23 March 2017 in the DR Congo. About 40 police officers were ambushed then decapitated. Six police officers were released. All of the surviving police spoke the Tshiluba language. The Kamwina Nsapu terrorist group attacked the police convoy.\nQuestion: The Kamwina Nsapu terrorist group used guns to attack the police. True, False, or Neither? Neither\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee\nQuestion: He was 52 when he released the novel True, False, or Neither? Neither\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: Santa Lucia showcases stunning architecture. True, False, or Neither? True\n###\nVivekananda Degree College is the only Degree college in Ichoda Mandal which is established in 2006 and is affiliated to Kakatiya University of Telangana, India. The college has its campus at Ichoda, Adilabad. The college runs degree courses in Computer Science, Arts, Science, Commerce and Management.\nQuestion: Vivekananda Degree College was established more than 10,000 seconds ago. True, False, or Neither?", "doc_id": 847, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36614, 13012, 42126, 45371], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic.\nQuestion: Lausche is located in the French border state of Saxony. True, False, or Neither? False\n###\nJaron Long (born August 28, 1991) is an American professional baseball pitcher who is with the Washington Nationals organization. Prior to playing professionally, Long played college baseball for Chandler-Gilbert Community College and Ohio State University. His father, Kevin Long, is the current hitting coach of the New York Mets and former hitting coach of the New York Yankees.\nQuestion: Jaron Long will end up the the Hall of Fame for his accomplishments. True, False, or Neither? Neither\n###\nInteractive Investor (II) is an online trading and investment platform based in London. The group offers retail investors an investment service to manage and trade shares, funds and bonds via trading accounts, ISAs and SIPPs. Its website provides content which is intended to support investors in making the difficult and complex decisions associated with online trading and investment.\nQuestion: Interactive Investor is an UK company. True, False, or Neither? True\n###\nDrifters is a British sitcom that stars Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke as three female friends who live in Leeds following their graduation from university. All three actresses had previously appeared together in \"The Inbetweeners Movie\". Four series were broadcast, between 2013 and 2016.\nQuestion: The fifth series was released in 2016. True, False, or Neither? False\n###\nIn cooking, coddled eggs are gently or lightly cooked eggs. They can be partially cooked, mostly cooked, or hardly cooked at all (as in the eggs used to make Caesar salad dressing, which are only slightly poached for a thicker end-product). Poached eggs are eggs that, arguably, are coddled in a very specific way: they are very gently cooked, in simmering water.\nQuestion: Eggs taste very bad True, False, or Neither?", "doc_id": 384, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35968, 29149, 39133, 32687], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. 
The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione.\nQuestion: You & Me is the flirts best song of the 1980s True, False, or Neither? Neither\n###\nFloridana Beach is an unincorporated community in Brevard County, Florida, United States. It is located on a barrier island southeast of the city of Melbourne and east of the town of Grant-Valkaria. It is just south of the unincorporated community of Melbourne Shores, and north of the unincorporated community of Sunnyland Beach.\nQuestion: Sunnyland Beach and Melbourne Shores are also located on Brevard County. True, False, or Neither? Neither\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles.\nQuestion: Sadhu Kokila chose to direct Suntaragaali because romantic action films are his favorite thing to work on. True, False, or Neither? Neither\n###\nUSS \"Chicago\" (CA-136) was a \"Baltimore\"-class heavy cruiser laid down on 28 July 1943 at Philadelphia, Pennsylvania, US, by the Philadelphia Navy Yard. Launched on 20 August 1944, she was sponsored by Mrs. Edward J. Kelly, wife of the Mayor of Chicago, Illinois, and commissioned at the Philadelphia Navy Yard on 10 January 1945, Captain Richard R. Hartung, USN, in command.\nQuestion: The USS Chicago was sponsored by the wife of the Mayor of Chicago. True, False, or Neither? True\n###\nWilliam Elden Bolcom (born May 26, 1938) is an American composer and pianist. He has received the Pulitzer Prize, the National Medal of Arts, a Grammy Award, the Detroit Music Award and was named 2007 Composer of the Year by Musical America. Bolcom taught composition at the University of Michigan from 1973\u20132008. He is married to mezzo-soprano Joan Morris.\nQuestion: Bolcom started teaching composition at the University of Michigan in the summer of the year preceding 1794 True, False, or Neither?", "doc_id": 791, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7733, 39168, 7756, 14182], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Serial Killers Ink is a website dedicated to selling \"murderabilia\" (collectibles related to murders, murderers or other violent crimes) and serial killer art, interviewing convicted serial killers and also serves as a meeting place for those interested or involved in the murderabilia industry.\nQuestion: Serial Killers Ink sells murder memorabilia. True, False, or Neither? True\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. 
He played for Boca Juniors and Tiro Federal in Argentina.\nQuestion: Ra\u00fal Alberto Osella played FIFA U-21 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. True, False, or Neither? False\n###\nOpal Koboi is a fictional character from the Artemis Fowl series by Eoin Colfer. After the character's introduction in in the series as a supporting antagonist, Colfer again used Koboi as the main antagonist of the fourth, sixth, and eighth books in the series, giving her the status of archenemy to Artemis Fowl II.\nQuestion: Koboi and Fowl are friends in the 8th book. True, False, or Neither? False\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region.\nQuestion: The station has a lot of phone-in shows. True, False, or Neither? Neither\n###\nA cardigan is a type of knitted garment that has an open front. Commonly cardigans have buttons: a garment that is tied is instead considered a robe. A more modern version of the garment has no buttons and hangs open by design. By contrast, a pullover does not open in front but must be \"pulled over\" the head to be worn. It may be machine- or hand-knitted.\nQuestion: Cardigans has buttons however it can have buttons as well no buttons. True, False, or Neither?", "doc_id": 952, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14331, 4393, 30563, 40957], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Europrop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas.\nQuestion: EPI has many products. True, False, or Neither? False\n###\nCairn Energy PLC is one of Europe's leading independent oil and gas exploration and development companies and is listed on the London Stock Exchange. Cairn has discovered and developed oil and gas reserves in a variety of locations around the world. Cairn Energy has a primary listing on the London Stock Exchange and is a constituent of the FTSE 250 Index.\nQuestion: Cairn is shrinking its operations True, False, or Neither? Neither\n###\nFor Screening Purposes Only is the debut album by UK dance-punk trio Test Icicles. After being released in 2005, the album was critically praised for being unique and compelling in an increasingly homogenous indie music scene. Following the group's split in February 2006, the album remains Test Icicles' only LP.\nQuestion: The album was praised as unique True, False, or Neither? True\n###\nHyde, Jekyll, Me () is a 2015 South Korean television series starring Hyun Bin and Han Ji-min. It is based on Lee Choong-ho's webtoon \"Dr. Jekyll Is Mr. 
Hyde\" (), which gave a romantic comedy spin on the literary character. The series aired on SBS from January 21 to March 26, 2015 on Wednesdays and Thursdays at 21:55 for 20 episodes.\nQuestion: Dr. Jekyll Is Mr. Hyde is a popular south korean movie True, False, or Neither? Neither\n###\nFrench opera is one of Europe's most important operatic traditions, containing works by composers of the stature of Rameau, Berlioz, Bizet, Debussy, Poulenc and Messiaen. Many foreign-born composers have played a part in the French tradition as well, including Lully, Gluck, Salieri, Cherubini, Rossini, Meyerbeer, Offenbach and Verdi.\nQuestion: Rameau was from Spain True, False, or Neither?", "doc_id": 99, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7179, 4916, 10013, 40928], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ripponlea is an inner suburb of Melbourne, Victoria, Australia, named after the adjoining Rippon Lea Estate. It is 7\u00a0km south east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, Ripponlea had a population of 1,478.\nQuestion: Melbourne is very windy. True, False, or Neither? Neither\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles.\nQuestion: The 1970 Swedish open was held in 1969 True, False, or Neither? False\n###\nPangani Falls Dam is a dam in Tanzania, which is part of the Pangani Hydro Systems. The dam is located at Koani in the Muheza District of the Tanga Region, about 8\u00a0km south of another power station at Hale. The Pangani falls power station has two turbines and has an installed capacity of 68 MW .\nQuestion: The Pangani Falls power station produces all of the electricity for the Tanga Region. True, False, or Neither? Neither\n###\nUna questione privata is a 1993 Italian film directed by Alberto Negrin with a screenplay based on the WWII partisan novel of the same name by Beppe Fenoglio (1963) adapted by Raffaele La Capria. The film stars the young British actor Rupert Graves as Milton, C\u00e9line Beauvallet, and Claudio Mazzenga.\nQuestion: Una questione privata was a 1993 film based on a novel about WWII. True, False, or Neither? True\n###\nEdmund Quincy III ( ; 1681\u20131737) was an American merchant and judge. He was the son of Col. Edmund Quincy II (1627-1698) II and his second wife, Elizabeth Gookin. He married Dorothy Flynt and had 7 children. Four lived to adulthood, including Edmund Quincy IV and Dorothy Quincy, who was the topic of a famous poem by Oliver Wendell Holmes, Sr.\nQuestion: Edmund Quincy III married his second wife Dorothy Flynt. 
True, False, or Neither?", "doc_id": 525, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15994, 14343, 12645, 16848], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph Ernst Friedrich von Forcade de Biaix didn't live to see his 70th birthday. True, False, or Neither? True\n###\nHarry Brand (October 20, 1895 \u2013 February 22, 1989) was an American press agent. Described as \"the mastermind who made Shirley Temple the most famous child star in history, Betty Grable a GI Joe pinup girl and Marilyn Monroe a sex goddess,\" Brand was the head of publicity at 20th Century Fox from 1935 until 1962.\nQuestion: Shirley Temple knew Harry Brand True, False, or Neither? True\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\".\nQuestion: Antonio Lewis was the highest paying member of Flatbush ZOMBIES True, False, or Neither? Neither\n###\nThe Zurich derby is a football match between rivals FC Z\u00fcrich and Grasshopper Club Z\u00fcrich. The two teams were founded in Z\u00fcrich, Switzerland. Grasshopper in 1886 and Z\u00fcrich in 1896. Grasshoppers are known as the club of the elite and FCZ are known as the club of the workers. The derby is unique in Switzerland as it is the only rivalry between two teams from the same city.\nQuestion: These two teams do not like each other. True, False, or Neither? Neither\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards.\nQuestion: Jake Deckard was a part of at least one R rated movie True, False, or Neither?", "doc_id": 289, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35049, 4708, 27403, 7939], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Letter Black, formerly known as Breaking the Silence, is a Christian rock band that was formed in 2006 in Uniontown, Pennsylvania. 
The band consists of lead vocalist Sarah Anthony; her husband, lead guitarist and vocalist Mark Anthony; and drummer Justin Brown.\nQuestion: The Letter Black have popular music videos True, False, or Neither? Neither\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts.\nQuestion: Phacelia mutabilis is pink. True, False, or Neither? Neither\n###\nKew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge.\nQuestion: Kew Bridge station is named for the borough it is located in. True, False, or Neither? False\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75.\nQuestion: Interstate 29 has cars with Canadian license plates on it. True, False, or Neither? Neither\n###\nFranco Mari (Born 23 January 1947) is an Italian actor and comedian. Better known as Rupert Sciamenna, his best known character, he is famous for his participation in television programs such as Mai dire... on Italia 1 in many sketches with Marcello Macchia.\nQuestion: Rupert Sciamenna is a fictional character. True, False, or Neither?", "doc_id": 44, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4173, 20284, 13382, 19204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Animation Domination was an animated programming block which originally aired from May 1, 2005, until September 21, 2014, on the Fox network. The block aired on Sunday evenings through the entirety of that night's primetime schedule (unless preempted, usually by sports telecasts).\nQuestion: Animation Domination was geared to teenagers during the 2006 season. True, False, or Neither? Neither\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) .\nQuestion: Sabanc\u0131 University is located more than 10 km away from Istandbul's city center. True, False, or Neither? True\n###\nRobert Mills Delaney, sometimes incorrectly spelled Delany (1903-1956) was an American composer. He studied with Nadia Boulanger and Arthur Honegger in Paris, and was best known for his 1928 choral symphony, John Brown's Song, based on Stephen Benet's Pulitzer Prize winning poem \"John Brown's Body\".\nQuestion: Robert Delaney was at least 52 years of age at the time of his death. 
True, False, or Neither? True\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign.\nQuestion: 23rd Infrantry was in both World Wars. True, False, or Neither? True\n###\n\"Up All Night\" is an American television sitcom created by Emily Spivey that airs on NBC. The series stars Christina Applegate and Will Arnett as Regan and Chris Brinkley, a couple who struggle to balance their home lives (especially with their newborn child, Amy) and their work lives.\nQuestion: Christina Applegate and Will Arnett have starred together previously. True, False, or Neither?", "doc_id": 789, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5039, 23406, 14293, 7912], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Peter Joseph Wysocki (October 3, 1948 \u2013 June 14, 2003) was an American football linebacker who played his entire six-year career with the Washington Redskins from 1975 to 1980 in the National Football League (NFL). Wysocki previously played four seasons in the Canadian Football League (CFL) for the Hamilton Tiger-Cats, Toronto Argonauts and Saskatchewan Roughriders.\nQuestion: Peter Joseph Wysocki ended up dying from CTE due to injuries sustained during his playing career. True, False, or Neither? Neither\n###\nGreat Balls of Fire! is a 1989 American biographical film directed by Jim McBride and starring Dennis Quaid as rockabilly pioneer Jerry Lee Lewis. Based on a biography by Myra Lewis and Murray M. Silver Jr., the screenplay is written by McBride and Jack Baran. The film is produced by Adam Fields, with executive producers credited as Michael Grais, Mark Victor, and Art Levinson.\nQuestion: Great Balls of Fire took a few years to write. True, False, or Neither? Neither\n###\nGlobacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide.\nQuestion: GLO started operating later than 1996. True, False, or Neither? True\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee.\nQuestion: Destiny can be written in Arabic. True, False, or Neither? 
True\n###\nAleksandr Danilovich Aleksandrov (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0414\u0430\u043d\u0438\u0301\u043b\u043e\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 , alternative transliterations: \"Alexandr\" or \"Alexander\" (first name), and \"Alexandrov\" (last name)) (August 4, 1912 \u2013 July 27, 1999), was a Soviet/Russian mathematician, physicist, philosopher and mountaineer.\nQuestion: Aleksandr was an intelligent person, and also adventurous. True, False, or Neither?", "doc_id": 750, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23706, 18945, 38146, 13493], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: The final of the 1983 Prudential Cup was a popular event. True, False, or Neither? Neither\n###\nMargaret Mary \"Maggie\" Nichols (born September 12, 1997, in Little Canada, Minnesota) is an American collegiate artistic gymnast for the University of Oklahoma. She is one of only nine NCAA gymnasts to have scored a perfect 10 on all four events, and the first to do so for Oklahoma.\nQuestion: Nichols is also an avid bowler. True, False, or Neither? Neither\n###\nIntermountain Healthcare is a not-for-profit healthcare system and is the largest healthcare provider in the Intermountain West. Intermountain Healthcare provides hospital and other medical services in Utah and Idaho and also offers integrated managed care under the insurance brand SelectHealth. Intermountain Healthcare is headquartered in Salt Lake City, Utah, and has some 37,000 employees.\nQuestion: Intermountain Healthcare is a not-for-profit healthcare system and is the largest healthcare provider in the Intermountain West. Intermountain Healthcare provides hospital and other medical services in Utah and Idaho and also offers integrated managed care under the insurance brand SelectHealth. Intermountain Healthcare is headquartered in Salt Lake City, Utah, and has 5 employees. True, False, or Neither? False\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6.\nQuestion: Nydala Abbey belongs to church of England. True, False, or Neither? False\n###\nCruel Intentions: The Musical is a jukebox musical adapted from the film \"Cruel Intentions\" by Jordan Ross and Lindsey Rosin with music direction and arrangements by Zach Spound. 
After two sold-out engagements in Los Angeles, the show made its New York City debut at Le Poisson Rouge in 2017.\nQuestion: Jordan Ross wrote the music for Cruel Intentions. True, False, or Neither?", "doc_id": 169, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14357, 8223, 25392, 21436], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kate Kendall (born 27 July 1973) is an Australian actress best known for her role in the long-running Nine Network Australian drama \"Stingers\". She joined the cast of long-running television soap opera \"Neighbours\" in 2013 as the established character Lauren Turner.\nQuestion: Kate Kendall has only acted on television. True, False, or Neither? Neither\n###\nChristmas Bounty is a 2013 television film directed by Gil Junger. It was produced by WWE Studios and stars Francia Raisa, Mike \"The Miz\" Mizanin and Will Greenberg. It premiered on ABC Family during their 25 Days of Christmas block on November 26, 2013.\nQuestion: Christmas Bounty is one of the most famous WWE wrestling movies ever made. True, False, or Neither? Neither\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Broadway Rose was released in the 1920's True, False, or Neither? True\n###\nNelson is an American rock band founded by singer/songwriters Matthew and Gunnar Nelson (twin sons of Ricky Nelson and Kristin Nelson). The band achieved success during the early 1990s with their double platinum debut album \"After the Rain\", which featured the number-one hit \"(Can't Live Without Your) Love and Affection\".\nQuestion: Nelson sold albums. True, False, or Neither? True\n###\nFrankenstein Castle (German: \"Burg Frankenstein\" ) is a hilltop castle in the Odenwald overlooking the city of Darmstadt in Germany. It is thought that this castle may have been an inspiration for Mary Shelley when she wrote her 1818 Gothic novel \"Frankenstein\".\nQuestion: Frankenstein Castle was scene by Clinton. True, False, or Neither?", "doc_id": 357, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14168, 9250, 34345, 27966], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. 
His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o emigrated to Japan when he was 19 True, False, or Neither? False\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75.\nQuestion: Interstate 29 is under construction at this time. True, False, or Neither? Neither\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina.\nQuestion: He used to play soccer with his childhood friends True, False, or Neither? Neither\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949.\nQuestion: CUHK is a co-ed institution. True, False, or Neither? Neither\n###\nAloe ( or ), also written \"Alo\u00eb\", is a genus containing over 500 species of flowering succulent plants. The most widely known species is \"Aloe vera\", or \"true aloe\", so called because it is cultivated as the standard source of so-called \"aloe vera\" for assorted pharmaceutical purposes. Other species, such as \"Aloe ferox\", also are cultivated or harvested from the wild for similar applications.\nQuestion: Aloe is used mostly for the lips True, False, or Neither?", "doc_id": 16, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18749, 13778, 18665, 6838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USNS \"Lone Jack\" (T-AO-161) was a type Type T2-SE-A1 tanker laid down under Maritime Commission contract (USMC number 1783) by the Sun Shipbuilding & Dry Dock Co. of Chester, Pennsylvania (hull number 450) on 11 July 1944. The ship was launched on 21 October 1944, sponsored by Mrs. Julia W. Bruwier, and delivered to Cities Service Oil Co. of New York City on 31 October 1944.\nQuestion: USNS \"Lone Jack was made in PA True, False, or Neither? True\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. 
The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home.\nQuestion: Coriolano: eroe senza patria was a drama film with some romantic elements True, False, or Neither? Neither\n###\nThe third season of \"Gossip Girl,\" an American teen drama based upon the book series by Cecily von Ziegesar. Developed for television by Josh Schwartz and Stephanie Savage. Airing on The CW from September 14, 2009 to May 17, 2010 with 22 episodes. The season premiered 2.55 million viewers and a 1.4 Adults 18-49 rating, up 14% in viewers from its season two finale.\nQuestion: Gossip Girl, is still found on tv True, False, or Neither? Neither\n###\nThe Probert-Price Collection is a collection of items from the Probert-Price estate, primarily hundreds of vintage dresses which belonged to Renee Probert-Price, original It girl and well-known London socialite of her time (1917-2013). Renee died in 2013 aged 96, and left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great niece and goddaughter.\nQuestion: Renee Probert-Price left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great grand daughter and her husband True, False, or Neither? False\n###\nUdinese Calcio sensationally finished third in Serie A, much due to Oliver Bierhoff being in the form of his life, scoring 27 goals in a league season consisting of just 34 matches. Bierhoff, coach Alberto Zaccheroni and winger Thomas Helveg all left for Milan at the end of the season, ensuring Udinese had lots of work to do to stay at the level it was.\nQuestion: Biefhoff has won a championship. True, False, or Neither?", "doc_id": 904, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24126, 41788, 40987, 13836], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Recently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct.\nQuestion: Several hundred mammals have gone extinct since 1500 True, False, or Neither? False\n###\nBright Lights, Big City is a rock musical with music, lyrics and book written by Scottish composer Paul Scott Goodman based on the 1984 novel by Jay McInerney. It follows a week in the life of Jamie, a successful young writer who loses himself in the chaos of 1980s New York City. The piece premiered Off-Broadway in New York City in 1999 and was revived in a small London production in 2010.\nQuestion: Goodman wrote Bright Lights, Big City based on his own experiences. True, False, or Neither? False\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. 
Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles.\nQuestion: Poor Pretty Eddie is a 1975 American film, the cast included Leslie Uggams and Shelly Winters as side characters, with Michael Christian with the starring roll. True, False, or Neither? Neither\n###\nAn election campaign was held ahead of a general election for the 54th Parliament of New South Wales on Saturday, 24 March 2007. The result\u2014a win for the social-democratic Australian Labor Party and its new leader Morris Iemma\u2014was widely perceived as a foregone conclusion, with opposition leader Peter Debnam conceding as much the week before the poll.\nQuestion: Australian Labor Party is a party in Japan. True, False, or Neither? False\n###\nMosiula Mea'alofa \"Lofa\" Tatupu (born November 15, 1982) is a former American football linebacker who played six seasons in the National Football League (NFL). He was an assistant linebackers coach with the Seattle Seahawks. He played college football for the University of Southern California (USC). Tatupu was drafted by the Seattle Seahawks in the second round of the 2005 NFL Draft.\nQuestion: Tatupu retired before his 28th birthday. True, False, or Neither?", "doc_id": 853, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13683, 18761, 16937, 27052], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Patti Clare (born 3 March 1976) is an English actress, known for playing the character of Mary Taylor in the ITV soap opera \"Coronation Street\" since 2008. She is a three-time winner of the British Soap Award for Best Comedy Performance (2011, 2013, 2016).\nQuestion: Patti Clare will receive a role in an upcoming Disney film True, False, or Neither? Neither\n###\nZale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990).\nQuestion: Zale Dalen is a Canadian. He is proud of his film the hounds of Notre Dame True, False, or Neither? Neither\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race.\nQuestion: Tomas is a cyclist from brazil True, False, or Neither? False\n###\nFrank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ.\nQuestion: Frank is an African American author and blogger. True, False, or Neither? 
Neither\n###\nEucommia jeffersonensis is an extinct species of flowering plant in the family Eucommiaceae. It is known from a fossil fruit found in latest Eocene deposits of Oregon, United States. \"E.\u00a0jeffersonensis\" is one of five described fossil species from North America assigned to the modern genus \"Eucommia\". The other species are \"E.\u00a0constans\", \"E.\u00a0eocenica\", \"E.\u00a0montana\", and \"E.\u00a0rolandii\".\nQuestion: The letter \"E\" in E. montana stands for \"Eucommia\". True, False, or Neither?", "doc_id": 917, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10212, 17027, 24468, 26586], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "George Montgomery (born April 26, 1962) is a retired American basketball player. He played basketball at Illinois, and was a second-round draft selection of the Portland Trail Blazers in the 1985 NBA Draft, though he never played in the NBA. He is the biological father of Warriors center JaVale McGee, but did not raise his son.\nQuestion: George Montgomery died in Alabama. True, False, or Neither? Neither\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: \"I'm So Sorry\" is the 2nd promotional single from the album Smoke + Mirrors True, False, or Neither? True\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: I'm So Sorry is an epic ballad. True, False, or Neither? Neither\n###\nGirilal Jain (1924 \u2013 19 July 1993), was an Indian journalist. He served as the editor of The Times of India from 1978 till 1988. He was sympathetic to Hindu nationalism and authored books on the subject, the best known of which, \"The Hindu Phenomenon\", was published posthumously. The Government of India awarded him the civilian honour of the Padma Bhushan in 1989.\nQuestion: Girilal Jain was born in1924. True, False, or Neither? True\n###\nLakeshore Technical College (or LTC) is a technical and community college located in Cleveland, Wisconsin, which is centrally located between the lakeshore cities of Sheboygan and Manitowoc. It is a member of the 16 schools in the Wisconsin Technical College System. 
The college was originally named Lakeshore Technical Institute (LTI).\nQuestion: It is the only technical college in wisconsin True, False, or Neither?", "doc_id": 895, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27771, 2144, 39845, 28012], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Kingfisher Ultra Indian Derby, or simply the Indian Derby, is an Indian annual Thoroughbred horse race. It is a 2,400-metre race held on the first Sunday of February on the Mahalaxmi Racecourse in Mumbai and is one of the premier sporting activities in the city.\nQuestion: Kingfisher Ultra Indian Derby is named after Kingfisher beer. True, False, or Neither? Neither\n###\nWake Up, Ron Burgundy: The Lost Movie (also known as Anchorman: The Adventure Continues) is the 2004 counterpart film to the film \"\", which was also released in the same year. Directed by Adam McKay and written by McKay and Will Ferrell, it stars Ferrell, Christina Applegate, David Koechner, Steve Carell, and Paul Rudd.\nQuestion: Steve Carell wanted to play the main character True, False, or Neither? Neither\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\".\nQuestion: The students who write the journal are heavily involved in politics True, False, or Neither? Neither\n###\nThe Tsavo Man-Eaters were a pair of man-eating Tsavo lions responsible for the deaths of a number of construction workers on the Kenya-Uganda Railway from March through December 1898. The significance of this pair of lions was their unusual behavior, such as the number of men killed and the manner of the attacks.\nQuestion: The Tsavo Man-Eaters were a pair of man-eating Tsavo lions responsible for the deaths of a number of construction workers on the Kenya-Uganda Railway from March through December of the 98th year in the eighteenth century. True, False, or Neither? False\n###\nNo Said Date is the debut studio album by American rapper and Wu-Tang Clan member Masta Killa. The album was released on June 1, 2004, by Nature Sounds. The album features guest appearances from Raekwon, Ghostface Killah, Streetlife, Prodigal Sunn, Killah Priest, Method Man, Ol' Dirty Bastard, Allah Real, Inspectah Deck and GZA.\nQuestion: Wu-Tang Clan, as a rap group, released the album No Said Date in June 2004. True, False, or Neither?", "doc_id": 650, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18885, 29549, 35949, 23824], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House.\nQuestion: Kapil Batra Films Production House was initially reluctant to release the film because of its long title. True, False, or Neither? Neither\n###\nUpper Grosvenor Street is a historic street in Mayfair, London, United Kingdom. It runs from the Grosvenor House Hotel off Park Lane to the Embassy of the United States off Grosvenor Square. The Embassy of Monaco is located at number 7. Odey Asset Management, a hedge fund run by Crispin Odey, is located at number 12.\nQuestion: Upper Grosvenor Street has a z. True, False, or Neither? False\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Adriano Correia Claro likes baseball True, False, or Neither? Neither\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups.\nQuestion: Mohamed Izzadeen Mohamed Naufer was born in the morning hours of January 17th, 1981. True, False, or Neither? Neither\n###\nO lieb, so lang du lieben kannst is a poem written by Ferdinand Freiligrath, a 19th-century German writer. In 1847, Hungarian composer Franz Liszt set the poem to music (soprano voice and piano), and eventually adapted it into his famous Liebestr\u00e4ume No. 3. The work is one of Liszt's most famous and poignant. \"Liebestr\u00e4ume\" in German means \"Dreams of Love\".\nQuestion: Dreams of Love showcased more than one musical number True, False, or Neither?", "doc_id": 414, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36703, 41534, 14881, 4948], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Faer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". 
It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting.\nQuestion: Dungeons and dragons is very cool these days True, False, or Neither? Neither\n###\nThe 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau.\nQuestion: The 3rd Macau International Movie Festival ceremony has an E in it. True, False, or Neither? True\n###\nClear Hearts Grey Flowers is the second full-length and final album by Jack Off Jill. Produced by Chris Vrenna of Nine Inch Nails/Tweaker, it was released in July 2000 on the now-defunct label Risk Records. After \"Clear Hearts, Grey Flowers\" the band formally split up and moved on to establish other projects.\nQuestion: Jack Off Jill produced no albums after July 2000. True, False, or Neither? True\n###\nLex Talionis Fraternitas, Inc. Sodalitas Ducum Futurorum is an exclusive fraternal organization of Filipino jurists, legal practitioners and law students founded on September 29, 1969 at the San Beda College of Law. A chapter in the Ateneo de Davao University School of Law was established in 1974. In 1983, the Securities and Exchange Commission granted the incorporation of the fraternity.\nQuestion: Lex Talionis Fraternitas' existence dates back to the 1960s. True, False, or Neither? True\n###\nThe Hyundai Genesis Coup\u00e9 is a rear-wheel drive sports coupe from Hyundai Motor Company, released on October 13, 2008 for the Korean market. It is Hyundai's first rear-wheel drive sports coupe, and shares its basic platform with the Hyundai Genesis luxury sedan.\nQuestion: The Hyundai Genesis Coup\u00e9 was released to America. True, False, or Neither?", "doc_id": 164, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39757, 14323, 14358, 25349], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places.\nQuestion: Doing well in advertising will get your home in the U.S. National Register of Historic Places. True, False, or Neither? Neither\n###\nCoraz\u00f3n Valiente (\"Fearless Heart\"), originally known as \"Ca\u00eddas del Cielo\", is a Spanish-language telenovela produced by United States-based television network Telemundo Studios, Miami, featuring an ensemble cast. 
Adriana Fonseca, Ximena Duque, Jos\u00e9 Luis Res\u00e9ndez and Fabi\u00e1n R\u00edos starred as the main protagonists, with Aylin Mujica and Manuel Landeta starred as the main antagonists.\nQuestion: The Caidas del Cielo cast had some English born actors. True, False, or Neither? Neither\n###\nD.A.R.Y.L. is a 1985 American science fiction film written by David Ambrose, Allan Scott and Jeffrey Ellis. It was directed by Simon Wincer and stars Barret Oliver, Mary Beth Hurt, Michael McKean, Danny Corkill, and Josef Sommer. The original music score was composed by Marvin Hamlisch.\nQuestion: D.A.R.Y.L. is a 90's American science fiction film written by David Ambros, Allan Scott and Jeffrey Ellis. True, False, or Neither? False\n###\nDemoniac were a heavy metal band from New Zealand formed in Wellington in 1993 by singer and bass player Lindsay Dawson, guitarist Sam Totman and Drummer Steve Francis. They later moved to London, UK. Three of the members went on to form DragonForce. Their rather unusual musical style has often been labeled as \"blackened power metal\".\nQuestion: Demoniac were formed over 10 years ago True, False, or Neither? True\n###\nNate Albert (born 1970) is an American music executive, songwriter, producer and guitar player. He is currently the Executive Vice President of A&R at Capitol Records a division of Universal Music Group. He was formerly Senior Vice President of A&R at Republic Records, where he worked with such artists as The Weeknd, Florence & the Machine, Phantogram and the Lonely Island.\nQuestion: Nate Albert sings in Phantogram True, False, or Neither?", "doc_id": 915, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19207, 27968, 41112, 1052], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Metal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed.\nQuestion: The PlayStation is still used today. True, False, or Neither? Neither\n###\nHard Landing is a 2004 thriller novel by British author Stephen Leather. Published in 2004 by Hodder & Stoughton, it is the first book in the Dan \u2018Spider\u2019 Shepherd series. \"Hard Landing\" is an international bestseller and is available in ebook and paperback.\nQuestion: Hard Landing was originally written as a comedy. True, False, or Neither? Neither\n###\nClub Deportivo D\u00e9nia is a Spanish football team based in D\u00e9nia, in the autonomous community of Valencia. Founded in 1927 it plays in Divisiones Regionales de F\u00fatbol in the Valencian Community, holding home games at \"Estadio Diego Mena Cuesta\", with a capacity of 3.000 seats.\nQuestion: Club Deportivo Denia was founded in 1927 its stadium holds 3,000 balls. True, False, or Neither? 
Neither\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced.\nQuestion: Splice is a horror scifi movie that was filmed in Paris. True, False, or Neither? Neither\n###\nVinar\u00f2s Club de F\u00fatbol is a football team based in Vinar\u00f2s, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1965, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"El Cervol\", which has a capacity of 9,600 seats.\nQuestion: Vinar\u00f2s Club de F\u00fatbol was founded in 1965 and was speculated to not have been made in the Valencian Community, but another community. True, False, or Neither?", "doc_id": 769, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11215, 4951, 26430, 12892], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "My Famous Family is a British television programme on genealogy, co-hosted by Bill Oddie and Guy de la B\u00e9doy\u00e8re. Each episode shows an ordinary member of the public with a famous ancestor: Queen Victoria, Florence Nightingale, George Stephenson, Lawrence of Arabia, or the Duke of Wellington.\nQuestion: Guy de la B\u00e9doy\u00e8re hosts My Famous Family by himself. True, False, or Neither? False\n###\nAndr\u00e9 Olbrich (born 3 May 1967, in D\u00fcsseldorf, Germany) is a German guitarist, composer and backing vocalist, most known as the co-founder and lead guitarist of power metal band Blind Guardian, in which he serves as one of the main composers with other co-founder Hansi K\u00fcrsch.\nQuestion: Hansi K\u00fcrsch is a German guitarist, composer and backing vocalist, most known as the co-founder and lead guitarist of power metal band Blind Guardian True, False, or Neither? False\n###\nThe American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada.\nQuestion: The ACT is based in the northeastern United States, Quebec, and Canada. True, False, or Neither? True\n###\nCaddyshack is a 1980 American comedy film directed by Harold Ramis and written by Brian Doyle-Murray, Ramis and Douglas Kenney. It stars Michael O'Keefe, Chevy Chase, Rodney Dangerfield, Ted Knight, and Bill Murray. Doyle-Murray also has a supporting role. The film was later dedicated to producer Douglas Kenney, who died shortly after the film's release.\nQuestion: Caddyshak, a comedy film was released in 1980 right after the death of 1/3 writers. True, False, or Neither? 
True\n###\nJohn Davison Rockefeller III (March 21, 1906 \u2013 July 10, 1978) was a philanthropist and third-generation member of the prominent Rockefeller family. He was the eldest son of philanthropists John D. Rockefeller Jr. and Abby Aldrich Rockefeller. His siblings were Abby, Nelson, Laurance, Winthrop, and David.\nQuestion: John Rockefeller was seventy-two when he passed away. True, False, or Neither?", "doc_id": 562, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41381, 35027, 5972, 36672], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Oleg Smirnov (born April 8, 1980) is a Russian professional ice hockey right winger currently playing for HC Ryazan in the Russian Major League. He played in the Russian Superleague for Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk. He was drafted 144th overall in the 1998 NHL Entry Draft by the Edmonton Oilers.\nQuestion: oleg smirnov won the super league with Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk True, False, or Neither? Neither\n###\nThe 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns.\nQuestion: The 8.8 cm Flak was the gun of choice for female soldiers True, False, or Neither? Neither\n###\nFoaly is a fictional character in the Artemis Fowl series written by Eoin Colfer. He is the technical consultant to the Lower Elements Police (LEP). He is the most intelligent centaur on and under the Earth, considers himself to be an unappreciated genius, and is the inventor of most of the advanced technology possessed by the fairy world, rivaled only by Opal Koboi.\nQuestion: Eoin Colfer never wrote a book. True, False, or Neither? False\n###\nRelient K is the debut studio album by American rock band Relient K. Many of the tracks are newer versions of those found on their 1998 demo \"All Work & No Play\". Typical of early Relient K albums, the lyrics use pop culture references for teaching and to illustrate Biblical principles. As of late 2006/early 2007, this album has sold around 400,000 copies.\nQuestion: Relient K band is from America. True, False, or Neither? True\n###\nStanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele.\nQuestion: Stanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward but he is retired from football now. 
True, False, or Neither?", "doc_id": 185, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1796, 42279, 34736, 37412], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Anne Frank: The Diary of a Young Girl is an original radio play by author Meyer Levin (1905\u20131981). It was adapted from Levin\u2019s original stage dramatization of the same name, adapted from \"The Diary of a Young Girl\", Anne Frank's diary. It aired on CBS on September 18, 1952, the eve of Rosh Hashanah, to critical acclaim, and again in November 1952.\nQuestion: The radio play has the same name as Anne Frank's diary. True, False, or Neither? True\n###\nHow Murray Saved Christmas is a 2014 animated musical television special, directed by Peter Avanzino and written by Mike Reiss. The voice actors include Jerry Stiller, Sean Hayes, Kevin Michael Richardson, Jason Alexander, John Ratzenberger and Dennis Haysbert.\nQuestion: How Murry Saved Christmas was produced by Peter Avanzino. True, False, or Neither? Neither\n###\nAucuba chinensis is a shrub or small tree, native to southern China, Taiwan, Burma and northern Vietnam. Typically it grows to 6 meters tall, though it can be larger. The leaves are thick, dark green above and light green below, sometimes with teeth along the margins.\nQuestion: The teeth only grow on the lower leaves. True, False, or Neither? Neither\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: Idris was a no nonsense leader. True, False, or Neither? Neither\n###\nMaya & Marty was an American television variety show that premiered on NBC on May 31, 2016 and lasted one season. The series was co-hosted by comedians Maya Rudolph and Martin Short, and produced by Lorne Michaels. The show features various comedy sketches, musical performances, and celebrity guests.\nQuestion: Maya & Marty was a variety show that ended on May 31, 2016, after just one season. True, False, or Neither?", "doc_id": 417, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1649, 27390, 25627, 31626], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Whitechapel is a British television drama series produced by Carnival Films, in which detectives in London's Whitechapel district dealt with murders which replicated historical crimes. 
The first series was first broadcast in the UK on 2 February 2009 and depicted the search for a modern copycat killer replicating the murders of Jack the Ripper.\nQuestion: Whitechapel premiered on late 2009. True, False, or Neither? False\n###\nThe Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park.\nQuestion: Newnes railway line reopened last year. True, False, or Neither? False\n###\nThe Big Cube is a 1969 American thriller film directed by Tito Davison and starring Lana Turner, Karin Mossberg, George Chakiris, Daniel O'Herlihy and Richard Egan; it was one of Lana Turner's last movies. It is notable for its aggressive portrayal of LSD use and the 1960s youth counterculture as vicious evils.\nQuestion: Lana Turner did not appear in many movies after 1969 True, False, or Neither? True\n###\nKathleen Delaney is an American actress, voice actress, singer, and dancer who works on Broadway and on the properties of 4Kids Entertainment. She is best known as the voice of Hina in the 4Kids dub of One Piece, Mai Valentine in uncut versions of \"Yu-Gi-Oh!\" and Rouge in \"Sonic X\" and the succeeding games until 2010, when she was replaced by Karen Strassman.\nQuestion: Kathleen Delaney is an American actress who has often worked in children's films. True, False, or Neither? Neither\n###\nDave Dennis (born 20 January 1986 in Sydney) is a national representative rugby union footballer who plays professionally for the Exeter Chiefs He was educated at Richmond High School in Sydney, when he played in the Australian Schoolboys Rugby team in 2004. His primary position is blindside flanker. He can also play No.8.\nQuestion: Dave Dennis was born 19 days after New Years holiday True, False, or Neither?", "doc_id": 798, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25579, 27623, 3492, 34671], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1972 Grantland Rice Bowl was an NCAA College Division game following the 1972 season, between the Louisiana Tech Bulldogs and the Tennessee Tech Golden Eagles. Louisiana Tech quarterback Denny Duron was named outstanding offensive player, while his teammate linebacker Joe McNeely was named outstanding defensive player.\nQuestion: The 1972 Grantland Rice Bowl took place in the year of the rat in the chinese zodiac. True, False, or Neither? True\n###\nKimberly Ane Peirce (born September 8, 1967) is an American feature film director, best known for her debut feature film, \"Boys Don't Cry\" (1999). Her second feature, \"Stop-Loss\", was released by Paramount Pictures in 2008. Her most recent feature film, \"Carrie\", was released on October 18, 2013.\nQuestion: Kimberly Ane Peirce is 40 plus True, False, or Neither? 
True\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points.\nQuestion: The 1994 Nebraska Cornhuskers football team was coached by Tom Osborne True, False, or Neither? True\n###\nLast Place is the fifth studio album by the American indie rock band Grandaddy, released on March 3, 2017 on 30th Century Records. Self-produced and recorded by the band's frontman and primary recording artist Jason Lytle, the album is the first by Grandaddy since \"Just Like the Fambly Cat\" (2006) and the band's prior break-up.\nQuestion: Last Place was released in the 19th century. True, False, or Neither? False\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015.\nQuestion: The famous politician Okezie Ikpeazu is the current governor of Abia State, he is the first ever person to be elected as the governor of the large Abia State True, False, or Neither?", "doc_id": 509, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2139, 18639, 18832, 6884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rampage is an upcoming American action adventure monster film directed by Brad Peyton and written by Ryan Engle. It is based on the 1980s arcade video game of the same name. The film stars Dwayne Johnson, Naomie Harris, Malin \u00c5kerman, Joe Manganiello, Jake Lacy, Marley Shelton, and Jeffrey Dean Morgan. New Line Cinema will release the film on April 20, 2018 in 3D and IMAX.\nQuestion: The movie will bomb at the box office. True, False, or Neither? Neither\n###\nMemento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano.\nQuestion: Suzanna Todd was involved in Memento True, False, or Neither? True\n###\nPata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House.\nQuestion: The film Punjabi was released to critical and box office success in 2012. True, False, or Neither? 
Neither\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland.\nQuestion: Phacelia pedicellata is not native to the United States True, False, or Neither? False\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line.\nQuestion: Takoma is in the countryside of Maryland. True, False, or Neither?", "doc_id": 227, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44585, 31928, 22579, 21185], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1947 KLM Douglas DC-3 Copenhagen accident was the crash of a KLM Royal Dutch Airlines flight from Amsterdam to Stockholm via Copenhagen on 26 January. The accident occurred shortly after the Douglas DC-3 took off from Kastrup in Denmark. All 22 passengers and crew on board were killed in the accident.\nQuestion: The capacity of the Douglas DC-3 was more than 25. True, False, or Neither? Neither\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: Campbell, a character based on Reid himself, was a boy in 1865 in the book New Day. True, False, or Neither? Neither\n###\nBrushstrokes in Flight is a 1984 sculpture by Roy Lichtenstein, installed at the John Glenn Columbus International Airport in Columbus, Ohio. It is part of the \"Brushstrokes\" series of artworks that includes several paintings and sculptures whose subject is the actions made with a house-painter's brush.\nQuestion: The John Glenn Columbus International Airport in Columbus, Ohio has had other artwork installed besides the series Brushstrokes. True, False, or Neither? Neither\n###\nUSS \"Fletcher\" (DD/DDE-445), named for Admiral Frank Friday Fletcher, was the lead \"Fletcher\"-class destroyer , and served in the Pacific during World War II. She received fifteen battle stars for World War II service, and five for Korean War service.\nQuestion: USS \"Fletcher\" got 15 battle stars for its WWII service True, False, or Neither? True\n###\nChief Crazy Horse is a 1955 American CinemaScope Technicolor Western film directed by George Sherman starring Victor Mature, Suzan Ball and John Lund. The film is a fictionalized biography of the Lakota Sioux Chief Crazy Horse. 
It was also known as \"Valley of Fury\".\nQuestion: Chief Crazy Horse was not a real person True, False, or Neither?", "doc_id": 257, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19882, 21147, 19862, 34869], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sasco is a ghost town located in Pinal County, Arizona, west of Red Rock. Sasco, which is an acronym for the Southern Arizona Smelter Company, was a company town with a large smelter that served several mines. Once an impressive and little-known ghost town, today Sasco is a common sporting destination with shotgun shells, airsoft bb's, paintball splatter, and litter in the area.\nQuestion: There is a large Airsoft tournament held every year at the old Southern Arizona Smelter Company site. True, False, or Neither? Neither\n###\nGreat Balls of Fire! is a 1989 American biographical film directed by Jim McBride and starring Dennis Quaid as rockabilly pioneer Jerry Lee Lewis. Based on a biography by Myra Lewis and Murray M. Silver Jr., the screenplay is written by McBride and Jack Baran. The film is produced by Adam Fields, with executive producers credited as Michael Grais, Mark Victor, and Art Levinson.\nQuestion: Great Balls of Fire! came out in the 20th century. True, False, or Neither? True\n###\nThe Lawrence Brown House, better known as the L.B. Brown House is the home of Lawrence Bernard Brown a self-made businessman, community leader, and master carpenter. The importance of the L.B. Brown House is that it may be the only home built by a former enslaved person, left in Florida. The house \"stands as a living testimony to one person's triumph over adversity.\"\nQuestion: L.B. Brown House has a moat. True, False, or Neither? Neither\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex.\nQuestion: Muccan Station is a large railway station True, False, or Neither? False\n###\nBaya M. Harrison, Jr. (1912 in Tampa, Florida \u2013 1975) was a politician and an attorney in Florida. He served as Chairman of the Florida Board of Control from 1960\u20131964. Harrison greatly impacted the State University System of Florida and helped desegregate Florida colleges and universities. He served as President of the Florida Bar in 1957.\nQuestion: Baya M. Harrison, Jr. was born after 1913 True, False, or Neither?", "doc_id": 442, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18602, 30473, 36791, 38655], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Memento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano.\nQuestion: Memento was released in 2000 True, False, or Neither? True\n###\nJohn Ormsby Vandeleur (1765 \u2013 28 November 1828) was an Irish barrister, landowner and politician from Kilrush in County Clare. He sat in the House of Commons of Ireland from 1790 to 1800, and then in the House of Commons of the United Kingdom from 1801 to 1802.\nQuestion: John has a rough 60 something years True, False, or Neither? Neither\n###\n\"Don't Look Back\" is a song by British pop-rock band Fine Young Cannibals. It was released as the third single from the band's 1988 album \"The Raw & the Cooked\". The song reached the top 40 charts in the United Kingdom, United States, Canada, Australia, and New Zealand.\nQuestion: Fine Young Cannibals have released an album. True, False, or Neither? True\n###\nForever the Moment () is a 2008 South Korean film. It is a fictionalized account of the South Korea women's handball team which competed in the 2004 Summer Olympics. The Korean title translates as \"The Best Moment in Our Lives,\" and it is believed to be the first film that revolves around the sport of handball.\nQuestion: Forever the moment is a movie that my grandchildren will be able to see. True, False, or Neither? Neither\n###\nLatin Jam Workout is a Latin Dance Fitness Program created by professional athlete and choreographer JP Santana. Founded in 2007 in Los Angeles, California, Latin Jam Workout combines techno and Latin music with dance and aerobic movements. It is a fusion of Latin dance steps such as Salsa, Merengue, Raeggaeton, Cumbia, Samba, Soca, Belly-Dancing and the faster-paced rhythms of Pop and Techno.\nQuestion: Latin Jam Workout has existed for twenty-five years. True, False, or Neither?", "doc_id": 833, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [826, 21844, 36072, 19989], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Accelrys is a software company headquartered in the United States, with representation in Europe and Asia. It provides software for chemical, materials and bioscience research for the pharmaceutical, biotechnology, consumer packaged goods, aerospace, energy and chemical industries.\nQuestion: accelrys was profitable the past 7 years True, False, or Neither? Neither\n###\nShould the World Fail to Fall Apart is the first album by the British solo artist Peter Murphy, formerly of the gothic rock band Bauhaus. 
The album contains Murphy's covers of Magazine's \"The Light Pours Out of Me\" and Pere Ubu's \"Final Solution.\" It was released in 1986.\nQuestion: It was released in 1986 and sold very well. True, False, or Neither? Neither\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015.\nQuestion: Sebo is a professional skateboarder and artist. True, False, or Neither? True\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: There is more than 1 junction True, False, or Neither? Neither\n###\nLady Pamela Carmen Louise Hicks (\"n\u00e9e\" Mountbatten; born 19 April 1929) is a British aristocrat. She is the younger daughter of the 1st Earl Mountbatten of Burma by his wife, Edwina Mountbatten. Through her father, Lady Pamela is a first cousin of Prince Philip, Duke of Edinburgh and a great niece of the last Empress of Russia, Alexandra Feodorovna.\nQuestion: Hicks is the oldest child True, False, or Neither?", "doc_id": 55, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10161, 40916, 41485, 11149], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: Zuikaku took part in the Battle of Leyte Gulf. True, False, or Neither? True\n###\nFan and Mortar Geysers are two geysers in the Upper Geyser Basin in Yellowstone National Park. For the past several decades, they have erupted in concert with one another and are generally talked about together. The records detailing these geysers' known eruptive history shows that they have been infrequent and irregular performers.\nQuestion: Sanse plays home games at multiple stadiums. True, False, or Neither? Neither\n###\nMaria Ho (born March 6, 1983 in Taipei, Taiwan) is a Taiwanese American poker player, television personality and host. She is known as one of the top ranked female poker players in the world; a 3-time Bluff Reader's Choice Awards nominee for Favorite Female Poker Player and a World Series of Poker record-breaker, and for competing on the 15th season of \"The Amazing Race\".\nQuestion: Mario Ho became one of the top ranked female poker players in the world by becoming a World Series of Poker record-breaker. True, False, or Neither? 
True\n###\nAlexander Stewart Jolly (1887\u20131957) was a Sydney-based architect, published poet and children\u2019s author in the early 20th century. His buildings are primarily in Sydney's northern suburbs and the north coast of New South Wales. His architectural work was strongly influenced by Frank Lloyd Wright\u2019s School in Chicago, as well as the Arts and Crafts movement of the time.\nQuestion: Alexander Stewart Jolly enjoyed arts and crafts True, False, or Neither? Neither\n###\nPeter John \"P. J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\".\nQuestion: Peter John \"P. J.\" Carlesimo has a M. True, False, or Neither?", "doc_id": 133, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34829, 4922, 13010, 44882], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Eternally\" is a song with music by Charles Chaplin, and words by the English lyricists Geoff Parsons and John Turner. The music was initially composed for Charles Chaplin's film \"Limelight\" (1952) titled \"Terry's Theme\"; the film won an Oscar for \"Best Original Dramatic Score\" at the\nQuestion: \"Eternally\" was written by Charlies Chapin True, False, or Neither? False\n###\nJordan Klepper (born March 9, 1979) is an American comedian, writer, producer, political commentator, actor and television host. He is best known for being a correspondent on \"The Daily Show\" for 170 episodes between 2014-2017, and after his departure therefrom for hosting the satirical Comedy Central program \"The Opposition with Jordan Klepper\" beginning in the fall of 2017.\nQuestion: Jordan Klepper worked for The Daily Show. True, False, or Neither? True\n###\nJohn Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s.\nQuestion: John Henry Newman died in the Fall of 1890. True, False, or Neither? True\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer.\nQuestion: Loui Jover washes paintings True, False, or Neither? False\n###\nJames Proud is a British entrepreneur, and former CEO of Hello, a technology company that created the personal sleep tracker Hello Sense. Founded in 2012, Hello raised over $30 million in venture capital funding and $2.4 million from a Kickstarter campaign for Hello Sense before ultimately shutting down in June 2017. James Proud received an inaugural Thiel Fellowship in 2011.\nQuestion: Hello Sense was shut down by July 2017. 
True, False, or Neither?", "doc_id": 428, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29076, 6972, 43390, 26967], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Exergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid.\nQuestion: Exergonix Inc was once a part of another company. True, False, or Neither? True\n###\nMary Pierce (born 15 January 1975) is a French retired tennis professional who played on the Women's Tennis Association (WTA) tour. Born in Canada, she is a citizen of Canada, and the United States. Pierce played for France in team competitions and in the Olympics.\nQuestion: Born in United States, she is a citizen of Canada, and the United States True, False, or Neither? False\n###\nAn Act for naturalizing Louis Sekeyhaye, George Frederick Handel, and others (13 Geo. I), later given the short title of Handel's Naturalisation Act 1727, was a 1727 Act of the Parliament of Great Britain with the intent of naturalising and granting British citizenship to German-born composer George Frideric Handel and other foreign citizens.\nQuestion: Handel's Naturalisation Act 1727 included many other people in the title in the longer version True, False, or Neither? True\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night.\nQuestion: Sally Blane made more money than Wallace Ford. True, False, or Neither? Neither\n###\nIra Heiden (born September 22, 1966) is an American television and film actor, perhaps best known for his role in the 1987 horror film \"\" as Will Stanton. Ira's other film roles include the 1988 film \"Elvira, Mistress of the Dark\" and his most recent film is the 1996 film \"Timelock\".\nQuestion: Heiden was in Forrest Gump. True, False, or Neither?", "doc_id": 855, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4384, 38397, 32604, 45339], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "State Route 360 (SR 360) is a state highway in the southern portion of Mineral County, Nevada, United States. The route connects the former town of Basalt to the rest of Mineral County. 
A road has been in the place of SR 360 since 1919, and became State Route 10 by 1929.\nQuestion: State Route 360 can be found in Wisconsin. True, False, or Neither? False\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines is not motivated by profit. True, False, or Neither? True\n###\nThe Legend of Paradise Island is a Hawaiian Musical Fantasy in two acts, with book, music, and lyrics by Carmen Lombardo and John Jacob Loeb. The book was adapted by Francis Swann. The show was produced by Guy Lombardo at the Jones Beach Marine Theater. The show opened on June 22, 1961.\nQuestion: The Legend of Paradise Island featured singing. True, False, or Neither? True\n###\nClaus Biederstaedt (born 28 June 1928 in Stargard, today Poland) is a German actor and voice actor. He studied in Hamburg and began his career working with Joseph Offenbach. Among the actors for whom he has dubbed have been Yves Montand, Peter Falk, Marlon Brando, Vittorio Gassman, and James Garner.\nQuestion: Claus Biederstaedt is not necessarily always born in Stargard. True, False, or Neither? False\n###\nAlrifai is a Lebanese multinational nut retailing company headquartered in Beirut, Lebanon, and a wholly owned subsidiary of Alrifai International Holding Ltd. It is the largest nut retailing chain in the Middle East and the company with the biggest market share in Lebanon.\nQuestion: Alrifal holds the second biggest share of the nut market in Lebanon True, False, or Neither?", "doc_id": 863, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14060, 36521, 23572, 32985], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o was born in Germany True, False, or Neither? False\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines is based in Puerto Rico. True, False, or Neither? True\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: He was co director of the stanford program True, False, or Neither? 
True\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate.\nQuestion: The company will make a bar with no toffee. True, False, or Neither? Neither\n###\nIn the Ugric mythology, Kaltes-Ekwa (Khanty, Kaltes Ankw) was the mother of the hero Mir-Susne-Hum and the wife of the god Num-Torum, who defeated her in heaven. She was also a goddess of the moon associated with the month April; a birth giving goddess (she is called upon by women in child-birth); goddess of fate; goddess of dawn and a shape-shifter, often shown manifested as a hare.\nQuestion: Num-Torum is the god of June. True, False, or Neither?", "doc_id": 830, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11597, 38787, 3055, 24472], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael George Stroka (May 9, 1938 in Passaic, New Jersey \u2013 April 14, 1997) was an American actor on soap operas like ABC-TV's \"Dark Shadows\", in which he played Aristede, Bruno Hess, and Laszlo Ferrari from 1969 to 1970. In addition, he made a cameo appearance as a pallbearer in the MGM film, \"House of Dark Shadows\", the first of two feature films based on the ABC soap opera.\nQuestion: Michael George Stroka was born in the 30's. True, False, or Neither? True\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984 on time, and is 1412 m in length. True, False, or Neither? Neither\n###\nThe Monument to Vasil Levski (Bulgarian: \u041f\u0430\u043c\u0435\u0442\u043d\u0438\u043a \u043d\u0430 \u0412\u0430\u0441\u0438\u043b \u041b\u0435\u0432\u0441\u043a\u0438 , \"Pametnik na Vasil Levski\") in the centre of Sofia, the capital of Bulgaria, is one of the first monuments to be built in the then newly liberated Principality of Bulgaria. It commemorates the hanging of Bulgarian national hero and major revolutionary figure Vasil Levski on the same spot on 18 February 1873.\nQuestion: It commemorates the accomplishment of Vasil True, False, or Neither? False\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. 
The hoard is often known as the Bactrian gold.\nQuestion: Tillya tepe is worth millions of dollars. True, False, or Neither? Neither\n###\nMajor General Sen\u00e9n Casas Regueiro (30 July 1934 in Bomb\u00ed - 6 August 1996) was a Cuban politician. He was the Cuban Minister of Transportation from 1989 till his death. He was also the First Deputy Minister of Defence and the Chief of Staff of the Cuban Army. He was a brother of another Cuban politician Julio Casas Regueiro.\nQuestion: Julio Casas Regueiro had two brothers. True, False, or Neither?", "doc_id": 938, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30545, 35244, 28651, 14038], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Artur Edler von Mecenseffy (23 June 1865, Vienna \u2014 6 October 1917, Asiago) was an Austro-Hungarian Army officer who held the rank of \"Feldmarschall-leutnant\" (\"lieutenant field marshal\") and served during World War I, becoming the highest ranking officer of Austria-Hungary to be killed on the battlefield.\nQuestion: Artur did not die before 6 October 1917. True, False, or Neither? True\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: Lourdes Ver\u00f3nica Ar\u00e9valos Elias is a highly paid model True, False, or Neither? Neither\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality.\nQuestion: Marcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) married Richard B. Wall and was an American artist and voice actress. True, False, or Neither? Neither\n###\nThe Mission Viejo Vigilantes were a minor league baseball team located in Mission Viejo, California. The team played in the independent Western Baseball League, and was not affiliated with any Major League Baseball team. Their home stadium was Mission Viejo Stadium near Saddleback College.\nQuestion: There aren't any Major League Baseball teams based in California. True, False, or Neither? Neither\n###\nHaverstraw is a village incorporated in 1854 in the town of Haverstraw in Rockland County, New York, United States. It is located north of Congers, southeast of West Haverstraw, east of Garnerville, northeast of New City, and west of the Hudson River at its widest point. According to the 2013 U.S. Census estimate, the population was 12,102, an increase from the 2010 Census population of 11,910.\nQuestion: The population for Haverstraw had a higher census estimate in 2011 than the year before, 2010. 
True, False, or Neither?", "doc_id": 20, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12782, 4802, 45074, 10309], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stainer & Bell Limited is a British publisher of classical sheet music and books, based in London. Stainer, founded in 1907, publish the works of a number of significant twentieth-century composers, including Charles Villiers Stanford, Gustav Holst, Ralph Vaughan Williams, and Herbert Howells. They also publish a number of earlier composers, including Henry VIII, William Byrd, and Henry Purcell.\nQuestion: Stainer & Bell Limited was founded in the 20th century True, False, or Neither? True\n###\nLorca F\u00fatbol Club, S.A.D. is a Spanish football team based in Lorca, in the autonomous community of the Region of Murcia. Founded in 2003, it currently plays in Segunda Divisi\u00f3n, holding home games at Estadio Francisco Art\u00e9s Carrasco, which has a capacity of 8,120.\nQuestion: They do not participate in the Segunda Division. True, False, or Neither? False\n###\nThe Texas A&M Aggie baseball team represents Texas A&M University in NCAA Division I college baseball. The Aggies have competed in the Southeastern Conference since 2013. The Aggies play home games at Olsen Field at Blue Bell Park. The team is led by head coach Rob Childress.\nQuestion: The Aggies play all games at Olsen Field. True, False, or Neither? False\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh.\nQuestion: TSS was once owned by a government agency True, False, or Neither? True\n###\nWestbury Senior High School is a ninth-through-12th-grade school of the Westbury Union Free School District (USFD No. 1), the district covering the village of Westbury, New York, USA. Its current building, located on Post Road at Jericho Turnpike in Old Westbury, New York (just north of Westbury Village), reached its 50th anniversary in 2008.\nQuestion: Westbury Senior High School has a pre-k. True, False, or Neither?", "doc_id": 533, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41729, 29545, 27296, 7659], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mary Eliza Mahoney (May 7, 1845 \u2013 January 4, 1926) was the first African American to study and work as a professionally trained nurse in the United States, graduating in 1879. 
Mahoney was one of the first African Americans to graduate from a nursing school, and she prospered in a predominantly white society. She also challenged discrimination against African Americans in nursing.\nQuestion: Mahoney was an avid supporter of all prejudice. True, False, or Neither? False\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: He was very strict True, False, or Neither? Neither\n###\nInnyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169.\nQuestion: Innylay is a well known locality. True, False, or Neither? Neither\n###\nRodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets.\nQuestion: Georgia bears many players for the Redskins. True, False, or Neither? Neither\n###\nTodd Wash (born July 19, 1968) is an American football coach who is the defensive coordinator for the Jacksonville Jaguars of the National Football League (NFL). From 2013 to 2015 he was the defensive line coach and run game coordinator for the Jacksonville Jaguars.\nQuestion: If one subtracts the \"t\" from the beginning of Todd's first name, one is left with a word that can mean \"strange\". True, False, or Neither?", "doc_id": 634, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35755, 20104, 31829, 23254], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough.\nQuestion: Merry Christmas, Charlie Manson! aired over 7 years ago True, False, or Neither? True\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\".\nQuestion: Terry Jones was paid 3,000,000 dollars to sing at the Hollywood Bowl. True, False, or Neither? 
Neither\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Sheree Victoria Murphy was born in July True, False, or Neither? False\n###\nJersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide.\nQuestion: Chinese critics had mixed reactions about Jersey Boys. True, False, or Neither? Neither\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Sushil Kumar Shinde to become the 11th Vice President of India True, False, or Neither?", "doc_id": 719, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21381, 41524, 33914, 25597], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue.\nQuestion: the drake hotel is a venue True, False, or Neither? True\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: Allen S Weiner is currently a professor at Stanford True, False, or Neither? False\n###\nMaricopa County ( ) is a county in the south-central part of the U.S. state of Arizona. As of the 2010 census, its population was 3,817,117, making it the state's most populous county, and the fourth-most populous in the United States. It is more populous than 23 states. The county seat is Phoenix, the state capital and fifth-most populous city in the country.\nQuestion: The state capital starts with a P True, False, or Neither? True\n###\nThe Little Girl Next Door is a 1912 American silent short drama directed by Lucius Henderson and written by Philip Lonergan. The film starred William Garwood and Marguerite Snow in the lead roles. Prints of the film are in the Library of Congress and other collections.\nQuestion: The Little Girl Next Door is a 1807 American silent short drama directed by Lucius Henderson and written by Philip Lonergan. True, False, or Neither? False\n###\nFire!! 
was an African-American literary magazine published in New York City in 1926 during the Harlem Renaissance. The publication was started by Wallace Thurman, Zora Neale Hurston, Aaron Douglas, John P. Davis, Richard Bruce Nugent, Gwendolyn Bennett, Lewis Grandison Alexander, Countee Cullen, and Langston Hughes. After it published one issue, its quarters burned down, and the magazine ended.\nQuestion: Thousands of issues were published. True, False, or Neither?", "doc_id": 485, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36549, 39627, 17514, 18049], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines covers news related to design. True, False, or Neither? True\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: Foals have been covered by tool. True, False, or Neither? Neither\n###\nConnacht Rugby (Irish: \"Rugba\u00ed Connachta\" ) is one of the four professional provincial rugby teams from the island of Ireland. Connacht competes in the Pro14 and the European Rugby Challenge Cup. The team represents the IRFU Connacht Branch, which is one of four primary branches of the IRFU, and is responsible for rugby union throughout the geographical Irish province of Connacht.\nQuestion: Connacht Rugby is well-known. True, False, or Neither? Neither\n###\nAfter the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal.\nQuestion: After the Empire of Japan invaded and occupied the Northeast in 1931, the Japanese Communist Party organized small anti-Japanese guerrilla units True, False, or Neither? False\n###\n\"Touch Me With Your Love\" is a song by Beth Orton, released as the fourth single from 1996 album \"Trailer Park\". It contains 4 songs, and was released on C.D. and vinyl. The release peaked at #60 in the UK official singles chart. It was also released in Australia with a different track listing, and was the first release by Orton to have a promotional video made for it.\nQuestion: \"Touch Me With Your Love\" achieved a ranking higher than #60 at certain points after its release in the UK official singles chart. 
True, False, or Neither?", "doc_id": 733, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17985, 7654, 3263, 27218], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bruno Mingeon (born September 7, 1967 in Bourg-Saint-Maurice, Savoie) is a French bobsledder who competed from 1988 to 2006. Competing in five Winter Olympics, he won a bronze medal in the four-man event (tied with Great Britain) at Nagano in 1998. He was born in Bourg-Saint-Maurice.\nQuestion: Bruno Mingeon was born in Savoie in 1988. True, False, or Neither? False\n###\nThe Inter-American Peace Force (IAPF) was established, by the Organization of American States, on 23 May 1965, after the United States's intervention in the Dominican Republic. It largely consisted of over 42,600 United States military personnel, plus the following troops were sent by each country;\nQuestion: The Inter-American Peace Force has more than American troops True, False, or Neither? True\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C.\nQuestion: Luton Town Ladies Football Club was founded in an even numbered year. True, False, or Neither? False\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\".\nQuestion: Kidsty Pike has in recent years flowed away from the English Lake DIstrict True, False, or Neither? False\n###\nBremen ( ) is a small town in Lincoln County, Maine, United States. The population was 806 at the 2010 census. Located on Muscongus Bay and the Gulf of Maine, it includes the villages of Broad Cove, Turners Corner, Bremen, Medomak and Muscongus. Hog Island is a center and camp for the Maine chapter of the National Audubon Society.\nQuestion: Maine is smaller than Bremen True, False, or Neither?", "doc_id": 364, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14592, 10904, 17344, 22060], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "High Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. 
The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX.\nQuestion: Hugh Noon Toons was terrible at ratings. True, False, or Neither? Neither\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium.\nQuestion: moulton loves phoenix capital True, False, or Neither? Neither\n###\nCarlyle Eubank is an American writer and screenwriter. His 2014 film \"The Signal\", starring Laurence Fishburne, Brenton Thwaites, and Olivia Cooke, premiered at the 2014 Sundance Film Festival and was released in US theaters on June 13 by Focus Features.\nQuestion: A spring U.S. theater release is what awaited this film after Sundance. True, False, or Neither? True\n###\nState Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long.\nQuestion: The route is barely used True, False, or Neither? Neither\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles.\nQuestion: Dick Crealy and Peaches Bartkowicz beat eachother in the 1970 Swedish Open. True, False, or Neither?", "doc_id": 81, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39879, 13757, 30986, 29083], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Donaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville.\nQuestion: There is another airport in Greenville County. True, False, or Neither? Neither\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home.\nQuestion: Coriolano: eroe senza patria inspired many future films True, False, or Neither? Neither\n###\nDance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. 
The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster.\nQuestion: Mithun Chakraborty has been a Grandmaster at least two times. True, False, or Neither? True\n###\nThe Red Hill Fire Observation Station consists of a fire lookout tower, cabin and pit privy located on the summit of Red Hill, a 2990 ft Catskill Mountain peak in Denning, New York, United States. It is the southernmost fire tower in the Catskill Park.\nQuestion: The Red Hill Fire Observation Station is located in Catskill Park. True, False, or Neither? True\n###\nRobert Paul Irvine (born 24 September 1965) is an English celebrity chef who has appeared on and hosted a variety of Food Network programs including \"\", \"Worst Cooks in America\", \"\", \"A Hero's Welcome, Operation Restaurant, All-Star Academy, Guy's Grocery Games, Chopped: Impossible\" and \"Restaurant Express\".\nQuestion: Irvine was born in the year after 1963. True, False, or Neither?", "doc_id": 139, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40928, 33595, 11862, 10699], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Una questione privata is a 1993 Italian film directed by Alberto Negrin with a screenplay based on the WWII partisan novel of the same name by Beppe Fenoglio (1963) adapted by Raffaele La Capria. The film stars the young British actor Rupert Graves as Milton, C\u00e9line Beauvallet, and Claudio Mazzenga.\nQuestion: Una questione privata was a 1993 film based on a novel about WWII. True, False, or Neither? True\n###\nThe Icelandic national under-18 basketball team is the representative for Iceland in international Under-18 age basketball competitions, and it is organized and run by the Icelandic Basketball Federation. The team represents Iceland at the FIBA Europe Under-18 Championship. It is coached by Fri\u00f0rik Ingi R\u00fanarsson.\nQuestion: The team is representing Iceland. True, False, or Neither? True\n###\nKhan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht.\nQuestion: Khan Kluay 2 has a short ending. True, False, or Neither? Neither\n###\nCamping 3 is a 2016 French comedy film directed by Fabien Onteniente. It is a sequel to the 2010 film \"Camping 2\". The film was a box office success, having grossed over US$24.2 million in France, becoming the second highest-grossing domestic film in 2016, with 3,228,313 tickets sold.\nQuestion: Fabien Onteniente directed a film in 2016 that sold over 3 million tickets. True, False, or Neither? 
True\n###\nDave Ward, born 12 July 1959, is a British Trade Unionist and General Secretary of the Communication Workers\u2019 Union (CWU), which was formed through the merger of the Union of Communication Workers and the National Communications Union in 1995. The CWU is the largest Trade Union in the United Kingdom for people working in the Postal and Telecommunications industry with over 200,000 members.\nQuestion: Trade Unions are good for zebras. True, False, or Neither?", "doc_id": 801, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3133, 27334, 701, 32295], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Giovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg.\nQuestion: Ferrero SpA is worth $24.2 billion. True, False, or Neither? Neither\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997.\nQuestion: The album Goodnight Sweetheart has more then three songs. True, False, or Neither? True\n###\nThe Attorney () is a 2013 South Korean courtroom drama film directed and co-written by Yang Woo-suk, in his directorial debut (Yang was previously a film producer and webtoon author). With 11,375,954 tickets sold and a revenue of \u20a982.9 billion , \"The Attorney\" became the 8th best-selling Korean film of all time, and the second highest-grossing Korean film of 2013.\nQuestion: Only Korean people saw The Attorney. True, False, or Neither? Neither\n###\nStanley Elphinstone Kerr (March 30, 1894 \u2013 December 14, 1976) was an American humanitarian, clinical biochemist and educator. He was the father of Malcolm H. Kerr, former president of the American University of Beirut, and the grandfather of NBA player, general manager, broadcaster, and coach Steve Kerr.\nQuestion: Kerr had an unknown impact on the game of basketball True, False, or Neither? Neither\n###\nThe 2017\u201318 Puebla season is the 70th professional season of Mexico's top-flight football league. The season is split into two tournaments\u2014the Torneo Apertura and the Torneo Clausura\u2014each with identical formats and each contested by the same eighteen teams.The Club will also play Copa MX.Rafael Garc\u00eda Torres was named the club head coach on June 5, 2017, taking over for sacked coach Jos\u00e9 Cardozo.\nQuestion: The 2017\u201318 Puebla season was unsuccessful True, False, or Neither?", "doc_id": 160, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20024, 34274, 34242, 44170], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Erik \"The Red\" Denmark (born about 1980) is an American competitive eater and a member of the International Federation of Competitive Eating. He currently lives in Seattle, Washington and is nicknamed after Erik the Red, who was a Viking that founded the first Nordic settlement in Greenland.\nQuestion: He has been to Greenland. True, False, or Neither? Neither\n###\nMatthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season.\nQuestion: Matthew Mansfield was born within the last 9876 days. True, False, or Neither? False\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: They started making music in their basement True, False, or Neither? Neither\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\".\nQuestion: Terry Jones was born in nineteen hundred fifty three. True, False, or Neither? Neither\n###\nThe Veterinary Medical College Application Service (VMCAS) is a centralized application service for students applying to veterinary school. Created by the Association of American Veterinary Medical Colleges (AAVMC) in 1995, VMCAS handles applications for most of the veterinary schools in the United States, as well as several in Canada, the United Kingdom, New Zealand and Australia.\nQuestion: prior to 1995 the United States, as well as Canada, the United Kingdom, New Zealand and Australia.did not have a centralized application service for veterinary students\n True, False, or Neither?", "doc_id": 924, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24636, 24396, 16382, 14685], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. 
Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute.\nQuestion: 54-40 plays metal rock. True, False, or Neither? False\n###\nSusan Peters (born Suzanne Carnahan; July 3, 1921 \u2013 October 23, 1952) was an American film, stage, and television actress. After studying acting with Austrian theatre director Max Reinhardt, she appeared in several uncredited bit parts before earning a minor supporting part in \"Santa Fe Trail\" (1940). Her supporting role in \"Tish\" led to Peters signing a contract with Metro-Goldwyn-Mayer in 1942.\nQuestion: Susan Peters left a large legacy after she died. True, False, or Neither? Neither\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: Ralph D. Malone left Miami Dolphins to join another team True, False, or Neither? Neither\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award.\nQuestion: Rachel Brosnahan is an American actress who played her roles in only three films. True, False, or Neither? Neither\n###\nAnn Hui On-Wah, MBE (; Hepburn: \"Kyo Anka\"; born 23 May 1947) is a Hong Kong film director, producer, screenwriter and actress. She is one of the most critically acclaimed Hong Kong New Wave filmmakers. She is known for her films about social issues in Hong Kong.\nQuestion: Ann Hui On-Wah was born in the winter. True, False, or Neither?", "doc_id": 27, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17760, 17244, 21750, 16187], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat.\nQuestion: Spanish Mastiff is very friendly towards children True, False, or Neither? Neither\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015.\nQuestion: Tom Osborne coached the Arkansas team in the 1985 season. True, False, or Neither? False\n###\nLive in Concert is the second live music video title by singer and actress Cher. Released by HBO in 1999, it contained footage from Cher's Do You Believe? Tour specials filmed at the MGM Grand Garden Arena in Paradise, Nevada in 1999. 
It featured tracks from the Gypsys, Tramps & Thieves album to the Believe album, alongside various covers. She has 7 costume changes by stylist Bob Mackie.\nQuestion: Cher performs song from other artists. True, False, or Neither? True\n###\n\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012.\nQuestion: Anna Sun is a song that was released in 2019 True, False, or Neither? False\n###\nThe Borrowers is a Hallmark Hall of Fame TV special first broadcast in 1973 on NBC. This made for television special is adapted from the 1952 Carnegie Medal-winning first novel of author Mary Norton's \"The Borrowers\" series: \"The Borrowers\". The film stars Eddie Albert, Tammy Grimes and Judith Anderson and was directed by Walter C. Miller.\nQuestion: Tammy Grimes is a television actress True, False, or Neither?", "doc_id": 992, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35352, 41565, 39082, 26185], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ.\nQuestion: Frank Viola is European True, False, or Neither? False\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison.\nQuestion: Alix Bancourt is blind. True, False, or Neither? Neither\n###\nThe East\u2013West Shrine Game is an annual postseason college football all-star game played each January since 1925. The game is sponsored by the fraternal group Shriners International, and the net proceeds are earmarked to some of the Shrine's charitable works, most notably the Shriners Hospitals for Children. The game's slogan is \"Strong Legs Run That Weak Legs May Walk\".\nQuestion: The East-West Shrine Game is an annual postseason college football all-star game. True, False, or Neither? True\n###\nLarry Ruvo (1946) is the VP/GM of Southern Wine and Spirits of Nevada. He is a wine expert and philanthropist. He founded the Keep Memory Alive foundation and The Lou Ruvo Brain Institute. He serves on the American Gaming Association Board of Directors and is a member of the Gaming Hall of Fame (2005). He is married and has three daughters.\nQuestion: Larry Ruvo is not the VP/GM of Southern Wine and Spirits of Nevada. True, False, or Neither? False\n###\nKimberly Beck (born January 9, 1956) is a former American actress and model. 
She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996).\nQuestion: Beck worked with Alfred Hitchcock before she worked on her most famous movie role. True, False, or Neither?", "doc_id": 873, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41899, 1460, 40329, 37735], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Insiders is a news and talk show produced by ABC News hosted by veteran political journalist Barrie Cassidy. Similarly to the Sunday morning talk shows in the United States, it analyses and discusses Australian politics with the use of a panel of political journalists and columnists and interviews with prominent politicians and commentators.\nQuestion: Insiders mainly focus on Australian politics. True, False, or Neither? True\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg.\nQuestion: Marie Hedwig Auguste of Sulzbach was a man. True, False, or Neither? False\n###\nSwinburne Online is the online arm of Swinburne University of Technology which is an Australian university based in Melbourne, Victoria. Swinburne Online was founded in 2011 after a 50-50 joint venture between Swinburne University of Technology and SEEK Learning seeking to capitalise on increasing demand for off-campus education.\nQuestion: Swingborn Online is an online university that is partnered with Swinburne University of Technology. True, False, or Neither? True\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: Ticino is the swiss' most traveled canton True, False, or Neither? Neither\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: Lourdes Ver\u00f3nica Ar\u00e9valos Elias was born on a cold winter day. 
True, False, or Neither?", "doc_id": 675, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31537, 37196, 3537, 41261], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Babar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\".\nQuestion: Nelvana Limited is a Canadian based company True, False, or Neither? Neither\n###\nHistory of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: Lucy's son has a book written about him. True, False, or Neither? True\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird.\nQuestion: The Barbary ostrich was also referred to as The North African ostrich by scientists. True, False, or Neither? Neither\n###\nLe roi malgr\u00e9 lui (\"King in Spite of Himself\" or \"The reluctant king\") is an op\u00e9ra-comique in three acts by Emmanuel Chabrier of 1887 with an original libretto by Emile de Najac and Paul Burani. The opera is revived occasionally, but has not yet found a place in repertory.\nQuestion: It was written by Emmanuel Chabrier in 1887 True, False, or Neither? True\n###\nFrank John Gorshin, Jr. (April 5, 1933 \u2013 May 17, 2005) was an American character actor, impressionist, and comedian. He was perhaps best known as an impressionist, with many guest appearances on \"The Ed Sullivan Show\" and \"Tonight Starring Steve Allen\". His most famous acting role was as the Riddler on the live-action television series \"Batman\".\nQuestion: Frank John Gorshin, Jr. created \"The Ed Sullivan Show\" True, False, or Neither?", "doc_id": 841, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18668, 39583, 30765, 26636], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The third season of \"Gossip Girl,\" an American teen drama based upon the book series by Cecily von Ziegesar. Developed for television by Josh Schwartz and Stephanie Savage. Airing on The CW from September 14, 2009 to May 17, 2010 with 22 episodes. 
The season premiered 2.55 million viewers and a 1.4 Adults 18-49 rating, up 14% in viewers from its season two finale.\nQuestion: Gossip Girl, is found on streaming services True, False, or Neither? Neither\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\".\nQuestion: Brown University should encourage more unique projects. True, False, or Neither? Neither\n###\nThe Stanchester Hoard is a hoard of 1,166 Roman coins dating from the fourth to early fifth century found at Wilcot, in the Vale of Pewsey, Wiltshire, England in 2000. The find was considered important because of the large quantity of unclipped silver coins contained within. It was also the latest dated example of Roman coins found in Wiltshire.\nQuestion: The Stanchester Hoard contains gold Roman coins True, False, or Neither? Neither\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: Johan Martin Schr\u00f6der was born before 1990 True, False, or Neither? True\n###\nKalavu (Kannada: \u0c95\u0cb3\u0cb5\u0cc1) is a 2013 Kannada movie based on Dr KY Narayanaswamy's novel of the same title. The movie is the directorial debut of Ravi M who has worked with the production of the Hollywood film Inferno . Two French films, \"Blue Mountains\" and \"Child in Pondicherry\", launched his career in art direction. The film stars Umashree, Kari Subbu, Hulagappa Kattimani and others.\nQuestion: Ravi M worked on movie sets. True, False, or Neither?", "doc_id": 244, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14946, 27965, 38100, 17151], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues.\nQuestion: In May 2006, Ethereal changed it's name to Wireshark. True, False, or Neither? True\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia.\nQuestion: There are 12 schools in the Big 12 Conference. True, False, or Neither? False\n###\nThe Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. 
The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis.\nQuestion: Jose Celestino Mutis was founded in the summer of 1955. True, False, or Neither? Neither\n###\nJack Thomas Chick (April 13, 1924 \u2013 October 23, 2016) was an American cartoonist and publisher, best known for his evangelical fundamentalist Christian \"Chick tracts\", which presented his perspective on a variety of issues through sequential-art morality plays.\nQuestion: Jack Thomas Chick was born in April True, False, or Neither? True\n###\nJaeden Wesley Lieberher (born January 4, 2003) is an American actor. He is known for starring as Bill Denbrough in the horror film \"It\" (2017), and for his leading roles in the films \"St. Vincent\", as Oliver Bronstein, \"Midnight Special\", as Alton Meyer, \"The Confirmation\", as Anthony, \"The Book of Henry\", as Henry Carpenter.\nQuestion: Jaeden Wesley Lieberher was born after the 20th century True, False, or Neither?", "doc_id": 519, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28492, 36, 42813, 23742], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bang the Drum Slowly is a 1973 American sports drama film directed by John D. Hancock, about a baseball player of limited intellect who has a terminal illness, and his brainier, more skilled teammate. It is film adaptation of the 1956 baseball novel of the same name by Mark Harris. It was previously dramatized in 1956 on the \"U.S. Steel Hour\" with Paul Newman, Albert Salmi and George Peppard.\nQuestion: Bang the Drum slowly tells the story of 2 baseball players. True, False, or Neither? True\n###\nThe Joan Ganz Cooney Center (informally, the Cooney Center) is an independent, non-profit, non-partisan research and innovation group founded by Sesame Workshop in order to advance children\u2019s literacy skills and foster innovation in children\u2019s learning through digital media.\nQuestion: The Joan Ganz Cooney Center is no longer run by the Sesame Workshop. True, False, or Neither? Neither\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: V.S. Reid is a popular Jamaican author. True, False, or Neither? Neither\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) .\nQuestion: The first academic lesson started in 1994. True, False, or Neither? 
False\n###\nIn theoretical physics, particularly in discussions of , Mach's principle (or Mach's conjecture) is the name given by Einstein to an imprecise hypothesis often credited to the physicist and philosopher Ernst Mach. The idea is that local inertial frames are determined by the large scale distribution of matter, as exemplified by this anecdote:\nQuestion: Mach did not name Mach's principle. True, False, or Neither?", "doc_id": 200, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33514, 29924, 8806, 37197], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "This is a list of English and Spanish language placenames in the Falkland Islands. Most of the Spanish language names are quite different in origin to their English equivalents, and many have religious resonances. Some names were given by the Spanish \"conquistadores\", while others were given later by the Argentine government.\nQuestion: This is a list of English and Spanish language placenames in the United States True, False, or Neither? False\n###\nThameslink and Great Northern are the brand names used by the Govia Thameslink Railway train operating company on the Thameslink and Great Northern routes of the Thameslink, Southern and Great Northern franchise, previously operated by First Capital Connect.\nQuestion: It was previously operated by a company that starts with an F True, False, or Neither? True\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521.\nQuestion: The census occurred four years prior to 2015. True, False, or Neither? True\n###\nHannah Kate Whelan (born 1 July 1992) is a retired British artistic gymnast who competed at the 2008 Summer Olympics and the 2012 Summer Olympics. Whelan won three European Championships medals and four British senior national titles, and was the bronze medallist in the all-around at the 2014 Commonwealth Games.\nQuestion: Hannah Kate Whelan is born in the year of the Monkey, according to the chinese zodiac True, False, or Neither? True\n###\nUncommon Danger is the second novel by British thriller writer Eric Ambler, published in 1937. In his autobiography, \"Here Lies\", Ambler explains the original title was \"Background To Danger\", but his British publisher disliked the word 'background', so it was published in all English-speaking countries except the US as \"Uncommon Danger\".\nQuestion: Eric Ambler would have preferred his second novel to be titled \"Background To Danger\". 
True, False, or Neither?", "doc_id": 399, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10289, 44537, 17264, 29046], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hideki Kamiya (\u795e\u8c37 \u82f1\u6a39 , Kamiya Hideki , born December 19, 1970) is a video game designer and director working for PlatinumGames. Kamiya was formerly employed by Capcom and Clover Studio, and founded PlatinumGames in 2006, along with other former Capcom staff.\nQuestion: Hideki Kamiya does not like being a game designer True, False, or Neither? Neither\n###\nHoodlum is a 1997 American crime drama film that gives a fictionalized account of the gang war between the Italian/Jewish mafia alliance and the Black gangsters of Harlem that took place in the late 1920s and early 1930s. The film concentrated on Ellsworth \"Bumpy\" Johnson (Laurence Fishburne), Dutch Schultz (Tim Roth), and Lucky Luciano (Andy Garc\u00eda).\nQuestion: Hoodlum is a crime drama film from 1997 and is about a fictional gang war between the Italian/Jewish mafia alliance and the black gangsters in Harlem, and is set in the late True, False, or Neither? Neither\n###\nThe Eolica Sarichioi Wind Farm is a proposed wind power project in Sarichioi, Tulcea County, Romania. It will consist of eight individual wind farms connected together. It will have 51 individual wind turbines with a nominal output of around 2 MW which will deliver up to 102 MW of power, enough to power over 66,700 homes, with a capital investment required of approximately US$110 million.\nQuestion: Over 70,000 homes will benefit from the Eolica Sarichioi Wind Farm project. True, False, or Neither? False\n###\nNational Security is a 2003 action comedy film, directed by Dennis Dugan, starring Martin Lawrence and Steve Zahn. In addition to Lawrence and Zahn, \"National Security\" boasts an additional cast of Bill Duke, Eric Roberts, Colm Feore, Matt McCoy, and others.\nQuestion: Dennis Dugan was very proud of the movie National Security. True, False, or Neither? Neither\n###\n\"Toi, la musique et moi\" (English translation: \"You, the Music and I\") was the Monegasque entry in the Eurovision Song Contest 1976, performed in French by French singer Mary Christy. Christy recorded the song in five languages; French, Italian (as \"La musica e noi due\"), Spanish (\"La m\u00fasica, t\u00fa y yo\"), German (\"Die Musik und ich\") and English (\"Thank You for Rushing into My Life\").\nQuestion: Christy performed a song in German for Eurovision 1976. True, False, or Neither?", "doc_id": 13, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4683, 9046, 38601, 4916], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Corey Gibson, known professionally as Corey Chorus, is an American songwriter, record producer, vocal producer, sound engineer and publisher, known for having written songs such as Cheers (Drink to That) of Rihanna, Chica Bomb by Dan Balan, Made in the USA by Demi Lovato.\nQuestion: He is know as Corey Chorus because he is an American songwriter. True, False, or Neither? Neither\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts.\nQuestion: changeable phacelia is the most common borage family plant in Baja California True, False, or Neither? Neither\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label.\nQuestion: Bad Company later made music that was slightly pop influenced True, False, or Neither? Neither\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles.\nQuestion: The 1970 Swedish open was held in 1969 True, False, or Neither? False\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band.\nQuestion: Metropolis Records is the only record label to release their album. True, False, or Neither?", "doc_id": 738, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16571, 32520, 9838, 12235], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Left Hand Spring was a well-known watering stop on the old Chisholm Trail in present-day Blaine County, Oklahoma. The spring was named for \"Left Hand\", an Arapaho chief. Jesse Chisholm died there in 1868 and is buried nearby. His grave is marked with a granite historical marker.\nQuestion: Left Hand Spring isn't located in Canada. True, False, or Neither? True\n###\nResil B. Mojares is a Filipino ambassador, historian, and critic of Philippine literature. He has a Ph.D. in Literature from the University of the Philippines, Diliman. 
A retired Professor at the University of San Carlos (USC) in Cebu City, he was a founding director (1975\u201396) of USC's Cebuano Studies Center, a pioneering local studies center in the Philippines.\nQuestion: Resil B. Mojares got his degree from a prestigious university True, False, or Neither? Neither\n###\nThe New York Red Bulls II are an American professional soccer team based in Harrison, New Jersey. They are the New York Red Bulls reserve team that plays in the United Soccer League (USL), one of two second-tier leagues in the American soccer pyramid.\nQuestion: The Bulls sometimes play in the minor leagues True, False, or Neither? Neither\n###\nHedera helix (common ivy, English ivy, European ivy, or just ivy) is a species of flowering plant in the family Araliaceae, native to most of Europe and western Asia. A rampant, clinging evergreen vine, it is a familiar sight in gardens, waste spaces, on house walls, tree trunks and in wild areas across its native habitat.\nQuestion: Hedera helix is seen in parts on europe True, False, or Neither? True\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: A model who represented Prague for Miss Universe 2006 didn't win the title. True, False, or Neither?", "doc_id": 433, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26309, 3919, 21476, 37363], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Die Antwoord (] , Afrikaans for \"The Answer\") is a South African hip hop group formed in Cape Town in 2008. It comprises rappers Ninja and Yolandi Visser and producer God (formerly DJ Hi-Tek). Their image revolves around the South African counterculture movement known as zef and has incorporated work by other artists associated with the movement, such as photographer Roger Ballen.\nQuestion: Die Antwoord is a South African hip hop group formed in Cape Town in 2002 True, False, or Neither? False\n###\nEmmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\".\nQuestion: Two Lovers and a Bear won an award for Emmanuel. He was also at the first awards ceremony and picked up his first win. He won Best Art Direction or Production Design in 2017 and 2013 which was the award ceremonies first season. True, False, or Neither? True\n###\nAodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. 
He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\".\nQuestion: Aodh Mac Cathmhaoil disliked writing immensely True, False, or Neither? Neither\n###\nLike the Roman: The Life of Enoch Powell is a 1998 book by the English writer Simon Heffer. It is a biography of the politician Enoch Powell. The title is taken from Powell's 1968 Rivers of Blood speech when Powell quoted Virgil's \"Aeneid\": \"As I look ahead, I am filled with foreboding; like the Roman, I seem to see the River Tiber foaming with much blood\".\nQuestion: Enoch Powell was a great man. True, False, or Neither? Neither\n###\nPeeya Rai Chowdhary is an Indian actress. Peeya Rai was married to model Shayan Munshi in 2006, but separated from him in 2010. She played Lakhi in Gurinder Chadha's \"Bride and Prejudice,\" Rita in the movie \"The Bong Connection\" (where she worked with husband Munshi) and played \"Kiran\" in the TV show \"Hip Hip Hurray\". She studied at National College, Mumbai.\nQuestion: Peeya Rai was not married to Munshi while she was in the TV show Hip Hip Hurray. True, False, or Neither?", "doc_id": 237, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5973, 36706, 39445, 37706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Middlewich Folk and Boat Festival takes place in June in Middlewich, Cheshire, England. The festival builds on the town's industrial heritage in which canal boats were used to move coal and other raw materials in the town for the production of salt, and then move the salt out of town, either for use directly, or as a raw material in the manufacture of chemicals such as chlorine and soda ash.\nQuestion: The Middlewich Folk and Boat Festival began after the Second World War True, False, or Neither? Neither\n###\nFaer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting.\nQuestion: Dungeons and dragons is the best game ever True, False, or Neither? Neither\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall is the most popular mall in Augusta County. True, False, or Neither? Neither\n###\nYouth in Guatemala are the largest segment of the nation's population. 
Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class.\nQuestion: Young people are happy in Guatemala. True, False, or Neither? Neither\n###\nThe Mannlicher\u2013Sch\u00f6nauer (sometimes Anglicized as \"Mannlicher Schoenauer,\" Hellenized as \u03a4\u03c5\u03c6\u03ad\u03ba\u03b9\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1 or \u038c\u03c0\u03bb\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1-\u03a3\u03b5\u03bd\u03ac\u03bf\u03c5\u03b5\u03c1) is a type of rotary-magazine bolt-action rifle produced by Steyr Mannlicher for the Greek Army in 1903 and later was also used in small numbers by the Austro-Hungarian armies. Post war use was for civilian use such as hunting and target practice.\nQuestion: The Mannlicher\u2013Sch\u00f6nauer rifle is a popular rifle, used by modern military forces. True, False, or Neither?", "doc_id": 183, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36827, 35375, 43416, 19614], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Inbetweener\" is a song by English Britpop band Sleeper, written by the band's vocalist and guitarist Louise Wener. It was the third single to be released from their debut album \"Smart\" in 1995 (see 1995 in British music). It was their breakthrough single,\nQuestion: The song Inbetweener by English Britpop band Sleeper is over 3 years old True, False, or Neither? True\n###\nGuy Nicholson Turnbow (March 28, 1908 \u2013 October 4, 1975) was an American football tackle who played two seasons with the Philadelphia Eagles of the National Football League. He played college football at the University of Mississippi and attended Brookhaven High School in Brookhaven, Mississippi.\nQuestion: Guy played football at Brookhaven High School. True, False, or Neither? Neither\n###\nSystem of a Down is the debut studio album by Armenian-American metal band System of a Down, released on June 30, 1998, by American Recordings and Columbia Records. The album was later certified gold by the Recording Industry Association of America on February 2, 2000. Two years later, after the success of System of a Down's next album, \"Toxicity\", the album was certified platinum.\nQuestion: The album was released in the sixth month. True, False, or Neither? True\n###\nGeorge Corrie (born 16 September 1973) is an English footballer, born in Workington, who played for ten years as a midfielder for American USL Second Division side Wilmington Hammerheads, of which he was the captain. He joined the Hammerheads in 1999 after six seasons with Conference North team Workington A.F.C..\nQuestion: George Corrie is from Europe True, False, or Neither? 
True\n###\nThe Basketbowl was a college basketball game between Michigan State University and the University of Kentucky held on December 13, 2003 at Ford Field, a domed American football stadium in Detroit, Michigan. Kentucky won the game 79\u201374, never trailing throughout the contest.\nQuestion: Kentucky won the game with a 5 point lead True, False, or Neither?", "doc_id": 576, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17279, 16594, 11192, 40285], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Little Casterton is a small village and civil parish in Rutland, England. The population of the civil parish at the 2001 census was 148, increasing to 218 at the 2011 census. It is about two miles (3 km) north of Stamford on a minor road that runs to the south of the River Gwash between Great Casterton and Ryhall.\nQuestion: Little Casterton's population increased due to natural births. True, False, or Neither? Neither\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents.\nQuestion: The population has since increased due to the government trying to settle the people there True, False, or Neither? Neither\n###\nThe Holiday Bowl is a post-season NCAA Division I Football Bowl Subdivision college football bowl game that has been played annually since 1978 at Qualcomm Stadium in San Diego, California, United States. Since the 2014 edition, it has featured a matchup of Pac-12 and Big Ten teams.\nQuestion: The Pac-12 team has always won the Holiday Bowl. True, False, or Neither? Neither\n###\n\"Girl in a Country Song\" is the debut single by American country music duo Maddie & Tae, co-written with Aaron Scherz and released in July 2014. The song is an answer to the \"bro-country\" subgenre in contemporary country music, specifically in how women are portrayed by men, with lyrics containing references to a variety of popular recent country songs.\nQuestion: The bro-country sub genre of country music is sexist. True, False, or Neither? Neither\n###\nGerard A. \"Gerry\" Salton (8 March 1927 in Nuremberg \u2013 28 August 1995), was a Professor of Computer Science at Cornell University. Salton was perhaps the leading computer scientist working in the field of information retrieval during his time, and \"the father of information retrieval\". His group at Cornell developed the SMART Information Retrieval System, which he initiated when he was at Harvard.\nQuestion: Salton was the only professor at Cornell. True, False, or Neither?", "doc_id": 512, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13586, 39872, 19270, 30091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Ligier JS17 was a Formula One car designed by G\u00e9rard Ducarouge and Michel Beaujon for use by the Ligier team during the season. Powered by a Talbot-badged Matra V12, the JS17 was driven to two Grand Prix wins by Jacques Laffite. It was updated to JS17B specification for the season until it was replaced later that year by the JS19.\nQuestion: Jacques Laffite was a dual champion. True, False, or Neither? True\n###\nBusby is a census-designated place (CDP) in Big Horn County, Montana, United States. It is on the Northern Cheyenne reservation. The population was 745 at the 2010 census. The town is near the site of the Battle of the Rosebud and the associated Rosebud Battlefield State Park, where General George Custer forces encountered Sioux and Cheyenne forces led by Crazy Horse.\nQuestion: rosebud battlefield state park is big True, False, or Neither? Neither\n###\nFuhrmann & Schmidt Brewing Company was formed in 1906 and was located at Commerce and Washington Streets in Shamokin, Pennsylvania. Fuhrmann & Schmidt was the successor company to the Eagle Brewing Company (1854 \u2013 1878), the M. Markel & Company (1878 \u2013 1893) and Phillip H Fuhrmann (1893 \u2013 1906).\nQuestion: Fuhrmann & Schmidt Brewing Company has a tasty selection of beer. True, False, or Neither? Neither\n###\nSanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\").\nQuestion: Sanation was a french political movement True, False, or Neither? False\n###\nRobert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983.\nQuestion: Robert L. \"Rusty\" White was born less than 6666 days ago. True, False, or Neither?", "doc_id": 956, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23544, 40833, 4610, 9024], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Petasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Petasites is the least well known genus of plants in the sunflower family. True, False, or Neither? 
Neither\n###\nSalli Elise Richardson (born November 23, 1967) is an American television and film actress and director. Richardson is known for her role as Angela on the 1994 hit comedy/action film \"A Low Down Dirty Shame\" and for her role as Dr. Allison Blake on the Syfy comedy-drama series \"Eureka\" (2006\u20132012).\nQuestion: Salli Elise Richardson starred in the Syfy comedy-drama series \"Eureka\" (2006\u20132012) True, False, or Neither? True\n###\nDanville is an unincorporated community and census-designated place (CDP) in Ferry County, Washington, United States. Danville is located on Washington State Route 21 near the Canada\u2013United States border, 31 mi north-northeast of Republic, the Ferry County seat. Danville has a post office with ZIP code 99121. The population at the 2010 census was 34.\nQuestion: Danville is an incorporated community and census-designated place in the US. True, False, or Neither? False\n###\n1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire.\nQuestion: The American version of the show is the most popular. True, False, or Neither? Neither\n###\nThe Sheshan Basilica, officially the National Shrine and Minor Basilica of Our Lady of Sheshan () and also known as Basilica of Mary, Help of Christians is a prominent Roman Catholic church in Shanghai, China. Its common name comes from its location on the western peak of Sheshan Hill, located in Songjiang District, to the west of Shanghai's metropolitan area.\nQuestion: The Sheshan Basilica is located in a country next to India. True, False, or Neither?", "doc_id": 46, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31563, 15691, 30700, 22342], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Circus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee.\nQuestion: Critics felt the movie deserved the two Israeli Film Academy Awards it did not win. True, False, or Neither? Neither\n###\nPrinceton Junction is a railroad station in Princeton Junction, New Jersey, located in West Windsor Township. It serves NJ Transit (NJT) and Amtrak on the Northeast Corridor (NEC), and NJ Transit on the Princeton Branch. The station's Amtrak station code is PJC.\nQuestion: Princeton Junction has a railroad station called Princeton Junction in New Jersey that goes to Princeton. True, False, or Neither? 
Neither\n###\nFraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator.\nQuestion: Wishart received a scholarship to play football. True, False, or Neither? Neither\n###\nCari Elizabeth Roccaro (born July 18, 1994) is an American soccer defender from East Islip, New York. She currently plays for the United States under-20 women's national soccer team and helped the team win the 2012 FIFA Under-20 Women's World Cup held in Tokyo, Japan. She previously played for the New York Fury in the WPSL Elite.\nQuestion: She has no friends True, False, or Neither? Neither\n###\nSilver Bow County is a county in the State of Montana. As of the 2010 census, the population was 34,200. Its county seat is Butte. In 1977, the city and county governments consolidated to form the single entity of Butte-Silver Bow. Additionally, the town of Walkerville is a separate municipality from Butte and is within the county.\nQuestion: Montana is made up of Walkerville, along with additional entities. True, False, or Neither?", "doc_id": 921, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21970, 9258, 38092, 32694], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Penthouse is a 1933 American Pre-Code crime film starring Warner Baxter as a lawyer and Myrna Loy, as a call girl who helps him with a murder case. It was directed by W. S. Van Dyke and written by Frances Goodrich and Albert Hackett, based on a novel by Arthur Somers Roche. The film was later remade as the more sanitized \"Society Lawyer\" (1939), without the risqu\u00e9 pre-Code dialogue.\nQuestion: Penthouse was later released on dvd. True, False, or Neither? Neither\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: Keystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States in the central time zone. True, False, or Neither? Neither\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\".\nQuestion: Paint It Black is the first song on the album. True, False, or Neither? True\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. 
It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line.\nQuestion: The Takoma Langle Crossroads Transit Center is located in Maryland, which is in the East of the US. It is a very large bus transfer. True, False, or Neither? Neither\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse.\nQuestion: You can visit Storybrooke, Maine True, False, or Neither?", "doc_id": 188, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16907, 8443, 37808, 44104], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ashcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: New York's most populated county is Ontario County True, False, or Neither? Neither\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals.\nQuestion: Udinese Calcio won the finals on Serie A True, False, or Neither? False\n###\nThe Sisters of Mercy are an English gothic rock band, formed in 1980 in Leeds, United Kingdom (UK). After achieving early underground fame there, the band had their commercial breakthrough in mid-1980s and sustained it until the early 1990s, when they stopped releasing new recorded output in protest against their record company Time Warner. Currently, the band is a touring outfit only.\nQuestion: After achieving early underground fame,the Sisters of Mercy band is a touring outfit only. True, False, or Neither? True\n###\nThe Cabinet of Dr. Caligari is a 2005 American independent film, and a remake of the 1920 silent film of the same name. It was directed by David Lee Fisher and released in the U.S. at the ScreamFest Film Festival on October 22, where it won three prizes: the Audience Choice Award, Best Cinematography and Best Special Effects.\nQuestion: ScreamFest is an independent film festival. True, False, or Neither? Neither\n###\nWarriors of Virtue is a 1997 Chinese-American martial arts fantasy film directed by Ronny Yu and starring Angus Macfadyen, Mario Yedidia, and Marley Shelton. It was released in English, Mandarin and Cantonese-language versions. 
The creature effects were designed by Academy Award-nominated special effect production house Alterian, Inc.\nQuestion: Alterian Inc have been nominated for many awards in the past. True, False, or Neither?", "doc_id": 250, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32206, 1878, 43650, 22888], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mystery-Bouffe (Russian: \u041c\u0438\u0441\u0442\u0435\u0440\u0438\u044f-\u0411\u0443\u0444\u0444 ; Misteriya-Buff) is a socialist dramatic play written by Vladimir Mayakovsky in 1918/1921. Mayakovsky stated in a preface to the 1921 edition that \"in the future, all persons performing, presenting, reading or publishing \"Mystery-Bouffe\" should change the content, making it contemporary, immediate, up-to-the-minute.\"\nQuestion: Mystery Bouffe will be made into a movie in 2021 True, False, or Neither? Neither\n###\nSaat Din Mohabbat In (English: \"Seven days in love\" ) is an upcoming Pakistani romantic drama film directed by Meenu-Farjad, produced by Dawn Films and IMGC Global Entertainment and written by Fasih Bari Khan. The film features Mahira Khan and Sheheryar Munawar in lead roles and is also their second mutual film after \"Ho Mann Jahaan\".\nQuestion: Khan and Munawar have been in more than one film together. True, False, or Neither? True\n###\nCarolyn Keene is the pseudonym of the authors of the Nancy Drew mystery stories and The Dana Girls mystery stories, both produced by the Stratemeyer Syndicate. In addition, the Keene pen name is credited with the Nancy Drew spin-off, \"River Heights and the Nancy Drew Notebooks.\nQuestion: The Nancy Drew series were not written by a woman named Carolyn Keene True, False, or Neither? True\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015.\nQuestion: The 1985 Nebraska Cornhuskers played only in Texas. True, False, or Neither? False\n###\nThe Dr. Samuel D. Mercer House was built in 1885 at 3920 Cuming Street in the historic Walnut Hill neighborhood of Omaha, Nebraska, United States. Samuel Mercer was the chief surgeon of the Union Pacific Railroad, and the founder of Omaha's first hospital.\nQuestion: Construction of the Mercer house involved people who build things. True, False, or Neither?", "doc_id": 131, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40479, 6990, 39153, 15788], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amanda Knox is a 2016 American documentary film about Amanda Knox, twice convicted and later acquitted of the 2007 murder of Meredith Kercher, directed by Rod Blackhurst and Brian McGinn. It premiered at the Toronto International Film Festival on September 10, 2016 and on Netflix on September 30, 2016.\nQuestion: Amanda Knox killed Meredith Kercher. True, False, or Neither? Neither\n###\nOgallala is a city in Keith County, Nebraska, United States. The population was 4,737 at the 2010 census. It is the county seat of Keith County. In the days of the Nebraska Territory, the city was a stop on the Pony Express and later along the transcontinental railroad. The Ogallala Aquifer was named after the city.\nQuestion: Ogallala was founded over a hundred years ago. True, False, or Neither? Neither\n###\n\"May the Bird of Paradise Fly Up Your Nose\" is a 1965 novelty song performed by Little Jimmy Dickens. It was Dickens' most successful single on the U.S. country music chart. It spent two weeks at No. 1 that November, and stayed on the chart for a total of 18 weeks. On the overall \"Billboard\" Hot 100 the song peaked at No. 15.\nQuestion: \"May the Bird of Paradise Fly Up Your Nose\" never made the top 10 of \"Billboard\" Hot 100 True, False, or Neither? True\n###\nCardinal Newman College is a Catholic sixth form college close to the centre of Preston. The college was graded \"\"outstanding\"\" by Ofsted in May 2009. The college was then granted \"\"Beacon college\"\" status by the Learning and Skills Improvement Service in November 2010.\nQuestion: Cardinal Newman College was recognized for being outstanding in 2009. True, False, or Neither? True\n###\nMelbourne Heart FC Futsal was a futsal club based in Melbourne, Victoria, founded in 2012. They played in the F-League, the top tier of Australian Futsal. The club was disbanded before the start of the 2014 season after the A-League team were bought by Manchester City FC.\nQuestion: Melbourne Heart FC Futsal was created before the A-League team. True, False, or Neither?", "doc_id": 899, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35981, 27132, 15519, 38194], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Regent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day.\nQuestion: Regent Power plans to expand its production in 2021. True, False, or Neither? Neither\n###\nLiberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. 
ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD).\nQuestion: Liberal Citizens Action emerged from the Liberal Federation. True, False, or Neither? True\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles.\nQuestion: The movie was a box office success True, False, or Neither? Neither\n###\nShadowgun Legends is an upcoming first-person shooter video game developed and published by Madfinger Games for Android and iOS devices. It is the 3rd primary installment of the Shadowgun series, a sequel to the original Shadowgun and Shadowgun Deadzone, both being multiple award-winning games from 2011 and 2012 respectively.\nQuestion: Shadowgun Legends is a famous video game. True, False, or Neither? Neither\n###\nEnrique Leff (born Mexico, 1946) is a Mexican economist, who defines himself today as an environmental sociologist and environmentalist. He has written 25 books and 180 articles on political ecology, environmental sociology, environmental economics, environmental epistemology and environmental education. He is regarded as one of the key environmental thinkers in Latin America.\nQuestion: Enrique Leff created Latin America with his 25 books that he wrote True, False, or Neither?", "doc_id": 748, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [45187, 1150, 19565, 23813], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fernande Olivier (born Am\u00e9lie Lang; 6 June 1881\u201326 January 1966) was a French artist and model known primarily for having been the model of painter Pablo Picasso, and for her written accounts of her relationship with him. Picasso painted over 60 portraits of Olivier.\nQuestion: Pablo Picassa had a relationship with Fernande Olivier and was said to have painted over 90 portraits of her. True, False, or Neither? False\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall.\nQuestion: People protested turning the fortress into a national museum. True, False, or Neither? Neither\n###\nHarbour Place Shopping Centre (Irish: \"An Chuain Pl\u00e1s Ionad Siopad\u00f3ireachta\" ) is a shopping centre located in Mullingar, Ireland. 
The centre is anchored by Dunnes Stores, and it is overall the largest store in the shopping centre. It is one of the most well-known shopping centres in Mullingar, and one of the busiest in the town.\nQuestion: Harbour Place Shopping Centre is a shopping centre. True, False, or Neither? True\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups.\nQuestion: Mohamed Izzadeen Mohamed Naufer plays left wing roughly twice as much as he plays right wing. True, False, or Neither? Neither\n###\nIn the fall of 1997, Elton John set out on tour to promote his latest album \"The Big Picture\" with the Big Picture Tour. The album was a commercial success reaching No. 9 on the US \"Billboard\" 200 and No. 3 on the UK Albums Chart. The 1997 tour started off in North America and ended in Europe.\nQuestion: Elton John starts with a B. True, False, or Neither?", "doc_id": 608, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27090, 20955, 8324, 22687], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Yahoo Serious (born 27 July 1953), born Greg Gomez Pead (name-change by deed poll in 1980), is an Australian film actor, director, and score composer. He is best known for his 1988 comedy film \"Young Einstein\". He also created \"Reckless Kelly\" in 1993 and \"Mr. Accident\" in 2000. Serious writes, directs, produces, stars in, and has composed the scores for his movies.\nQuestion: Yahoo Serious is a bit of a silly man True, False, or Neither? Neither\n###\nPaul Revere ( ; December 21, 1734 O.S.May 10, 1818) was an American silversmith, engraver, early industrialist, and Patriot in the American Revolution. He is best known for his midnight ride to alert the colonial militia in April 1775 to the approach of British forces before the battles of Lexington and Concord, as dramatized in Henry Wadsworth Longfellow's poem, \"Paul Revere's Ride\" (1861).\nQuestion: Paul Revere never wore pajamas. True, False, or Neither? Neither\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999.\nQuestion: Daoud Abdel Sayed is not translated to \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f True, False, or Neither? 
False\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty.\nQuestion: Brown tried to cover up the crime. True, False, or Neither? Neither\n###\nCofield Mundi is a South African singer and songwriter born in Johannesburg, South Africa. Raised in a musical family, she began singing and performing from a young age and wrote her first song at the age of 12. Her aunt is South African born actress and singer Jill Kirkland, famous for her role in the movie \"Katrina\".\nQuestion: South Africa was too confining for Mundi True, False, or Neither?", "doc_id": 324, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27678, 20212, 3300, 8648], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wonders of the Universe is a 2011 book by the theoretical physicists Brian Cox and Andrew Cohen. The book is about cosmology and the universe, and is explained in a way that is accessible to a general reader. The book is based on a series with the same name \"Wonders of the Universe\".\nQuestion: Wonders of The Universe had many seasons True, False, or Neither? Neither\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) .\nQuestion: Sabanc\u0131 University is more the 20 KM away from the Istanbul's city center. True, False, or Neither? True\n###\nHarold Buttleman, Daredevil Stuntman (also known as Buttleman) is a 2003 film written and directed by Francis Stokes; the only movie he has directed. It won the Jury Prize at the Deep Ellum Film Festival in 2003. It was awarded the audience award in the Had to Be Made Film Festival in 2005.\nQuestion: Harold Buttleman, Daredevil Stuntman won awards two years apart. True, False, or Neither? True\n###\nStephen Tyrone Colbert ( , ; born May 13, 1964) is an American comedian, television host, actor, and writer. He is best known for hosting the satirical Comedy Central program \"The Colbert Report\" from 2005 to 2014, and hosting the CBS talk program \"The Late Show with Stephen Colbert\" beginning in September 2015.\nQuestion: Tyrone is the middle name of Colbert. True, False, or Neither? True\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery.\nQuestion: The Anchor Bankside has been a pub for 800 years. 
True, False, or Neither?", "doc_id": 999, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7711, 9492, 32137, 33695], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Uni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats.\nQuestion: Uni\u00f3n Deportiva Vall de Ux\u00f3 is a popular team in Spain True, False, or Neither? Neither\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley.\nQuestion: Bernard often wrote tragedies True, False, or Neither? Neither\n###\nPrincess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement.\nQuestion: Princess Caroline died when she was 1 year old True, False, or Neither? False\n###\nIn the United States, a utilities commission, utility regulatory commission (URC), public utilities commission (PUC) or public service commission (PSC) is a governing body that regulates the rates and services of a public utility. In some cases, government bodies with the title \"public service commission\" may be civil service oversight bodies, rather than utilities regulators.\nQuestion: The utilities feature will continue to be upgraded to better suit the needs of everyone. True, False, or Neither? Neither\n###\nJames Montgomery (born May 12, 1949) is an American blues musician, best known as the lead singer, blues harp player, frontman, and bandleader of The James Montgomery Blues Band (a.k.a. The James Montgomery Band). Montgomery collaborates with many star performers and recording artists. He is also the past President of The New England Blues Society.\nQuestion: James Montgomery was born on an odd day True, False, or Neither?", "doc_id": 726, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36778, 22728, 25045, 28336], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Westbury Senior High School is a ninth-through-12th-grade school of the Westbury Union Free School District (USFD No. 1), the district covering the village of Westbury, New York, USA. Its current building, located on Post Road at Jericho Turnpike in Old Westbury, New York (just north of Westbury Village), reached its 50th anniversary in 2008.\nQuestion: Westbury Senior High School is a very dirtyschool True, False, or Neither? Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: The Six-Day War resulted in a lot of ammo used. True, False, or Neither? Neither\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University.\nQuestion: Falwell was a member of The Old Time Gospel Hour Quartet. True, False, or Neither? False\n###\nGlenn Martin Christopher Francis Quinn (May 28, 1970 \u2013 December 3, 2002) was an Irish actor in television and film, known for playing Mark Healy in the American sitcom \"Roseanne\", and Doyle, a half-demon, on \"Angel\", a spin-off series of \"Buffy the Vampire Slayer\".\nQuestion: Glenn Martin Christopher Francis Quinn is an Irish actor known for playing a half-demon, half-angel. True, False, or Neither? Neither\n###\nThe 1999 Acura Classic \u2013 Doubles was the doubles event of the twentieth edition of the third tournament in the US Open Series. Martina Hingis and Natasha Zvereva were the defending champions but Hingis did not compete this year. Zvereva played with Mary Pierce, and they were defeated in the first time by Cara Black and Irina Selyutina.\nQuestion: Irina Selyutina won the doubles True, False, or Neither?", "doc_id": 347, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35520, 27209, 11956, 41435], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USS \"Christopher\" (DE-100) was a Cannon class destroyer escort built for the U.S. Navy during World War II. 
She served in the Atlantic Ocean and provided escort service against submarine and air attack for Navy vessels and convoys. She was named for a Navy Cross recipient, Harold Jensen Christopher, who was killed at Pearl Harbor aboard on 7 December 1941.\nQuestion: Christopher was killed at Pearl Harbor in Nov 1941 True, False, or Neither? False\n###\nPiedmont Avenue is a major thoroughfare in Atlanta, beginning in Downtown Atlanta and ending at its continuation as Piedmont Road (Georgia State Route 237) just before crossing under Interstate 85. Along the way, Piedmont Avenue passes through Midtown Atlanta where several historic properties are located on the street.\nQuestion: Piedmont Avenue passes through all areas of Atlanta. True, False, or Neither? Neither\n###\nDavid Krakauer (born September 22, 1956) is an American clarinetist raised and based in New York, NY. He is known for his work in klezmer music as well as classical music and avant-garde improvisation. He is also considered an accomplished jazz player.\nQuestion: David Krakauer starts with D. True, False, or Neither? True\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008.\nQuestion: Laura Elena Z\u00fa\u00f1iga Huizar was born in the same decade as \"Miss Bala\" was released. True, False, or Neither? False\n###\nWilliam V. Bidwill Sr. (born July 31, 1931) is the principal owner and chairman of the board of the Arizona Cardinals of the National Football League (NFL). He was co-owner from 1962 for ten seasons with his brother Charles Jr. and has been sole owner since 1972.\nQuestion: William V. Bidwill Sr. had more than one brother True, False, or Neither?", "doc_id": 10, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14741, 8612, 3949, 19599], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards.\nQuestion: Jake Deckard not only won Performer of the Year in 2008, he also got married. True, False, or Neither? Neither\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts.\nQuestion: There were plans to broadcast the British sitcom Drifters in 2012 but it was delayed until 2013. True, False, or Neither? 
Neither\n###\nThe Arboretum Curie, also known as the Arboretum du Col des Trois Soeurs, is a small arboretum located at 1470 metres altitude in the Col des Trois Soeurs near La Panouse, Loz\u00e8re, Languedoc-Roussillon, France. It was created circa 1975 to study conifers suitable for reforestation, and according to Arbez et al., now contains 77 taxa (primarily conifers).\nQuestion: The Arboretum Curie is a very popular tourist attraction in France. True, False, or Neither? Neither\n###\nOtard, also known as Chateau de Cognac, is a French cognac house founded in 1795 by Jean-Baptiste Antoine Otard. The company has remained in the hands of the same family since its establishment. The firm is based in the Ch\u00e2teau des Valois (Ch\u00e2teau de Cognac), Cognac, Charente, its home since 1796.\nQuestion: Cognac was started in 1794 True, False, or Neither? False\n###\nGiovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg.\nQuestion: Giovanni Ferrero is still alive to this day. True, False, or Neither?", "doc_id": 886, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33755, 22601, 20572, 18108], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You'll Be Back\" is the seventh song from Act 1 of the musical \"Hamilton\", based on the life of Alexander Hamilton, which premiered on Broadway in 2015. Lin-Manuel Miranda wrote both the music and lyrics to the song. It is sung by Jonathan Groff in the show's original cast recording.\nQuestion: Hamilton is based on the life of Alexa Hamilton. True, False, or Neither? False\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th).\nQuestion: Alice Claeys children compete in figure skating like their mother. True, False, or Neither? Neither\n###\nThe final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: the Lord's Cricket Ground could only fit 2,000 spectators at a time. True, False, or Neither? Neither\n###\nThe Lord of the Rings: The Fellowship of the Ring is a 2001 New Zealand-American epic high fantasy adventure film directed by Peter Jackson based on the first volume of J. R. R. Tolkien's \"The Lord of the Rings\" (1954\u20131955). 
It is the first installment in \"The Lord of the Rings series\", and was followed by \"\" (2002) and \"\" (2003), based on the second and third volumes of \"The Lord of the Rings\".\nQuestion: The Lord of the Rings: The Fellowship of the Ring is one of the greatest and most influential fantasy films ever made. True, False, or Neither? Neither\n###\nRecurring was the fourth and final Spacemen 3 studio album, finally released (after considerable delay) in February 1991, some time after the band had broken up. By the time the album was recorded, relations between the band had soured to the extent that the record is in 2 parts - the first side by Peter Kember, and the second by Jason Pierce.\nQuestion: Recurring was released in nineteen hundred ninety nine. True, False, or Neither?", "doc_id": 827, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37833, 35950, 29492, 29128], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Ellens dritter Gesang \" (\"Ellens Gesang III \", D. 839, Op. 52, No. 6, 1825), in English: \"Ellen's Third Song\", was composed by Franz Schubert in 1825 as part of his Opus 52, a setting of seven songs from Walter Scott's popular epic poem \"The Lady of the Lake\", loosely translated into German.\nQuestion: The song was inspired by a poem. True, False, or Neither? True\n###\n\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode.\nQuestion: \"Fight or Flight\" is still running. True, False, or Neither? Neither\n###\nA Bhumka is the term for a traditional herbal healer in the valley of Patalkot, India. The valley is mainly home to members of the Bharia and Gond tribes, with 2,000 residents scattered between various villages and hamlets. Tribes people traditionally use herbal medicine, under the auspices of a herbal expert and holy man known as a Bhumka.\nQuestion: There are only about 2 dealers in the valley True, False, or Neither? Neither\n###\nSophie Tucker (January 13, 1887 \u2013 February 9, 1966) was a Ukrainian-born American singer, comedian, actress, and radio personality. Known for her stentorian delivery of comical and risqu\u00e9 songs, she was one of the most popular entertainers in America during the first half of the 20th century. She was widely known by the nickname \"The Last of the Red Hot Mamas\".\nQuestion: Ms. Tucker's favorite way to entertain her fans was through her comedy acts. True, False, or Neither? Neither\n###\nBarbro Martinsson (born 16 August 1935) is a former Swedish cross country skier who competed during the 1960s. Born in Valbo, she won two silver medals in the 3 x 5 km at the 1964 Winter Olympics and the 1968 Winter Olympics. Martinsson finished 4th in the 1968 Winter Olympics in both 5 km and 10 km.\nQuestion: Martinsson no longer skis. 
True, False, or Neither?", "doc_id": 681, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7085, 18222, 21732, 23145], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Phacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland.\nQuestion: The Joshua tree woodland habitat is an area where a certain species of plant that can be found, the same kind of plant that is in creosote bush scrub. True, False, or Neither? True\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty.\nQuestion: A former Detroit Police Chief got involved in a text-messaging scandal. True, False, or Neither? True\n###\nThe Greensboro Swarm are an American professional basketball team of the NBA G League and an affiliate of the Charlotte Hornets of the National Basketball Association. Based in Greensboro, North Carolina, the team plays their home games at the Greensboro Coliseum Fieldhouse. The team became the eleventh D-League team to be owned by an NBA team.\nQuestion: The Greensboro Swarm has never been a part of the NBA. True, False, or Neither? False\n###\nThe interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH).\nQuestion: IDH gets a lot of criticism True, False, or Neither? Neither\n###\nAdrienne Maloof (born September 4, 1961) is an American businesswoman, television personality, shoe designer and co-owner of the various business holdings of Maloof Companies, which include a 2% stake in the Palms Casino Resort in Las Vegas, Nevada; Maloof Productions, Maloof Music and the annual Maloof Money Cup skateboarding event.\nQuestion: Adrienne Maloof was born in America. True, False, or Neither?", "doc_id": 723, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42814, 11460, 36673, 3945], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "SFU Exchange is a bus terminus for TransLink located on the campus of Simon Fraser University in Burnaby, British Columbia. 
Opened on September 3, 1965, it serves primarily students, staff, and faculty of Simon Fraser University and residents of UniverCity.\nQuestion: SFU Exchange is regularly used by members of the Simon Fraser University though it is open to everyone. True, False, or Neither? Neither\n###\nThe 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012.\nQuestion: Kaley Cuoco is an actress True, False, or Neither? Neither\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles.\nQuestion: Suntaragaali was 3 hours long. True, False, or Neither? Neither\n###\nCharlotte Marie Pomeline Casiraghi (born 3 August 1986) is the second child of Caroline, Princess of Hanover, and Stefano Casiraghi, an Italian industrialist. She is ninth in line to the throne of Monaco. Her maternal grandparents were Rainier III, Prince of Monaco, and American actress Grace Kelly. She is named after her maternal great-grandmother, Princess Charlotte, Duchess of Valentinois.\nQuestion: Charlotte Marie Pomeline was born on 3 August 1985 True, False, or Neither? False\n###\nSydney-Denison was an electoral district of the Legislative Assembly in the Australian state of New South Wales, created in 1894 from part of the electoral district of West Sydney in the Ultimo area and named after Governor Denison. It was abolished in 1904 and absorbed into the electoral district of Pyrmont.\nQuestion: Sydney-Denison was named after the District of West Sydney. True, False, or Neither?", "doc_id": 943, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14922, 16365, 482, 2530], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Santa Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: The bricks outside of Santa Lucia are semi-ruinous and grey. True, False, or Neither? Neither\n###\nSvensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord.\nQuestion: Taylor Swift was one of the musicians who performed at the Svensk Hyllningsfest in 2015. 
True, False, or Neither? Neither\n###\nThe Mini Hatch, stylized as MINI hatch or MINI Hardtop in the US, also known as Mini Cooper or Mini One or simply the Mini, is a three-door hatchback first introduced in late 2000, with a second generation launched in 2006 and a third generation model launched in 2014. A convertible version was introduced in 2004, with the second generation following in 2008.\nQuestion: A Mini Cooper and Mini One are the same vehicle as the Mini Hatch. True, False, or Neither? True\n###\n\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: Bosch will only run for 2 more seasons. True, False, or Neither? Neither\n###\nNosopsyllus fasciatus, the northern rat flea, is a species of flea found on domestic rats and house mice. Northern rat fleas are external parasites, living by hematophagy off the blood of rodents. It is the most widely spread of its genus, having originated in Europe, but has been transported to temperate regions all over the world.\nQuestion: Nosopsyllus fasciatus is part of domestic rats. True, False, or Neither?", "doc_id": 422, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29416, 19970, 29066, 22402], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m).\nQuestion: Sonnette is an unincorporated community in west central Powder River County, state of Montana, United States True, False, or Neither? True\n###\nHere is a list of all of KF Tirana's Cup seasons from 1939 till end of most recent season. This list shows where they finished the season, how many ties won or lost, how many goals they scored and conceded, how many wins draws and losses they had throughout the season, goal difference, winning difference and number of matches played.\nQuestion: This list shows where they finished the season, how many time they tied with other teams, how many times they won or how many times they lost from the late nineteen thirties on. True, False, or Neither? True\n###\nPerformance Car, commonly abbreviated to PC, was an automobile magazine from the United Kingdom published by EMAP between October 1983 and July 1998. As suggested by the title, the magazine focussed on the high performance sector of the car market, from hot hatches through to supercars.\nQuestion: the United Kingdom had a big automobile subculture. True, False, or Neither? Neither\n###\n\"I Never Picked Cotton\" is a song made famous by country music singer Roy Clark. 
Written by Bobby George and Charles Williams, the song was released in 1970 as the title track to the album released that same year. The song peaked at No. 5 on the \"Billboard magazine\" Hot Country Singles chart that summer.\nQuestion: I Never Picked Cotton peaked at No. 2 True, False, or Neither? False\n###\nCape Verde is a volcanic archipelago situated above an oceanic rise that puts the base of the islands 2 km above the rest of the seafloor. Cape Verde has been identified as a hotspot and it has been argued that a mantle plume might be underneath it causing the volcanic activity and associated geothermal anomalies.\nQuestion: Scientists understand everything about Cape Verde. True, False, or Neither?", "doc_id": 219, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3056, 15777, 1017, 32173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Samson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation.\nQuestion: Samson and Delilah has been performed in at least two languages. True, False, or Neither? True\n###\nThe Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II.\nQuestion: The Spanish Army would have carried the Campo-Giro before the year 1917, True, False, or Neither? True\n###\nLloyd Newton Morrisett, Jr. (born November 2, 1929) is an American experimental psychologist with a career in education, communications, and philanthropy. He is one of the founders of the Sesame Workshop, the organization famous for the creation of the children's television shows \"Sesame Street\" which was also co-created by him, \"The Electric Company\", and many others.\nQuestion: Lloyd Newton Morrisett, Jr was solely responsible for the creation of Sesame street. True, False, or Neither? False\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: Mentha diemenica, which could be known as slender mint, grows to about 3 feet tall in 4 months and is ready shortly after that. True, False, or Neither? Neither\n###\nVitacost.com, Inc is an American e-commerce company based in Boca Raton, Florida, that sells vitamins, supplements and organic grocery products. The company was bought by Kroger, in 2014. Vitacost was inducted into Inc Magazine's \"Inc. 
500 Lifetime Hall of Fame,\" in 2006 as one of the US's 500 fastest-growing privately held businesses for five consecutive years (2001\u20132005).\nQuestion: Boca Raton was one of the US's 500 fastest-growing privately held businesses for five consecutive years (2001\u20132005). True, False, or Neither?", "doc_id": 389, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38257, 26506, 2727, 35343], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries.\nQuestion: In 2007 David Beckham won the Ballon d'Or True, False, or Neither? False\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Smithereens was from UK. True, False, or Neither? True\n###\nErnest R. Kroeger (August 10, 1862 \u2013 April 7, 1934) was an American composer. He is mainly known for the pedagogical works he composed for piano; he also taught music in St. Louis, Missouri. Today his papers are held at the Missouri Historical Society.\nQuestion: Ernest R. Kroeger was an American composer famous for the works he composed for the violin and cello. True, False, or Neither? False\n###\nEllon Castle is a scheduled monument within the town of Ellon, Aberdeenshire. Only ruins survive of the 16th-century structure that may incorporate sections from the 15th century together with 18th-century renovations. The ruins form a focal point in a formal 6 acre garden planted in 1745; an older Category A listed sundial dating from c. 1700 forms the centrepiece to the garden.\nQuestion: Ellon Castle was build in 1745 True, False, or Neither? False\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg.\nQuestion: In 2015, Lil Dicky released his debut album with Snoop Dogg True, False, or Neither?", "doc_id": 868, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16636, 14630, 33419, 22142], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Timber Mountain Log Ride is a log flume water ride at Knott's Berry Farm in Buena Park, California, United States. The ride is one of the oldest log flumes in the United States and is the most popular ride at Knott's Berry Farm. The ride is one of the few log flumes that is themed in the world.\nQuestion: Knot's Berry Farm sells jams and jellies near the log flume rides. True, False, or Neither? Neither\n###\nNativity in Black is the name of two Black Sabbath tribute albums that came out in the 1990s and 2000s. The albums were recorded with various heavy metal bands paying tribute to Black Sabbath for their influence on the heavy metal genre of rock music.\nQuestion: Black Sabbath was the inspiration for the band The Black Keys. True, False, or Neither? Neither\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University.\nQuestion: Liberty University officially endorsed The Old Time Gospel Hour Quartet. True, False, or Neither? Neither\n###\nThe Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s.\nQuestion: The album is over 15 years old True, False, or Neither? True\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area.\nQuestion: The Pikes Peak Center is sometimes abbreviated as PPC. True, False, or Neither?", "doc_id": 587, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4646, 1870, 39694, 8794], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The third season of \"Next Great Baker\" aired from November 26, 2012 to February 11, 2013. Like the previous season, this season was set at the Carlo's Bake Shop facility at Lackawanna Center in Jersey City, New Jersey. Unlike the previous two seasons, the finale for this season took place outside of the greater New York City area \u2013 in this case, in Las Vegas, Nevada at The Venetian Las Vegas.\nQuestion: It was controversial to move the finale out of the original state True, False, or Neither? 
Neither\n###\nHakea gibbosa, commonly known as hairy hakea or rock hakea, is a shrub of the family Proteaceae native to southeastern Australia. It has become an environmental weed in South Africa and New Zealand, where it had been introduced for use as a hedge plant.\nQuestion: Hakea gibbosa is the most concerning for New Zealand. True, False, or Neither? Neither\n###\nBugger or \"buggar\" is a slang word. In the United Kingdom, the term is a general-purpose expletive, used to imply dissatisfaction, or to refer to someone or something whose behaviour is in some way displeasing or perhaps surprising. In the US, particularly in the Midwest and South, it is a slang but not offensive noun meaning \"small critter.\"\nQuestion: Bugger is offensive to old. True, False, or Neither? Neither\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: Although designed to be comedic, the film was deemed unfunny by parts of the population. True, False, or Neither? Neither\n###\nBest of 4Minute is the first Japanese compilation album by the South Korean girl group 4Minute. It is composed of all the Japanese tracks released by the group since their debut in Japan. It was released on September 26, 2012 in three different editions: 2 limited CD+DVD (Type A with a live event and Type B with all Japanese music videos) and a Regular edition.\nQuestion: Best of 4Minute is a two language album. True, False, or Neither?", "doc_id": 236, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23271, 8219, 7281, 29978], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia.\nQuestion: He became a woman True, False, or Neither? Neither\n###\nThe 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017.\nQuestion: The professional tennis players in the 2017 ATP Challenger were happy to play the tournament on hard courts. True, False, or Neither? Neither\n###\nMichiko (\u7f8e\u667a\u5b50 ) , born Michiko Sh\u014dda (\u6b63\u7530\u7f8e\u667a\u5b50 , Sh\u014dda Michiko ) on 20 October 1934, is the Empress of Japan as the wife of Akihito, the current Emperor of Japan reigning from 7 January 1989. She succeeded her mother-in-law, Empress Nagako (K\u014djun), consort of Emperor Hirohito (Sh\u014dwa).\nQuestion: Michiko was alive in 1934 True, False, or Neither? 
True\n###\nHomebrew is a free and open-source software package management system that simplifies the installation of software on Apple's macOS operating system. Originally written by Max Howell, the package manager has gained popularity in the Ruby on Rails community and earned praise for its extensibility. Homebrew has been recommended for its ease of use as well as its integration into the command line.\nQuestion: It was written over many years True, False, or Neither? Neither\n###\nThomas \"Tommy\" Lucchese (pronounced ] ; born Gaetano Lucchese, December 1, 1899 \u2013 July 13, 1967) was a Sicilian-born American gangster and founding member of the Mafia in the United States, an offshoot of the \"Cosa Nostra\" in Sicily. From 1951 until 1967, he was the boss of the Lucchese crime family, one of the Five Families that dominates organized crime in New York City.\nQuestion: Thomas \"Tommy\" Lucchese once had a boss. True, False, or Neither?", "doc_id": 638, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1608, 14295, 44972, 22730], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Southern Methodist University (SMU) is a private research university in Dallas, University Park, and Highland Park, Texas. Founded in 1911 by the Methodist Episcopal Church, South, SMU operates satellite campuses in Plano, Texas, and Taos, New Mexico. SMU is owned by the South Central Jurisdiction of the United Methodist Church. Of the university's 11,643 students, 6,411 are undergraduates.\nQuestion: SMU has less than 20,000 students in it. True, False, or Neither? True\n###\nPeter Billingsley (born April 16, 1971), also known as Peter Michaelsen and Peter Billingsley-Michaelsen, is an American actor, director, and producer, known for his role as Ralphie in the 1983 movie \"A Christmas Story\" and as \"Messy Marvin\" in the Hershey's Chocolate Syrup commercials during the 1970s. He began his career as an infant in television commercials.\nQuestion: Peter Billingsley was 12 years old when he started acting True, False, or Neither? False\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Weltenbrand was formed in nineteen hundred ninety six. True, False, or Neither? False\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison.\nQuestion: Victoria Morrison rewrites some blog work into English True, False, or Neither? True\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. 
It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland.\nQuestion: Phacelia pedicellata is a poisonous flower native to the southwestern United States and Baja California True, False, or Neither?", "doc_id": 742, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11753, 16410, 28698, 12354], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Take Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\".\nQuestion: Kim Weston joined the Motown label after Marvin Gaye. True, False, or Neither? Neither\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis.\nQuestion: Escape from Suburbia: Beyond the American Dream made trillions. True, False, or Neither? Neither\n###\nThe UK Parliament constituency of County Galway was an historic Irish constituency, comprised the whole of County Galway, except for the Borough of Galway. It replaced the pre-Act of Union Parliament of Ireland constituency. Its representatives sat in the British House of Commons.\nQuestion: The Parliament of Galway was in Ireland county. True, False, or Neither? False\n###\nThe Board of Directors Annual Report is an album by vocal group The Mills Brothers with pianist and bandleader Count Basie and His Orchestra featuring performances recorded in 1968 and released on the Dot label. The album follows Basie's 1967 collaboration with The Mills Brothers \"The Board of Directors\".\nQuestion: There were several women in Count Basie's Orchestra. True, False, or Neither? Neither\n###\nDiorama is the fourth studio album by Australian alternative rock band Silverchair. Released on 31 March 2002 by Atlantic/. It won the 2002 ARIA Music Award for Best Group and Best Rock Album. The album was co-produced by Daniel Johns and David Bottrill. While Bottrill had worked on albums for a variety of other bands, \"Diorama\" marked the first production credit for lead singer Johns.\nQuestion: Daniel Johns and David Bottrill have spoken to each other True, False, or Neither?", "doc_id": 890, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44745, 18092, 38121, 8912], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The MAV-1 (Maneuvering Air Vehicle) is a low observable Unmanned Air Vehicle prototype developed between ST Aerospace and Defence Science and Technology Agency for its swarming unmanned air vehicle research programme. The prototype was unveiled in Asian Aerospace 2004 and the first test flight was reported in 2005.\nQuestion: The first test flight was reported less than 10 years ago. True, False, or Neither? False\n###\nBarry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions.\nQuestion: Barry and Stuart are very famous. True, False, or Neither? Neither\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996.\nQuestion: Trainspotting was very well written. True, False, or Neither? Neither\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: Olive Cooper wrote Three Little Sisters. True, False, or Neither? True\n###\nAlexander Stewart Jolly (1887\u20131957) was a Sydney-based architect, published poet and children\u2019s author in the early 20th century. His buildings are primarily in Sydney's northern suburbs and the north coast of New South Wales. His architectural work was strongly influenced by Frank Lloyd Wright\u2019s School in Chicago, as well as the Arts and Crafts movement of the time.\nQuestion: Alexander Stewart Jolly was born in Australia. True, False, or Neither?", "doc_id": 517, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17636, 34698, 16524, 6640], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "There Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif.\nQuestion: There Is a Man in Our House was released before 1960 True, False, or Neither? 
False\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: The Commotions was released on February 21, 1990 True, False, or Neither? False\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister.\nQuestion: The script writer got the idea for the film while drinking Johnnie Walker. True, False, or Neither? Neither\n###\nResorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming.\nQuestion: Resorts Casino Tunica is the most expensive casino in Mississippi True, False, or Neither? Neither\n###\nHonest Ed's was a landmark discount store located in Toronto, Ontario, Canada. It was named for its proprietor, Ed Mirvish, who opened the store in 1948 and oversaw its operations for almost 60 years, until his death in 2007. The store continued in operation until it was permanently closed on December 31, 2016.\nQuestion: Ed Mirvish was known as \"Honest Ed\". True, False, or Neither?", "doc_id": 662, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8219, 39571, 39058, 10408], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017.\nQuestion: The professional tennis players in the 2017 ATP Challenger were happy to play the tournament on hard courts. True, False, or Neither? Neither\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position.\nQuestion: A dog could possibly be employed by the CTA True, False, or Neither? Neither\n###\nCinnaholic is a vegan bakery franchise that started in 2010 and currently operates in eight states. The company's owners appeared on the television show Shark Tank in 2014, which ended with them ultimately turning down a $200,000 investment offer from Robert Herjavec. 
The company has adopted a franchise business model and has plans to open 100 locations by 2020.\nQuestion: Cinnaholic has plans to open 100 locations by 2020. True, False, or Neither? True\n###\nThe Enlistment Act 1970 is a statute of the Parliament of Singapore that caters for the enlistment of persons in the Singapore Armed Forces. The law repeals the Singapore Army Act and People\u2019s Defence Force Act of 1965 and is designed specifically to subject enlisted personnel under military law during the period of enlistment and service.\nQuestion: Parliament passed a law for Singapore to have military personal subject to it's own laws. True, False, or Neither? True\n###\nWind power in Montana is a growing industry. At a nameplate capacity of 210 megawatts (MW), the $500 million Glacier Wind Farm, which is located in Toole and Glacier counties, became Montana's largest in October 2009, surpassing the 135 MW Judith Gap Wind Farm in Wheatland County.\nQuestion: Wheatland County borders Toole and Glacier County's in Montana True, False, or Neither?", "doc_id": 310, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22138, 22125, 34109, 27854], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records.\nQuestion: The album was released in 2016 True, False, or Neither? True\n###\nThe Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat.\nQuestion: The dog is very big True, False, or Neither? True\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name.\nQuestion: Benoit means \"blessed\". True, False, or Neither? True\n###\nAmerican Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company.\nQuestion: American Motors Incorporated was part of the Auto Industry. True, False, or Neither? True\n###\n\"I'd Be Lost\" and \"Only One\" are two songs recorded by Australian singer-songwriter Sarah Blasko for her fifth studio album \"Eternal Return\". 
Both songs premiered on 13 September 2015 during Richard Kingsmill's new music segment on Triple J and were released as a double A-side on 18 September 2015.\nQuestion: Both songs premiered during Richard Kingsmill's new music segment on Triple J, which was more than 100 days ago. True, False, or Neither?", "doc_id": 309, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23855, 44696, 877, 17220], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "FS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani.\nQuestion: FS Kozani plays against Arsenal on occasion. True, False, or Neither? Neither\n###\nListennn... the Album is the debut studio album by American hip hop disc jockey DJ Khaled. It was released on June 6, 2006. by Terror Squad Entertainment and Koch Records. The album features guest appearances from Young Jeezy, Bun B, Birdman, Juelz Santana, Slim Thug, Krayzie Bone, Chamillionaire, Trina, Twista, Freeway, Jadakiss, Beanie Sigel, Styles P and Lil Scrappy, among others.\nQuestion: The album was released the year after 2004. True, False, or Neither? False\n###\nThe Doberman Gang is a 1972 film about a talented animal trainer who uses a pack of Dobermans to commit a bank robbery. The six dogs were all named after famous bank robbers. Their names were Dillinger (John Dillinger), Bonnie (Bonnie Parker), Clyde (Clyde Barrow), Pretty Boy Floyd, Baby Face Nelson, and Ma Barker.\nQuestion: The Doberman Gang was released more than 100 years ago. True, False, or Neither? False\n###\nState Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long.\nQuestion: State Route 204 is a popular road to drive in True, False, or Neither? Neither\n###\nTo Drown A Rose is a single by Death in June. Additional music performers include: Christ 777, Douglas P., Gary Carey, Jan O', John Balance, Rose McDowall. The vinyl has the phrases \"Our time has been...\" and \"...and will be again\" scratched into it. The test pressing for this release was done on 12\" vinyl as opposed to the finalized 10\" format.\nQuestion: Death in June had to scrap their single because it was done on 12' vinyl. True, False, or Neither?", "doc_id": 387, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41679, 26235, 16985, 24388], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat.\nQuestion: The Mast\u00edn Espa\u00f1ol needs to eat a lot of food. True, False, or Neither? Neither\n###\nLudovic (Ludo) Coeck (25 September 1955 \u2013 9 October 1985) was a Flemish-Belgian footballer who played as left winger or central midfielder. His clubs included Berchem Sport, Anderlecht, Internazionale and Ascoli Calcio. He was capped for the Belgian national team 46 times.\nQuestion: He was capped for the Belgian national team 48 times.\n True, False, or Neither? False\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer.\nQuestion: Joseph Maurice Ravel ends with a L. True, False, or Neither? True\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani.\nQuestion: There was another movie made by the same name which was a comedy drama years later True, False, or Neither? Neither\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation.\nQuestion: Paul Hausser was one of multiple Waffen-SS members who survived World War II True, False, or Neither?", "doc_id": 825, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26657, 1112, 12895, 10721], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson.\nQuestion: Evel Knievel's museum cost $3.50 to enter. True, False, or Neither? 
Neither\n###\nDiablo is a 2015 Canadian-American psychological western film co-written and directed by Lawrence Roeck and starring Scott Eastwood, Walton Goggins, Camilla Belle and Danny Glover. It was the first Western starring Eastwood, the son of Western icon Clint Eastwood.\nQuestion: Diablo was the first Western that starred Clint Eastwood's nephew Scott. True, False, or Neither? False\n###\nImpatiens sakeriana is a species of flowering plant in the family Balsaminaceae. It is native to Cameroon and Equatorial Guinea. It grows in mountain forest understory habitat at altitudes up to 2000 meters. It occurs on Mount Cameroon. It can be locally common in parts of its range, but its habitat is threatened by agriculture.\nQuestion: Impatiens sakeriana grows in rain forests. True, False, or Neither? False\n###\nZambia Sugar Plc, is a company based in Mazabuka, Southern Province, Zambia and is the largest sugar producer in Zambia. The company is listed on the Lusaka Stock Exchange (symbol: ZSUG) with 82% of the shares held by Illovo Sugar Limited of South Africa (a subsidiary of Associated British Foods) and the balance by institutional and private shareholders in Zambia.\nQuestion: Illovo Sugar Limited of South Africa holds less than 83% of the shares for Zambia Sugar Plc. True, False, or Neither? True\n###\nGeneo Grissom (born June 4, 1992) is an American football defensive end for the New England Patriots. He played college football at Oklahoma. He was drafted by the New England Patriots in the third round with the 97th overall pick of the 2015 NFL Draft.\nQuestion: During the NFL draft in 2015, Geneo was drafted by the Patriots in the third round as the 96th pick and played football in college. True, False, or Neither?", "doc_id": 963, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6062, 24999, 4516, 1152], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Seton Abele (born January 28, 1967) is an American businessman and Democratic Party politician. He is the current Milwaukee County Executive. Abele is the son of American businessman John Abele, the co-founder of Boston Scientific. Abele serves as a trustee of the Argosy Foundation, a charitable trust established with an endowment from his father.\nQuestion: John Abele is the current Milwaukee County Executive. True, False, or Neither? False\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98.\nQuestion: Death Race was released in different countries. True, False, or Neither? Neither\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. 
Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Sushil Kumar Shinde was an amazing vice president of India. True, False, or Neither? Neither\n###\nDuke is a fictional character from the \"\" toyline, comic books, and cartoon series. He is the G.I. Joe Team's First Sergeant, and debuted in 1983. The character is also featured in both the \"\" animated series and comic books. Channing Tatum portrays Duke in the 2009 live-action film, \"\", and the 2013 sequel \"\".\nQuestion: Duke was played by Tatum. True, False, or Neither? True\n###\nJack Frost is the name of two unrelated fictional characters appearing in American comic books published by Marvel Comics. The first Jack Frost was published by Marvel's 1940s forerunner Timely Comics during the period fans and historians call the Golden Age of comic books.\nQuestion: Jack Frost, the first instance, was published by Golden Age Comics during a timely period in the 1940s True, False, or Neither?", "doc_id": 678, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39284, 23315, 5324, 11875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maxillaria rufescens, the Light Fox-red Maxillaria, is a species of orchid native to Trinidad and the Amazon Basin in Colombia, Ecuador, Peru, Bolivia, Venezuela, The Guianas and Brazil. The plant grows at eleveations of 200 to 2000 meters, and grows up to 1 inches (3 to 4 centimeters).\nQuestion: Maxillaria rufescens grow more heavily in Trinidad than in other areas. True, False, or Neither? Neither\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents.\nQuestion: It is located in western pennsylvania True, False, or Neither? Neither\n###\nAlexander Vincent LoScialpo (born April 29, 1981) is an American actor. He is known for his role as Andy Barclay in the \"Child's Play\" franchise. He has appeared in \"Child's Play\" (1988), \"Child's Play 2\" (1990), \"Curse of Chucky\" (2013), and \"Cult of Chucky\" (2017).\nQuestion: Andy Barclay is a character in the \"Child's Play\" movie franchise. True, False, or Neither? True\n###\nThe Mnet Asian Music Award for Best Collaboration is an award presented annually by CJ E&M Pictures (Mnet). It was first awarded at the 12th Mnet Asian Music Awards ceremony held in 2010; singers Ga-in & Jo Kwon won the award for their song \"We Fell in Love\", and it is given in honor for the artists with the most artistic achievement in collaboration performances in the music industry.\nQuestion: The Mnet Asian Music Award is a disliked show in South korea True, False, or Neither? Neither\n###\nThe New York Blade was a free weekly newspaper focusing on lesbian, gay, bisexual and transgender (LGBT) issues in New York City, New York. 
The \"Blade\" was a member of the National Gay Newspaper Guild, and contained news, entertainment, classified ads, and free personals for men and women.\nQuestion: The New York Blade is now paid True, False, or Neither?", "doc_id": 972, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24673, 29955, 43606, 10893], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kalavu (Kannada: \u0c95\u0cb3\u0cb5\u0cc1) is a 2013 Kannada movie based on Dr KY Narayanaswamy's novel of the same title. The movie is the directorial debut of Ravi M who has worked with the production of the Hollywood film Inferno . Two French films, \"Blue Mountains\" and \"Child in Pondicherry\", launched his career in art direction. The film stars Umashree, Kari Subbu, Hulagappa Kattimani and others.\nQuestion: Umashree, Kari Subbu, and Hulagappa Kattimani all played the same character in Kalavu. True, False, or Neither? Neither\n###\nRobert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983.\nQuestion: Robert L. \"Rusty\" White's father was born July 1905 in Newton, Mississippi. True, False, or Neither? Neither\n###\nDie Antwoord (] , Afrikaans for \"The Answer\") is a South African hip hop group formed in Cape Town in 2008. It comprises rappers Ninja and Yolandi Visser and producer God (formerly DJ Hi-Tek). Their image revolves around the South African counterculture movement known as zef and has incorporated work by other artists associated with the movement, such as photographer Roger Ballen.\nQuestion: Die Antwoord is comprised of two men and one woman. True, False, or Neither? True\n###\nRana amurensis (Khabarovsk frog, Siberian wood frog, Heilongjiang brown frog or Amur brown frog) is a species of true frog found in northern Asia. It ranges across western Siberia, as well as northeastern China, northeastern Mongolia, and on the northern Korean Peninsula and on Sakhalin. \"Rana coreana\" was previously included in this species as a subspecies.\nQuestion: Rana amurensis can be found in northern Korean Peninsula. True, False, or Neither? True\n###\nPlainfield South High School, or PSHS, is a four-year public high school located in Joliet, a southwest suburb of Chicago, Illinois, in the United States. It is part of the Plainfield Community Consolidated School District 202, which also includes three other high schools: Plainfield Central High School, Plainfield North High School and Plainfield East High School.\nQuestion: PSHS is in south west of Chicago True, False, or Neither?", "doc_id": 494, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1172, 29394, 2174, 31574], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "There Was a Crooked Man... is a 1970 western starring Kirk Douglas and Henry Fonda and directed by Joseph L. Mankiewicz. This was the only western made by Mankiewicz, director of such notable films as \"All About Eve\", \"Guys and Dolls\" and \"Cleopatra\". It was written by David Newman and Robert Benton, their first script after \"Bonnie and Clyde\".\nQuestion: There Was a Crooked Man was released after the year 1970 ended. True, False, or Neither? False\n###\nCattle Decapitation is an American extreme metal band from San Diego, California, formed in 1996. The band's current line-up includes vocalist Travis Ryan, guitarist Josh Elmore, drummer Dave McGraw, and bassist Derek Engemann. Cattle Decapitation have released seven albums, the most recent being \"The Anthropocene Extinction\" in 2015.\nQuestion: The band has released seven albums. True, False, or Neither? True\n###\nThe Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States.\nQuestion: The Forum Shops at Caesars has the largest gross income. True, False, or Neither? True\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee.\nQuestion: Political satire is a popular film genre in Israel. True, False, or Neither? Neither\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\".\nQuestion: \"Crawling\" was written by Linkin Park for the DLC pack in \"Rock Band 3\". True, False, or Neither?", "doc_id": 12, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31744, 32661, 9504, 43225], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. 
The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time.\nQuestion: Jefferson County was named because Thomas Jefferson was going to be president. True, False, or Neither? Neither\n###\nAndrea von Habsburg (\"Andrea Maria von Habsburg-Lothringen\") Archduchess of Austria, Hereditary Countess of Neipperg, (born 30 May 1953, in W\u00fcrzburg, Bavaria), is the first child and oldest daughter of Otto von Habsburg and his wife Princess Regina of Saxe-Meiningen.\nQuestion: Andrea was born in 1953. True, False, or Neither? True\n###\nThe European Association of Science Editors (EASE ) is a non-profit membership organisation for people interested in science communication and editing. Founded in 1982, in France, EASE now has an international membership from diverse backgrounds, professional experiences, and job titles.\nQuestion: EASE doesn't make a profit. True, False, or Neither? True\n###\nSimon Corbell (born 21 November 1970) is a former Australian politician and Deputy Chief Minister of the Australian Capital Territory. He was also Attorney-General, Minister for Health, Minister for the Environment and Minister for the Capital Metro.\nQuestion: Simon Corbell was born more than 5000 days ago. True, False, or Neither? True\n###\nDame Nicola Mary Brewer DCMG is a British diplomat and university administrator. In May 2014 she was appointed Vice-Provost (International) at University College London. She is a non-executive director of Aggreko. Brewer was British High Commissioner to South Africa from 2009 to 2013.\nQuestion: Brewer's appointment as British High Commissioner to South Africa ended in May 2013. True, False, or Neither?", "doc_id": 744, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32697, 3709, 2670, 14072], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns.\nQuestion: The german 88mm gun was originally designed as an anti-aircraft only weapon. True, False, or Neither? Neither\n###\nAdam Best is a fictional character from the BBC soap opera \"EastEnders\", played by David Proud, the first adult actor with a visible disability to appear regularly in the soap. Both Proud and his character live with spina bifida. The character made his first appearance in the episode broadcast on 10 September 2009 and his last in the one broadcast on 19 July 2010.\nQuestion: Adam Best had a visible disability True, False, or Neither? True\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. 
The actor behind the digital Tench was Australian actor Drew Forsythe.\nQuestion: David Tench provided the voice of Drew Forsythe True, False, or Neither? False\n###\nBalaji K. Kumar is a Film Director who entered Tamil cinema as a director with the 2013 thriller film \"Vidiyum Munn\" which released on 29 November 2013 and received positive reviews from critics. Then started his career as story board artist for advertising firms like Ogilvy & Mather, JWT, Saatchi & Saatchi.\nQuestion: Balaji K. Kumar sold a billion tickets. True, False, or Neither? Neither\n###\nAlong With The Gods \u2013 Part 1 () is an upcoming South Korean fantasy drama film based on a webcomic of the same name. The film will be released in two parts, and stars Ha Jung-woo, Cha Tae-hyun, Ju Ji-hoon, Lee Jung-jae, Do Kyung-soo and Kim Hyang-gi. The first part of the film will be released on December 20, 2017.\nQuestion: Along with the Gods is a multi film franchise True, False, or Neither?", "doc_id": 828, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16883, 20299, 1322, 15975], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\".\nQuestion: Aodh Mac Cathmhaoil has Irish ancestry True, False, or Neither? True\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style.\nQuestion: Real Fuerza A\u00e9rea won all of their wrestling matches. True, False, or Neither? Neither\n###\nLibya TV (also known as Libya Al Ahrar TV) is a Libyan TV channel broadcast by satellite from its headquarters in Doha. The channel was created in 2011 during the Libyan Civil War. Its presents news, opinions, analysis, photo and video reports about Libya in specific and the region in a wider scope. It focuses on Libya\u2019s revolution and future toward building a democratic state.\nQuestion: Libya TV was created in the 20th century. True, False, or Neither? False\n###\nIreland ( ; Irish: \"\u00c9ire\" ] ; Ulster-Scots: \"Airlann\" ] ) is an island in the North Atlantic. It is separated from Great Britain to its east by the North Channel, the Irish Sea, and St George's Channel. Ireland is the second-largest island of the British Isles, the third-largest in Europe, and the twentieth-largest on Earth.\nQuestion: There are two islands larger than Ireland in Europe True, False, or Neither? 
True\n###\nWang Tieguan ()is a Chinese noted geologist,Academician of the Chinese Academy of Sciences,Professor of China University of Petroleum,PhD Tutor born in December,1937,born in Shanghai City,People's Republic of China in December,1937, graduated from Beijing Petroleum Geology School(predecessor of Yangtze University)in 1956 and from Beijing Petroleum Institute in 1965.\nQuestion: Wang Tieguan never lived in China after 1966. True, False, or Neither?", "doc_id": 491, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26692, 21054, 28616, 29103], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson.\nQuestion: Lathan McKay lives in Minnesota. True, False, or Neither? Neither\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old.\nQuestion: John Cameron Urschel (born June 24, 1991) is a Canadian historian and retired professional American football guard and center. True, False, or Neither? False\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro.\nQuestion: Salavatore Rosa taught many students how to paint. True, False, or Neither? Neither\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Nathan MacKinnon has been in a draft before. True, False, or Neither? True\n###\nDarrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include:\nQuestion: Abbott had 39 years of life before passing on True, False, or Neither?", "doc_id": 551, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1957, 23939, 5074, 6572], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Game Plan is a 2007 American family comedy film directed by Andy Fickman and written by Nichole Millard, Kathryn Price and Audrey Wells and starring Dwayne \"The Rock\" Johnson (marking the last film in which Johnson uses his ring name \"The Rock\" in billing). It follows an NFL quarterback who finds out he has an 8-year-old daughter from a previous relationship.\nQuestion: Kids can watch this movie. True, False, or Neither? True\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005.\nQuestion: Samuel Eto'o Fils was born less than 1981 days ago. True, False, or Neither? False\n###\nMission: Impossible III \u2013 Music from the Original Motion Picture Soundtrack is a soundtrack album for the 2006 film \"\", composed by Michael Giacchino. Unlike the previous two films in the series, there was no album released containing the film's contemporary music.\nQuestion: Mission: Impossible III was released in the 2000s True, False, or Neither? True\n###\nAmethyst: Princess of Gemworld is a comic book series published by DC Comics in the 1980s. The series tells the story of a teenage girl named Amy Winston who discovers that she is the orphaned princess of the magical Gemworld. Amy learns that an evil ruler called Dark Opal is out to destroy her and travels to Gemworld to overthrow him.\nQuestion: Amethyst: Princess of Gemworld was inspired by a real teen girl named Amy who had delusions and was fighting cancer. True, False, or Neither? Neither\n###\nCommunal riots occurred in Bihar from 24 October to 11 November 1946, in which Hindu mobs targeted Muslim families. The riots were triggered by the Great Calcutta Killings, as well as the Noakhali riots earlier that year. Mahatma Gandhi declared that he would fast unto death if the riots did not stop. The riots were part of a sequence of communal violence that culminated in the partition of India.\nQuestion: Communal riots occurred in Bihar were due to food price escallation True, False, or Neither?", "doc_id": 308, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3294, 34278, 29604, 8483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Diablo is a 2015 Canadian-American psychological western film co-written and directed by Lawrence Roeck and starring Scott Eastwood, Walton Goggins, Camilla Belle and Danny Glover. It was the first Western starring Eastwood, the son of Western icon Clint Eastwood.\nQuestion: diablo is a western film very similar to a clint eastwood movie True, False, or Neither? 
Neither\n###\nLeonard Pilkington (1527\u20131599) was an English academic and clergyman. A Marian exile, he became Regius Professor of Divinity at Cambridge and Master of St John's College, Cambridge at the start of the reign of Elizabeth I. In his subsequent church career, he followed the way opened when his brother James Pilkington became Bishop of Durham.\nQuestion: Leonard Pilkington was an American clergyman True, False, or Neither? False\n###\nClarendon is an urbanized, upper-class neighborhood in Arlington County, Virginia, located between the Rosslyn area and the Ballston area. It was named after Edward Hyde, 1st Earl of Clarendon, a leading statesman and historian of the English Civil War. The main thoroughfares are Wilson Boulevard (one-way westbound) and Clarendon Boulevard (one-way eastbound).\nQuestion: Clarendon has houses. True, False, or Neither? True\n###\nDr. Edward Vivian Scobie (1918 \u2013 14 November 1996) was a Dominican-born journalist, magazine publisher and historian. He is best known for his research into the black history of Western Europe and his 1972 seminal book \"Black Britannia: A History of Blacks in Britain\".\nQuestion: Dr. Scobie published a magazine in 1972. True, False, or Neither? Neither\n###\nFinsbury Park TMD was a railway Traction Maintenance Depot situated in London, England. It was the first purpose built main line diesel locomotive depot opened in England and it was fully commissioned in April 1960. Finsbury Park was a steam shed under British Railways with the depot code 34G; the depot code of the diesel depot under BR was FP. The nearest railway station is Finsbury Park.\nQuestion: Other first purpose main line diesel locomotive depots were built after Finsbury Park. True, False, or Neither?", "doc_id": 947, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43546, 45290, 10102, 28092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "St Kilda is an inner suburb (neighbourhood) of the metropolitan area of Melbourne, Victoria, Australia, 6 km south-east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, St Kilda had a population of 17,795.\nQuestion: The city of Port Phillip is part of the St Kilda neighbourhood. True, False, or Neither? True\n###\nThe Puerto Rico Baseball Academy and High School (PRBAHS) is a non-profit organization combining academics and sports programs into one curriculum. Its goal is to prepare its students for higher education, competitive college scholarship opportunities, and the Major League Baseball Draft. The PRBAHS is the only high school in Puerto Rico or the United States with this type of learning environment.\nQuestion: The Puerto Rico Baseball Academy and High School starts with a B. True, False, or Neither? False\n###\nStraumfj\u00f6r\u00f0 Icelandic), or Straumfj\u01ebr\u00f0 (Old Norse) sometimes anglicised to Straumsfjordr, Straumfjordr, Straumsfjord or Straumfjord, is according to the Sagas of Icelanders a fjord in Vinland where Thorfinn Karlsefni set up a temporary settlement. 
It is described in the \"Saga of Erik the Red\", but not in the \"Greenland saga\". Its name translates to \"Current-fjord\", \"Stream-fjord\" or \"Tide-fjord\".\nQuestion: Straumfj\u00f6r\u00f0 Icelandic is the greenland saga from erik the red, created by Thorfinn Karlsefni True, False, or Neither? False\n###\nThe Asian Institute is a research centre at the Munk School of Global Affairs at the University of Toronto, and is located in the historical Devonshire House, a former residential hall of the university's Trinity College. Ritu Birla is the Richard Charles Lee Director of the Asian Institute.\nQuestion: The Asian Institute at the University of Toronto is located in a newly built Devonshire House, but has never been affiliated with the university. True, False, or Neither? False\n###\n\"Cover on My Heart\" is a pop ballad performed by Guy Sebastian and is the third single from his third album \"Closer to the Sun\". Sebastian announced that this song was the album's third single in April 2007. The single was released on 28 July 2007 in Australia, set by his record label Sony BMG Australia. Sebastian performed the song on various programmes such as \"Sunrise\" and \"Rove Live\".\nQuestion: Guy Sebastian set the release date for his single Cover on My Heart. True, False, or Neither?", "doc_id": 842, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7064, 32436, 1677, 8125], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Altamonte Springs is a suburban city in Seminole County, Florida, United States, which had a population of 41,496 at the 2010 census. The city is in the northern suburbs of the Orlando\u2013Kissimmee\u2013Sanford Metropolitan Statistical Area, which the United States Census Bureau estimated had a population of 2,054,574 in 2008.\nQuestion: Altamonte Springs, a suburban city in Florida, will show a population increase in the 2020 census. True, False, or Neither? Neither\n###\nPrincess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer.\nQuestion: Maria's father often took her hunting True, False, or Neither? Neither\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee\nQuestion: Mystery was published in 1990. True, False, or Neither? True\n###\nThe Washington Nationals are a professional baseball team that has been based in Washington, D.C. since . The Nationals are a member of both the Major League Baseball's (MLB) National League Eastern Division and the National League (NL) itself. 
Since the 2008 season, the Nationals have played in Nationals Park; from 2005 through , the team played in Robert F. Kennedy Memorial Stadium.\nQuestion: The Washington Nationals are amateurs. True, False, or Neither? False\n###\nSemonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781.\nQuestion: The population in 2006 was 7,780 plus one. True, False, or Neither?", "doc_id": 141, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20868, 18068, 14997, 32651], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel.\nQuestion: bete noire was the last album ferry made True, False, or Neither? Neither\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state.\nQuestion: Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, US. The plant is on the Endangered species list and is the rarest known plant in Washington state. True, False, or Neither? True\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress.\nQuestion: The President of the Tibetan Youth Congress is Tsewang Rigzin. True, False, or Neither? True\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation.\nQuestion: The 89th Medium Tank Battalion was a stand alone unit True, False, or Neither? False\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. 
Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant.\nQuestion: Neelix Is a character played by \"\". True, False, or Neither?", "doc_id": 142, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19749, 19661, 30919, 30134], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Franco Mari (Born 23 January 1947) is an Italian actor and comedian. Better known as Rupert Sciamenna, his best known character, he is famous for his participation in television programs such as Mai dire... on Italia 1 in many sketches with Marcello Macchia.\nQuestion: Franco Mari is a thin man True, False, or Neither? Neither\n###\nLive in Concert is the second live music video title by singer and actress Cher. Released by HBO in 1999, it contained footage from Cher's Do You Believe? Tour specials filmed at the MGM Grand Garden Arena in Paradise, Nevada in 1999. It featured tracks from the Gypsys, Tramps & Thieves album to the Believe album, alongside various covers. She has 7 costume changes by stylist Bob Mackie.\nQuestion: The DVD came out after Y2K. True, False, or Neither? False\n###\nKJEF-CA, channel 13, was a class A television station in Jennings, Louisiana. Owned by Townsquare Media, the station was an independent station. It was the only television station owned by Townsquare, a company that otherwise specializes exclusively in radio.\nQuestion: KJEF-CA had 500 employees True, False, or Neither? Neither\n###\nKathleen Delaney is an American actress, voice actress, singer, and dancer who works on Broadway and on the properties of 4Kids Entertainment. She is best known as the voice of Hina in the 4Kids dub of One Piece, Mai Valentine in uncut versions of \"Yu-Gi-Oh!\" and Rouge in \"Sonic X\" and the succeeding games until 2010, when she was replaced by Karen Strassman.\nQuestion: Delaney voiced Rouge in Sonic video games until the middle of Obama's second term as president of the U.S. True, False, or Neither? False\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks.\nQuestion: Cocaine is a globally used drug True, False, or Neither?", "doc_id": 392, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40327, 34873, 2689, 34947], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mick Napier (born December 12, 1962) is an American director, actor, teacher and author living in Chicago. He is the founder and artistic director of the Annoyance Theatre and an award-winning director at The Second City. He has directed Stephen Colbert, Tina Fey, Rachel Dratch, Horatio Sanz, Nia Vardalos, Andy Richter, Jeff Garlin, and David Sedaris, amongst others.\nQuestion: Mick Napier directed Stephen Colbert, Tina Fey, and Donald Trump True, False, or Neither? Neither\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex.\nQuestion: Muccan Station is a rap musician True, False, or Neither? False\n###\nLove's Labour's Won is a lost play attributed by contemporaries to William Shakespeare, written before 1598 and published by 1603, though no copies are known to have survived. Scholars dispute whether it is a true lost work, possibly a sequel to \"Love's Labour's Lost\", or an alternative title to a known Shakespeare play.\nQuestion: \"Love's Labour's Won\" was misplaced by Shakespeare. True, False, or Neither? Neither\n###\nCentral Mountain Air Ltd. is a Canadian regional airline based in Smithers, British Columbia. It operates scheduled and charter services and transborder services. Its main base is Smithers Airport, with other bases at Calgary International Airport, Vancouver International Airport and Prince George Airport.\nQuestion: British Columbia contains 3 regional airlines. True, False, or Neither? Neither\n###\nThe 1968 Senior League World Series took place from August 13\u201318 in Gary, Indiana, United States. New Hyde Park, New York defeated West Tampa, Florida in the championship game. It was the third straight title for New York. This was the first SLWS held in Gary.\nQuestion: 1968 was the first time that New York and Tampa competed. True, False, or Neither?", "doc_id": 905, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27897, 33512, 28977, 24105], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Guns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin).\nQuestion: Guns of Diablo came out in the 21st century True, False, or Neither? 
False\n###\n\"Eternally\" is a song with music by Charles Chaplin, and words by the English lyricists Geoff Parsons and John Turner. The music was initially composed for Charles Chaplin's film \"Limelight\" (1952) titled \"Terry's Theme\"; the film won an Oscar for \"Best Original Dramatic Score\" at the\nQuestion: Eternally will be featured in other films. True, False, or Neither? Neither\n###\nAldo Fabrizi (] ; 1 November 1905 \u2013 2 April 1990 in Rome, Italy) was an Italian actor, director, screenwriter and comedian, probably best known for the role of the heroic priest in Roberto Rossellini's \"Rome, Open City\" and as partner of Tot\u00f2 in a number of successful comedies.\nQuestion: Aldo Fabrizi was a director before he was a comedian True, False, or Neither? Neither\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: \"I'm So Sorry\" was recorded in 2017. True, False, or Neither? Neither\n###\nThe Gospel According to the Other Mary is an opera/oratorio by contemporary American composer John Adams. The world premiere took place on May 31, 2012, at the Walt Disney Concert Hall in Los Angeles with Gustavo Dudamel conducting the Los Angeles Philharmonic who also premiered the staged version on March 7, 2013, at the same venue.\nQuestion: John Adams did not compose more operas. True, False, or Neither?", "doc_id": 78, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27106, 34811, 43830, 30849], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The X-Files Game is an interactive movie point-and-click adventure video game developed by HyperBole Studios and first published by Fox Interactive. The game was released for Microsoft Windows, Mac OS and PlayStation in 1998, and is based on the television series \"The X-Files\". A second, but unrelated game, \"\", was released for PlayStation 2 in 2004.\nQuestion: The second game is technically superior to the first game. True, False, or Neither? Neither\n###\nThe Sisters of Mercy are an English gothic rock band, formed in 1980 in Leeds, United Kingdom (UK). After achieving early underground fame there, the band had their commercial breakthrough in mid-1980s and sustained it until the early 1990s, when they stopped releasing new recorded output in protest against their record company Time Warner. Currently, the band is a touring outfit only.\nQuestion: Sisters of Mercy formed before 1990. True, False, or Neither? True\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. 
Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: JC Aragone plays tennis for the USA. True, False, or Neither? True\n###\nExergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid.\nQuestion: Kokam America, Inc. will open a store in Texas the next fiscal year. True, False, or Neither? Neither\n###\nJay Kahn is a Democratic member of the New Hampshire Senate representing the 10th district. The 10 district is located in the southwestern corner of the state and includes Alstead, Chesterfield, Gilsum, Harrisville, Hinsdale, Keene, Marlborough, Roxbury, Sullivan, Surry, Swanzey, Walpole, Westmoreland and Winchester, New Hampshire.\nQuestion: Jay Kahn has lived in Florida. True, False, or Neither?", "doc_id": 119, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [988, 26897, 42335, 38263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Homicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run.\nQuestion: Jean de Segonzac worked with Eric Overmyer on Homicide: The Movie. True, False, or Neither? True\n###\nMiss Peregrine's Home for Peculiar Children is a contemporary fantasy debut novel by American author Ransom Riggs. The story is told through a combination of narrative and vernacular photographs from the personal archives of collectors listed by the author.\nQuestion: Riggs has written a fantasy novel. True, False, or Neither? True\n###\nOn July 16, 2009, Harvard University professor Henry Louis Gates Jr. was arrested at his Cambridge, Massachusetts home by local police officer Sgt. James Crowley, who was responding to a 9-1-1 caller's report of men breaking and entering the residence. The arrest initiated a series of events that unfolded under the spotlight of the international news media.\nQuestion: This happened in 2009 True, False, or Neither? True\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries.\nQuestion: The Ballon d'Or is judged by the President of the United States True, False, or Neither? 
False\n###\nJohn Michael Stipe (born January 4, 1960) is an American singer, songwriter, musician, film producer, music video director, visual artist, and philanthropist. He is best known as the lead singer of the alternative rock band R.E.M. from their formation in 1980 until their dissolution in 2011.\nQuestion: John Michael Stipe is known for giving away money. True, False, or Neither?", "doc_id": 496, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44684, 1868, 19400, 17113], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alias is a fictional character in the \"Forgotten Realms\" campaign setting for the \"Dungeons & Dragons\" fantasy role-playing game. Alias is the main character of \"Azure Bonds\". She also appeared in the computer game, Curse of the Azure Bonds which was based on the book. Alias later appears in the sequel \"Song of the Saurials\", and the standalone book \"Masquerades\".\nQuestion: Alias is an actress that has featured in \"Forgotten Realms\" campaign setting for the \"Dungeons & Dragons True, False, or Neither? False\n###\nThe Little League World Series took place between August 22 and August 27 in Williamsport, Pennsylvania. Westbury American Little League of Houston, Texas defeated American Little League of West New York, New Jersey in the championship game of the 20th Little League World Series.\nQuestion: Westbury American Little League of Houston, Texas lost the championship game. True, False, or Neither? False\n###\nAlex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\".\nQuestion: Alex Rider is a title character and the protagonist of the unpopular \"Alex Rider\" novel series by British author Anthony Horowitz. True, False, or Neither? True\n###\nBruno Mingeon (born September 7, 1967 in Bourg-Saint-Maurice, Savoie) is a French bobsledder who competed from 1988 to 2006. Competing in five Winter Olympics, he won a bronze medal in the four-man event (tied with Great Britain) at Nagano in 1998. He was born in Bourg-Saint-Maurice.\nQuestion: Bruno Mingeon was a popular bobsledder True, False, or Neither? Neither\n###\nThe Sierra Leone Civil War (1991\u20132002) began on 23 March 1991 when the Revolutionary United Front (RUF), with support from the special forces of Charles Taylor\u2019s National Patriotic Front of Liberia (NPFL), intervened in Sierra Leone in an attempt to overthrow the Joseph Momoh government. The resulting civil war lasted 11 years, enveloped the country, and left over 50,000 dead.\nQuestion: The Sierra Leone Civil War ended in March 2002. True, False, or Neither?", "doc_id": 75, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32912, 30207, 15877, 29875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Vorontsov Lighthouse (Ukrainian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u044c\u043a\u0438\u0439 \u043c\u0430\u044f\u043a , Russian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u043a\u0438\u0439 \u043c\u0430\u044f\u043a ) is a famous red-and-white, 27.2 metre landmark in the Black Sea port of Odessa, Ukraine. It is named after Prince Mikhail Semyonovich Vorontsov, one of the governors-general of the Odessa region.\nQuestion: Prince Mikhail Semyonovich Vorontsov was around 2 meters tall. True, False, or Neither? Neither\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life.\nQuestion: Health For All has been used by zack. True, False, or Neither? Neither\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: Zuikaku was the largest aircraft carrier in the Japanese fleet. True, False, or Neither? Neither\n###\nWilliam V. Bidwill Sr. (born July 31, 1931) is the principal owner and chairman of the board of the Arizona Cardinals of the National Football League (NFL). He was co-owner from 1962 for ten seasons with his brother Charles Jr. and has been sole owner since 1972.\nQuestion: The Arizona Cardinals existed in 1962. True, False, or Neither? True\n###\nThe 2015 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 30th edition of the ASB Classic, and part of the WTA International tournaments category of the 2015 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 5 to 10 January 2015.\nQuestion: There were 30 edition of the ASB Classic played before the 2015 ASB Classic True, False, or Neither?", "doc_id": 967, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27550, 1165, 27262, 34774], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The J.J. Deal and Son Carriage Factory was the largest factory built in Jonesville, Michigan. It is the only 19th century factory remaining in the City. It is located at 117 West Street. On August 1, 2012, the building was added to the National Register of Historic Places.\nQuestion: Michigan only has 3 remaining 19th century factories in total. True, False, or Neither? 
Neither\n###\nPacific Novelty was a developer of coin-operated arcade video games. \"Deep Death\" was their first title, which was later licensed by Game Plan and re-released as \"Shark attack\" (1981). \"Thief\", a \"Pac-Man\" styled maze chase, was their greatest success.\nQuestion: deep death is an arcade video game True, False, or Neither? True\n###\nMarion Anna Fischer (born July 18, 1986 in East Berlin) is a German actress and singer. Since 2003, she appeared in over 30 film and television roles in appearance. She is most recognised to international audiences as the innocent vampire \"Nora\" in Dennis Gansel's drama film \"We Are The Night\"\nQuestion: \"Nora\" used to drink rat's blood instead. True, False, or Neither? Neither\n###\nWood River is a provincial electoral district for the Legislative Assembly of Saskatchewan, Canada. Located in southern Saskatchewan, the district was created by the \"Representation Act, 1994\" (Saskatchewan) out of the former constituency of Assiniboia-Gravelbourg and half of the Shaunavon district.\nQuestion: Wood River is in southern Saskatchewan. True, False, or Neither? True\n###\nKaalamellam Kaathiruppen (Tamil: \u0b95\u0bbe\u0bb2\u0bae\u0bc6\u0bb2\u0bcd\u0bb2\u0bbe\u0bae\u0bcd \u0b95\u0bbe\u0ba4\u0bcd\u0ba4\u0bbf\u0bb0\u0bc1\u0baa\u0bcd\u0baa\u0bc7\u0ba9\u0bcd ; English: I Will Wait Forever ) is 1997 Tamil romance film directed by R. Sundarrajan. The film stars Vijay and Dimple in the lead roles, while R. Sundarrajan, Jaishankar, Srividya, Karan, Manivannan play other pivotal roles. The music for the film was composed by Deva and the film released on 14 January 1997.\nQuestion: Kaalamellam Kaathiruppen was finished in 1997 True, False, or Neither?", "doc_id": 425, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33635, 38931, 20724, 5823], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator.\nQuestion: Fraser Wishart is a daily radio and television commentator. True, False, or Neither? False\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The Arkansas Razorbacks wore red and yellow as their team uniform. True, False, or Neither? Neither\n###\nJerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. 
He played college football at Georgia Southern.\nQuestion: Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He played college football at Georgia Southern where hes also met his future wife. True, False, or Neither? Neither\n###\nJusticia umbrosa (Brazilian plume, yellow jacobinia; syn. \"Adhatoda umbrosa\" Ness, and \"Justicia aurea\" Schltdl.) is an ornamental shrub native of Cerrado vegetation of Brazil. This plant may be propagated by herbaceous stem cutting, and it can usually get to 1,50 - 2,50 m tall. They flourish in the shade, and will not do well if overwatered.\nQuestion: Justicia umbrosa cannot grow in shady landscapes True, False, or Neither? False\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex.\nQuestion: Croton lechleri is a dull looking red latex type plant. True, False, or Neither?", "doc_id": 34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31312, 40459, 16998, 8446], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Linkou Power Plant () is a coal-fired power plant in Linkou District, New Taipei, Taiwan. With the previous total installed capacity of 600 MW, the power plant used to be the smallest coal-fired power plant in Taiwan. The power plant is currently undergoing retrofitting to increase its installed generation capacity to 2.4 GW.\nQuestion: The government wanted to close the plant. True, False, or Neither? Neither\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910.\nQuestion: Yi Bangja and Crown Prince Euimin were born in 1901. True, False, or Neither? True\n###\nThe Valley of Fire Road (also called the Valley of Fire Highway) is a road in northeastern Clark County, Nevada serving the Valley of Fire State Park. The roadway was previously designated State Route 40 (SR 40), and the segment within the state park is currently designated a Nevada Scenic Byway.\nQuestion: The Valley of Fire Road is located in idaho True, False, or Neither? False\n###\nNew Hampshire Route 120 is a 26.928 mi secondary north\u2013south state highway in Sullivan and Grafton counties in the upper Connecticut River Valley region of New Hampshire. Its southern terminus is at New Hampshire Route 11 and New Hampshire Route 103 in Claremont. Its northern terminus is at New Hampshire Route 10 in Hanover.\nQuestion: New Hampshire Route 120 is a 26.918 mi secondary north\u2013south state highway True, False, or Neither? 
False\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. Gene Mayer won the singles title.\nQuestion: Gene Mayer doesn't compete in doubles tennis. True, False, or Neither?", "doc_id": 350, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31765, 7356, 35414, 13825], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Aster\" (M915) is a Tripartite-class minehunter of the Belgian Naval Component, launched on 16 December 1985 at the Mercantile-Belyard shipyard in Rupelmonde and christened by Queen Paola of Belgium. The patronage of \"Aster\" was accepted by the city of Blankenberge. \"Aster\" was the first of the Belgian Tripartite-class minehunters.\nQuestion: People did not want it to launch True, False, or Neither? Neither\n###\nOgallala is a city in Keith County, Nebraska, United States. The population was 4,737 at the 2010 census. It is the county seat of Keith County. In the days of the Nebraska Territory, the city was a stop on the Pony Express and later along the transcontinental railroad. The Ogallala Aquifer was named after the city.\nQuestion: The county seat of Keith County does not refer to a chair. True, False, or Neither? True\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: Lloyd Cole is a second album True, False, or Neither? False\n###\nTony Rena Snell Jr. (born November 10, 1991) is an American professional basketball player for the Milwaukee Bucks of the National Basketball Association (NBA). Snell played college basketball for the New Mexico Lobos before declaring for the NBA draft after his junior year. He was drafted with the 20th overall pick in 2013 NBA draft by the Chicago Bulls.\nQuestion: Tony Rena Snell Jr. has a R. True, False, or Neither? True\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc.\nQuestion: Ranila has an A. True, False, or Neither?", "doc_id": 446, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39231, 12511, 4687, 28286], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione.\nQuestion: The Flirts split up in 1990. True, False, or Neither? Neither\n###\n\"Look at Me (When I Rock Wichoo)\" is a song by American indie rock band Black Kids, taken from their debut album \"Partie Traumatic\". It was released in the UK by Almost Gold Recordings on September 8, 2008 and debuted on the Top 200 UK Singles Chart at number 175.\nQuestion: Look at Me is a song by the Beatles True, False, or Neither? False\n###\nBrandon Hughes (born September 25, 1980), better known by his stage name 6 Tre G is an American hip hop recording artist, record producer, and CEO from Fayette, Alabama. He is also the founder and CEO of Mazerati Records. 6 Tre G has released many studio albums Don Mazerati, Boss Muzik, El Trapo and many more.\nQuestion: Brandon hughes is an american actor True, False, or Neither? False\n###\nMarry Him If You Dare (; lit. Mirae's Choice or Future's Choice) is a 2013 South Korean television series starring Yoon Eun-hye, Lee Dong-gun, Jung Yong-hwa, Han Chae-ah, and Choi Myung-gil. It aired on KBS2 from October 14 to December 3, 2013 on Mondays and Tuesdays at 22:00 for 16 episodes.\nQuestion: Marry Him If You Dare (; lit. Mirae's Choice or Future's Choice) is a 2013 South Korean television series starring Yoon Eun-hye, Lee Dong-gun, Jung Yong-hwa, Han Chae-ah, and Choi Myung-gil had more than 14 episodes. True, False, or Neither? True\n###\nJaron Long (born August 28, 1991) is an American professional baseball pitcher who is with the Washington Nationals organization. Prior to playing professionally, Long played college baseball for Chandler-Gilbert Community College and Ohio State University. His father, Kevin Long, is the current hitting coach of the New York Mets and former hitting coach of the New York Yankees.\nQuestion: Jaron Long's father has only worked for NY teams True, False, or Neither?", "doc_id": 224, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44258, 25691, 8714, 1415], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Derailed is a 2005 American crime thriller film based on the novel of the same name by James Siegel. The film is directed by Mikael H\u00e5fstr\u00f6m and stars Clive Owen, Jennifer Aniston, Vincent Cassel, Giancarlo Esposito, David Morrissey, RZA and Xzibit. This was also the first film to be released by The Weinstein Company in the United States. 
The film is set in Chicago.\nQuestion: The novel is set in Chicago. True, False, or Neither? Neither\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups.\nQuestion: Corn smut is very expensive to make True, False, or Neither? Neither\n###\nA Lady's Morals is a 1930 American Pre-Code film offering a highly fictionalized account of singer Jenny Lind. The movie features opera diva Grace Moore as Lind, Reginald Denny as a lover, and Wallace Beery as P. T. Barnum; Beery would play Barnum again four years later in \"The Mighty Barnum\". The film contains some fine opera arias by Moore and was directed by Sidney Franklin.\nQuestion: The story of Lind was fabricated to a large extent in this production. True, False, or Neither? True\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006.\nQuestion: \"Live Free or Die\" was the best rated episode of the HBO original series \"The Sopranos\". True, False, or Neither? Neither\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: 10 September 2016 has a z. True, False, or Neither?", "doc_id": 515, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37710, 32664, 8035, 31652], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament.\nQuestion: The 2011 Atlantic Sun Conference Baseball Tournament is watched mainly by seniors True, False, or Neither? Neither\n###\nAndrea von Habsburg (\"Andrea Maria von Habsburg-Lothringen\") Archduchess of Austria, Hereditary Countess of Neipperg, (born 30 May 1953, in W\u00fcrzburg, Bavaria), is the first child and oldest daughter of Otto von Habsburg and his wife Princess Regina of Saxe-Meiningen.\nQuestion: Andrea was born in Bavaria. True, False, or Neither? True\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. 
It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Stillwater Cove Regional Park is in CA True, False, or Neither? True\n###\nVivekananda Degree College is the only Degree college in Ichoda Mandal which is established in 2006 and is affiliated to Kakatiya University of Telangana, India. The college has its campus at Ichoda, Adilabad. The college runs degree courses in Computer Science, Arts, Science, Commerce and Management.\nQuestion: Vivekananda Degree College offers both bachelors and masters degrees. True, False, or Neither? Neither\n###\n\"Cover on My Heart\" is a pop ballad performed by Guy Sebastian and is the third single from his third album \"Closer to the Sun\". Sebastian announced that this song was the album's third single in April 2007. The single was released on 28 July 2007 in Australia, set by his record label Sony BMG Australia. Sebastian performed the song on various programmes such as \"Sunrise\" and \"Rove Live\".\nQuestion: Guy performed the song many times on tv True, False, or Neither?", "doc_id": 854, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12473, 30527, 39547, 19359], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Battle of Rio de Janeiro was a battle in 1558 on the French town at Rio de Janeiro, called Henriville. The Portuguese, though in far smaller numbers, defeated the French and made them flee to the jungle. The French town was then burnt by Mem de S\u00e1, the Portuguese governor.\nQuestion: the Portuguese governor went on to become a very important federal government official True, False, or Neither? Neither\n###\nArtur Edler von Mecenseffy (23 June 1865, Vienna \u2014 6 October 1917, Asiago) was an Austro-Hungarian Army officer who held the rank of \"Feldmarschall-leutnant\" (\"lieutenant field marshal\") and served during World War I, becoming the highest ranking officer of Austria-Hungary to be killed on the battlefield.\nQuestion: Artur died in Asiago. True, False, or Neither? True\n###\nThe Boulton Paul Balliol and Sea Balliol were monoplane military advanced trainer aircraft built for the Royal Air Force (RAF) and the Royal Navy Fleet Air Arm (FAA) by Boulton Paul Aircraft. Developed in the late 1940s the Balliol was designed to replace the North American Harvard trainer and used the Rolls-Royce Merlin engine, with the Sea Balliol a naval version for deck landing training.\nQuestion: The Boulton Paul Balliol and Sea Balliol took a lot of money to maintain True, False, or Neither? Neither\n###\nThe Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. 
The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships.\nQuestion: The United States Bowling Congress Open Championships were held in Nevada a little bit more than a decade ago True, False, or Neither? True\n###\nKenneth \"Ken\" Gibson was a Northern Irish politician who was the Chairman of the Volunteer Political Party (VPP), which he had helped to form in 1974. He also served as a spokesman and Chief of Staff of the loyalist paramilitary organisation, the Ulster Volunteer Force (UVF).\nQuestion: ken gibson was the founder of vpp in 1974 and was also the president of the loyalist paramilitary organisation uvf True, False, or Neither?", "doc_id": 692, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4244, 36231, 13494, 13330], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added.\nQuestion: Stereolab has released at least three albums. True, False, or Neither? True\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Tinker Field no longer exists True, False, or Neither? True\n###\nThe Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title.\nQuestion: Grand Prix tennis circuit is also now defunct True, False, or Neither? Neither\n###\nBe Mine Tonight was the debut single from New Zealand band Th' Dudes. It was released in May 1979 as a Double A-side with Walking In Light and reached No. 36 on the New Zealand music charts. Be Mine Tonight won Single of the Year at the 1979 New Zealand Music Awards. It was voted 27th best New Zealand Song of the 20th Century by APRA members and featured on the Nature's Best CD.\nQuestion: The B side of the debut single by Th'Dudes was a hit. True, False, or Neither? Neither\n###\nDiaspora studies is an academic field established in the late 20th century to study dispersed ethnic populations, which are often termed diaspora peoples. The usage of the term diaspora carries the connotation of forced resettlement, due to expulsion, coercion, slavery, racism, or war, especially nationalist conflicts.\nQuestion: Diaspora studies cannot be taught in college. 
True, False, or Neither?", "doc_id": 406, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7548, 15536, 4700, 15518], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\".\nQuestion: Abbas-Mustan received the Filmfare Best Villain Award True, False, or Neither? False\n###\nThis is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists.\nQuestion: Wikipedia articles exist for most of the list. True, False, or Neither? Neither\n###\nUni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats.\nQuestion: Jos\u00e9 Mangri\u00f1\u00e1n has a capacity of 3000 seats. True, False, or Neither? False\n###\nJango is a crime-comedy series produced in 1961 by Associated Rediffusion for British television. It starred Robert Urquhart in the lead role of Jango Smith, with Moira Redmond as Dee Smith, his wife. The show also featured performances by Peter Sallis and Brian Wilde.\nQuestion: Jango had a black cast True, False, or Neither? Neither\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician.\nQuestion: The song \"Amor a la Mexicana\" was written by Mexican singer Thalia. True, False, or Neither?", "doc_id": 408, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21072, 10503, 11958, 8228], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. 
He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\".\nQuestion: James Wyatt is not currently a Methodist minister. True, False, or Neither? True\n###\nGrantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station.\nQuestion: Moto is the operator of service areas in England. True, False, or Neither? True\n###\nDavid Krakauer (born September 22, 1956) is an American clarinetist raised and based in New York, NY. He is known for his work in klezmer music as well as classical music and avant-garde improvisation. He is also considered an accomplished jazz player.\nQuestion: David Krakauer has an E. True, False, or Neither? True\n###\nCross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart.\nQuestion: Korean boy band member Sangmin contributed to the album Timeless: Begins. True, False, or Neither? True\n###\nSantos \"Sandy\" Alomar Vel\u00e1zquez Jr. (] , ; born June 18, 1966) is a professional baseball catcher, coach, and manager. He played in Major League Baseball catcher for the San Diego Padres, Cleveland Indians, Chicago White Sox, Colorado Rockies, Texas Rangers, Los Angeles Dodgers, and New York Mets between 1988 and 2007.\nQuestion: Santos \"Sandy\" Alomar Vel\u00e1zquez Jr. never sweated. True, False, or Neither?", "doc_id": 696, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34703, 16743, 17458, 17233], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "SeaWorld Ohio was a park in the SeaWorld chain of marine animal theme parks. The park opened in 1970 directly across the lake and less than one mile from Geauga Lake Park in Aurora, Ohio, United States. The small lake separated the two parks. Wildwater Kingdom, a small waterpark built by Cedar Fair in 2005, occupied the property until it closed in 2016.\nQuestion: Blizzard Beach, a small waterpark built by Cedar Fair in 2005, occupied the property until it closed in 2016. True, False, or Neither? False\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer.\nQuestion: Joseph Maurice Ravel was smart. True, False, or Neither? 
Neither\n###\nBoneyard Beach is a 1995 album by Raleigh, North Carolina band Dish, led by singer and pianist Dana Kletter, on Interscope Records. The album was produced by John Agnello at Ardent Studios in Memphis, Tennessee. Interscope's VP, Tom Whalley, told \"Billboard\" magazine that \"the high quality of songwriting in Dish and the sound of Dana's voice are two things that set this band apart.\"\nQuestion: Boneyard Beach was Dish's debut album. True, False, or Neither? Neither\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015.\nQuestion: The team wasn't able to participate in the 1985 season. True, False, or Neither? False\n###\nSeven Little Monsters is a children's picture book by American author and illustrator Maurice Sendak. \"Seven Little Monsters\" was published by Harper & Row in 1977 and served as the basis for the Canadian-Chinese television production of the same name (2000-2007).\nQuestion: The television production of Seven Little Monsters started ten years after the book was published. True, False, or Neither?", "doc_id": 186, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31827, 30534, 33012, 14122], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Corn crab soup is a dish found in Chinese cuisine, American Chinese cuisine, and Canadian Chinese cuisine. The soup is actually cream of corn soup with egg white and crab meat or imitation crab meat added. It is most likely of southern Chinese origin.\nQuestion: Corn crab soup is definitely of northern Chinese origin. True, False, or Neither? False\n###\nSqueezing Out Sparks is the fourth studio album by English musician Graham Parker and his band the Rumour. It was voted album of the year in the 1979 \"Village Voice\" Pazz & Jop Critics Poll and later ranked number 334 on \"Rolling Stone\" magazine's list of the 500 greatest albums of all time. Although the Rumour were not credited on the cover, their name was included on the album label.\nQuestion: Graham Parker has sold at least one album True, False, or Neither? True\n###\nWinning America is a documentary television film about the Canadian band Said the Whale. It follows the band on their first US tour down through California, and then to South by Southwest. It premiered on CBC Television on July 23, 2011. The film was directed by Brent Hodge and Thomas Buchan, and was produced by Brent Hodge, Jon Siddall and Sheila Peacock. It was nominated for a Leo Award in 2012.\nQuestion: The band is called Said the Bird. True, False, or Neither? False\n###\n\"Big Jet Plane\" is a song written by Australian singer/songwriter Angus Stone, and originally recorded by Stone, under the name \"Lady of the Sunshine\", on his 2009 album \"Smoking Gun\". 
It was then re-recorded by Stone and his sister Julia, as the duo Angus & Julia Stone, and released on their 2010 album \"Down the Way\", as well as their 2010 EP \"Big Jet Plane\".\nQuestion: Down the Way was Stone's first album. True, False, or Neither? False\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart.\nQuestion: Look at My Dab was released as a single after October 31st, 2015. True, False, or Neither?", "doc_id": 266, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31442, 26531, 1283, 5169], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Big Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism.\nQuestion: Big Sky is considered a town and has a town government. True, False, or Neither? False\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Nick Harper has an album known as Smithereens. True, False, or Neither? True\n###\nAmericana Deluxe is the second studio album by Big Bad Voodoo Daddy. This album is also sometimes called \"Big Bad Voodoo Daddy\", as the album cover prominently displays a stylized \"Big Bad Voodoo Daddy\" logo and does not feature the phrase \"Americana Deluxe\" on it. However, the liner notes and the band's website clearly show that the true title is indeed \"Americana Deluxe\".\nQuestion: Big Bad Voodoo Daddy has 3 members. True, False, or Neither? Neither\n###\nWayne Coles-Janess is an Australian producer, writer and director of drama and documentary film and TV programs. Based in Melbourne, Australia, he has produced documentaries about frontier places in the country. He has also made some documentaries in several international locations, including during times of war.\nQuestion: He will retire at the age of 50. True, False, or Neither? Neither\n###\nMurder Rock (Italian: Murderock - uccide a passo di danza; also known as Murder-Rock: Dancing Death, Slashdance and The Demon Is Loose!) is a 1984 Italian giallo film starring Olga Karlatos and Ray Lovelock, and written and directed by Lucio Fulci. Fulci recalled the producer forced him to turn the film into a musical with the music of Keith Emerson due to the success of \"Flashdance\".\nQuestion: Flashdance was a dance studio. 
True, False, or Neither?", "doc_id": 711, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32479, 13965, 19984, 15880], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Real Madrid Club de F\u00fatbol C, commonly known as Real Madrid C, was a Spanish association football team that played in the Tercera Divisi\u00f3n \u2013 Group 7. It was Real Madrid's second reserve team. They played their home games at La Ciudad del Real Madrid in Valdebebas outside the city of Madrid. At the end of the 2014\u201315 Tercera Division, Real Madrid C was disbanded.\nQuestion: Most players in Real Madrid C were Spanish. True, False, or Neither? Neither\n###\nBe Mine Tonight was the debut single from New Zealand band Th' Dudes. It was released in May 1979 as a Double A-side with Walking In Light and reached No. 36 on the New Zealand music charts. Be Mine Tonight won Single of the Year at the 1979 New Zealand Music Awards. It was voted 27th best New Zealand Song of the 20th Century by APRA members and featured on the Nature's Best CD.\nQuestion: Be Mine Tonight was successful in Australia True, False, or Neither? Neither\n###\nJames Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\".\nQuestion: James Wyatt won a literary award for \"Dungeon Master's Guide\". True, False, or Neither? Neither\n###\nElmhurst is a residential neighborhood in the southernmost part of Oakland, California. Originally a separate town, it was annexed by Oakland in 1909, and today is considered part of East Oakland. It lies at an elevation of 39 feet (12 m). It contains the Eastmont Town Center.\nQuestion: Eastmont Town Center is the best one in Oakland. True, False, or Neither? Neither\n###\nThe Big Cube is a 1969 American thriller film directed by Tito Davison and starring Lana Turner, Karin Mossberg, George Chakiris, Daniel O'Herlihy and Richard Egan; it was one of Lana Turner's last movies. It is notable for its aggressive portrayal of LSD use and the 1960s youth counterculture as vicious evils.\nQuestion: The Big Cube had a lot of drugs True, False, or Neither?", "doc_id": 477, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6051, 39625, 45458, 6652], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Spring Fine Art Exhibition of Leningrad artists (Leningrad, 1954) (Russian: \"\"\u0412\u0435\u0441\u0435\u043d\u043d\u044f\u044f \u0432\u044b\u0441\u0442\u0430\u0432\u043a\u0430 \u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0435\u043d\u0438\u0439 \u043b\u0435\u043d\u0438\u043d\u0433\u0440\u0430\u0434\u0441\u043a\u0438\u0445 \u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a\u043e\u0432 1954 \u0433\u043e\u0434\u0430\"\" ) become one of the largest Soviet Art Exhibition of 1954 and one of the first Art Exhibitions after Stalin death. The Exhibition took place in Leningrad Union of Soviet Artists Exhibition Halls on Bolshaya Morskaya st. 38.\nQuestion: Leningrad artists displayed art inside one of the biggest exhibitions in Russia after the death of Stalin. True, False, or Neither? True\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: Foals have been covered by rush. True, False, or Neither? Neither\n###\nHolding Back the River is the third studio album by Wet Wet Wet. It was released on 30 October 1989. The album is actually a follow-up to their debut, \"Popped In Souled Out\". Its offspring singles were \"Sweet Surrender\", \"Broke Away\", \"Hold Back the River\" and \"Stay With Me Heartache (Can't Stand the Night)\". The album reached #2 in the charts.\nQuestion: There were three albums by Wet Wet Wet before Holding Back the River. True, False, or Neither? False\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances.\nQuestion: Spaceballs is both funny and dramatic at the same time. True, False, or Neither? Neither\n###\nThe Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title.\nQuestion: The Palm Harbor Open was played over 10 hours ago. True, False, or Neither?", "doc_id": 163, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33671, 29184, 3843, 14809], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cambarus cryptodytes, the Dougherty Plain cave crayfish or Apalachicola cave crayfish, is a small, freshwater crayfish endemic to Florida and Georgia in the United States. It is an underground species known only from waters associated with the Floridan aquifer.\nQuestion: Apalachicola cave crayfish lives in saltwater True, False, or Neither? False\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant.\nQuestion: Neelix is a good person. True, False, or Neither? Neither\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird.\nQuestion: The North African ostrich is known for burying it's head in the sand. True, False, or Neither? Neither\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer.\nQuestion: Albrect von Houwald was married to Helene Grafin von Carmer. True, False, or Neither? Neither\n###\nHideki Kamiya (\u795e\u8c37 \u82f1\u6a39 , Kamiya Hideki , born December 19, 1970) is a video game designer and director working for PlatinumGames. Kamiya was formerly employed by Capcom and Clover Studio, and founded PlatinumGames in 2006, along with other former Capcom staff.\nQuestion: Hideki Kamiya worked as a video game designer and director for Capcom and Clover Studio. True, False, or Neither?", "doc_id": 942, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21846, 9751, 2638, 11832], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Margarita la tornera (Margarita the Gatekeeper) is an opera in three acts composed by Ruperto Chap\u00ed to a libretto by Carlos Fern\u00e1ndez Shaw, based on a dramatic poem by Jos\u00e9 Zorrilla. It premiered on February 24, 1909 at the Teatro Real in Madrid in a performance conducted by the composer. An acclaimed recording of the opera came out in 1999 with Pl\u00e1cido Domingo and Elisabete Matos.\nQuestion: Margarita la tornera is translated to Margarita the Gatekeeper in English. True, False, or Neither? 
True\n###\nHyde, Jekyll, Me () is a 2015 South Korean television series starring Hyun Bin and Han Ji-min. It is based on Lee Choong-ho's webtoon \"Dr. Jekyll Is Mr. Hyde\" (), which gave a romantic comedy spin on the literary character. The series aired on SBS from January 21 to March 26, 2015 on Wednesdays and Thursdays at 21:55 for 20 episodes.\nQuestion: Hyde, Jekyll, Me is based on the Dr. Jekyll and Mr. Hyde character True, False, or Neither? True\n###\nThe Nigeria U-20 men's national soccer team, also known as the Nigeria Under-20s or nicknamed the \"Flying Eagles\", is the youth team for national soccer in Nigeria. It plays a large role in the development of Nigerian soccer, and is considered to be the feeder team for the Nigeria men's national soccer team and is controlled by the Nigeria Football Federation.\nQuestion: There are no star players on the U-20 team currently. True, False, or Neither? Neither\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley.\nQuestion: Colorz of Rage hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley had blossoming careers independent of the urban drama film. True, False, or Neither? Neither\n###\nHeck's Department Store, a chain of West Virginia based discount department stores, was founded by Boone County natives and businessmen Fred Haddad, Tom Ellis, and Lester Ellis and wholesale distributor Douglas Cook. The Heck's name was a combination of the names Haddad, Ellis and Cook. Haddad served as President, Lester Ellis was Vice-President, and Tom Ellis was Secretary-Treasurer.\nQuestion: Heck's Department Store first name ends with s. True, False, or Neither?", "doc_id": 775, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13108, 26038, 14095, 18751], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Am\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics.\nQuestion: Am\u00e9lie Simone Mauresmo thought her opponents were easy during the 2004 Summer Olympics True, False, or Neither? Neither\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene.\nQuestion: The Foo Fighters were popular worldwide True, False, or Neither? Neither\n###\nUna Lillian Paisley (born 18 November 1922 in Kew in Melbourne in Victoria - died 1977 in Kew, Victoria) was an Australian cricket player. She played twelve Test matches for the Australia national women's cricket team. 
She captained the Australia national women's cricket team in four Test matches against New Zealand and England.\nQuestion: Una Lillian Paisley won a gold medal. True, False, or Neither? Neither\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: Nadeem F. Paracha is a talented author in Pakistan. True, False, or Neither? Neither\n###\nGiovanni Visconti \u2014 according to Lorenzo Cardella nephew of Pope Gregory X. He was ostensibly created cardinal-bishop of Sabina by his uncle in 1275 and in 1276 was named judge in the case concerning the translation of bishop Giovanni of Potenza to the archbishopric of Monreale, postulated by the cathedral chapter of Monreale. He died in 1277 or 1278.\nQuestion: Giovanni Visconti died in both 1277 and 1278. True, False, or Neither?", "doc_id": 91, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11460, 27200, 8810, 24136], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012.\nQuestion: Kaley Cuoco is an actress True, False, or Neither? Neither\n###\nThe Grove is a business district located between Vandeventer and Kingshighway in the Forest Park East neighborhood of St. Louis, Missouri. It is near Barnes-Jewish Hospital, Washington University School of Medicine, Saint Louis University Hospital, Saint Louis University School of Medicine, Forest Park, and Tower Grove Park.\nQuestion: The Grove is a business district in St. Louis, Missouri that has many hospitals and medical schools, to mention a few: Barnes-Jewish Hospital, Washington University School of Medicine, Saint Louis University Hospital, Saint Louis University School of Medicine, Forest Park, and Tower Grove Park and many others. True, False, or Neither? Neither\n###\nTexas Monthly v. Bullock 489 U.S. 1 (1989) was a case brought before the US Supreme Court in November 1988. The case (initiated by the publishers of \"Texas Monthly\", a well-known general-interest magazine in Texas) was to test the legality of a Texas statute that exempted religious publications from paying state sales tax.\nQuestion: Texas Monthly was successful in its legal action. True, False, or Neither? Neither\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. 
He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia.\nQuestion: Wallace Michael Ross founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia while teaching at Sturgess School in Derby. True, False, or Neither? Neither\n###\nMelinda Jacobs (born August 2, 1967) is an American born entertainment reporter, currently providing content to Secrets of the City and [1]. Over the span of her 20-year career in television/radio Jacobs has interviewed celebrities such as Quincy Jones, Kathie Lee Gifford, Joan Rivers, Demi Lovato and Adrian Peterson.\nQuestion: They weren't successful. True, False, or Neither?", "doc_id": 887, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8269, 29922, 38244, 613], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A Lady's Morals is a 1930 American Pre-Code film offering a highly fictionalized account of singer Jenny Lind. The movie features opera diva Grace Moore as Lind, Reginald Denny as a lover, and Wallace Beery as P. T. Barnum; Beery would play Barnum again four years later in \"The Mighty Barnum\". The film contains some fine opera arias by Moore and was directed by Sidney Franklin.\nQuestion: A Lady's Morals was made in the 1930s True, False, or Neither? True\n###\nThe Local Government (Northern Ireland) Act 1972 (1972 c. 9) was an Act of the Parliament of Northern Ireland that constituted district councils to administer the twenty-six local government districts created by the Local Government (Boundaries) Act (Northern Ireland) 1971, and abolished the existing local authorities in Northern Ireland.\nQuestion: Northern Ireland has several localized government districs True, False, or Neither? True\n###\nCraig Lahiff (April 23, 1947 \u2013 2 February 2014) was an Australian film director. He grew up in the Adelaide suburb of Somerton Park and studied science at Adelaide University, then trained as a systems consultant before studying arts in film at Flinders University. He began working in the film industry on crews for movies such as \"Sunday Too Far Away\" and \"The Fourth Wish\".\nQuestion: He had a different career before becoming a director. True, False, or Neither? True\n###\nRonald Mayorga S\u00e1nchez (born June 26, 1984, Yumbo, Valle del Cauca, Colombia) is an award-winning Colombian journalist and TV anchor of \"Red\" in Caracol Television in Colombia. As a radio journalist who works with \"Blue Radio\" one of the radio station's imported from Latin America as a host in \"Vox Populi\".\nQuestion: Ronald Mayorga S\u00e1nchez has never been a tv anchor True, False, or Neither? False\n###\nHarry Brand (October 20, 1895 \u2013 February 22, 1989) was an American press agent. 
Described as \"the mastermind who made Shirley Temple the most famous child star in history, Betty Grable a GI Joe pinup girl and Marilyn Monroe a sex goddess,\" Brand was the head of publicity at 20th Century Fox from 1935 until 1962.\nQuestion: Brand was the head of publicity at Fox for half a decade True, False, or Neither?", "doc_id": 478, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17761, 36576, 34755, 14056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Asana ( ) is a web and mobile application designed to help teams track their work. It was founded in 2008 by Facebook co-founder Dustin Moskovitz and ex-engineer Justin Rosenstein, who both worked on improving the productivity of employees at Facebook.\nQuestion: Asana is a mobile only application. True, False, or Neither? False\n###\nDangerously Excited (; lit. \"I'm a Civil Servant\") is a 2012 South Korean comedy-drama film starring Yoon Je-moon as a stuffy municipal bureaucrat who learns to embrace life when a budding rock band moves into his basement. The film premiered at the 2011 Busan International Film Festival and also screened at the 2012 Udine Far East Film Festival.\nQuestion: The film was shown before 2012. True, False, or Neither? True\n###\nMakri (Greek: \u039c\u03ac\u03ba\u03c1\u03b7) is a village and a municipal district of the city of Alexandroupoli, Evros regional unit, Greece. In 2011 its population was 924 for the village, and 1,919 for the municipal district. It is situated on the Aegean Sea coast, 12\u00a0km west of downtown Alexandroupoli. Makri has an exit on the Egnatia Odos motorway, that passes north of the village.\nQuestion: The population of Makri was 1919 in the viilliage True, False, or Neither? False\n###\nChicken Shack are a British blues band, founded in the mid-1960s by Stan Webb (guitar and vocals), Andy Silvester (bass guitar), and Alan Morley (drums), who were later joined by Christine Perfect (McVie) (vocals and keyboards) in 1968. Chicken Shack has performed with various line-ups, Stan Webb being the only constant member.\nQuestion: Chicken Shack is only a game. True, False, or Neither? False\n###\nGalli Galli Sim Sim (Devanagari: \u0917\u0932\u0940 \u0917\u0932\u0940 \u0938\u093f\u092e \u0938\u093f\u092e) is the Hindi language adaptation of the American children's television series \"Sesame Street\" (famous for its Muppets), for India. It is co-produced by Sesame Workshop and Turner Entertainment, through Miditech. The show's Indian production company is known as Sesame Workshop India.\nQuestion: Turner Entertainment did not want to work on this project. True, False, or Neither?", "doc_id": 877, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38435, 1170, 1809, 45130], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lucas Franchoys the Younger or Lucas Franchoys II (alternative spellings of name: Lucas Franchois, Lucas Fran\u00e7ois, Louis Franchoys) (28 June 1616 in Mechelen \u2013 3 April 1681 in Mechelen) was a Flemish Baroque painter from Mechelen, who painted numerous altarpieces and portraits in a style reminiscent of Anthony van Dyck.\nQuestion: Lucas Franchoys brothers often helped him . True, False, or Neither? Neither\n###\n\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: Bosch is a film. True, False, or Neither? False\n###\nFast-moving consumer goods (FMCG) or consumer packaged goods (CPG) are products that are sold quickly and at relatively low cost. Examples include non-durable goods such as packaged foods, beverages, toiletries, over-the-counter drugs and many other consumables. In contrast, durable goods or major appliances such as kitchen appliances are generally replaced over a period of several years.\nQuestion: FMCG are sold slowly True, False, or Neither? False\n###\nCognizant is an American multinational corporation that provides IT services, including digital, technology, consulting, and operations services. It is headquartered in Teaneck, New Jersey, United States. Cognizant is listed in the NASDAQ-100 and the S&P 500 indices. It was founded as an in-house technology unit of Dun & Bradstreet in 1994, and started serving external clients in 1996.\nQuestion: Dun & Bradstreet is the IT service division of the Cognizant corporation. True, False, or Neither? False\n###\nBurton & Taylor is a BBC Four TV film directed by Richard Laxton, and based on the legendary acting duo and former husband and wife, Richard Burton and Elizabeth Taylor, during their preparation for a 1983 theatrical production of the play, \"Private Lives\". The film stars Helena Bonham Carter and Dominic West in the title roles.\nQuestion: Private Lies came out the year after 1981. True, False, or Neither?", "doc_id": 413, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21975, 26881, 21817, 41358], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. 
It is a concept album with a storyline telling the tale of a young couple who go to watch a puppet show in Budapest in the 1700s, and end up being turned into undead puppets by the Puppet Master and his wife.\nQuestion: The Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It was a great piece of work. True, False, or Neither? Neither\n###\nBeilin District () is one of nine districts of Xi'an, the capital of Shanxi province, China. The well-known Small Wild Goose Pagoda is also located in the district. The smallest, but most densely populated, of Xi'an's county-level divisions, it borders the districts of Xincheng to the northeast, Yanta to the south, and Lianhu to the northwest.\nQuestion: Beilin District is well traveled True, False, or Neither? Neither\n###\nJuan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974.\nQuestion: Per\u00f3n would have been elected a third time if not for the coup. True, False, or Neither? Neither\n###\nTakeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and EMI Music Publishing. It was founded by English rapper Kwasi Danquah III (commonly known as Tinchy Stryder) and EMI Music Publishing\u2019s UK president and EMI European creative president, Guy Moot, as a publishing arm solely for Danquah's music in December 2008.\nQuestion: Takeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and another company. True, False, or Neither? True\n###\nThe 2009 British Speedway Championship was the 2009 edition of the British Speedway Championship. The Final took place on 20 May at Wimborne Road in Poole, England. The Championship was won by Chris Harris, who beat Edward Kennett, Tai Woffinden and Lee Richardson in the final heat. It was the second time Harris had won the title.\nQuestion: The 2009 British Speedway Championship took place more than 1001 days ago. True, False, or Neither?", "doc_id": 834, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34031, 40301, 24741, 36421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "David Gregory \"Dave\" Smith (born 24 July 1955) is a retired male race walker from Australia, who represented his native country at two consecutive Summer Olympics, starting in 1980 (Moscow). His best Olympic result was finishing in tenth place in the men's 20\u00a0km race at the 1984 Summer Olympics.\nQuestion: Dave Smith is from Russia True, False, or Neither? False\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. 
The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944.\nQuestion: Tippet was a prolific recording artist before meeting Holst. True, False, or Neither? Neither\n###\nCougar Town is an American television sitcom that ran for 102 episodes over six seasons, from September 23, 2009 until March 31, 2015. The first three seasons aired on ABC, with the series moving to TBS for the final three seasons. The pilot episode was broadcast after \"Modern Family\". ABC officially gave the series a full season pickup on October 8, 2009.\nQuestion: Cougar Town was forced to move to TBS True, False, or Neither? Neither\n###\nSomething Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date.\nQuestion: Hemorrhage (In My Hands)\" which reached #322 on the \"Billboard\" Hot 100 charts. True, False, or Neither? False\n###\nDatong () is a prefecture-level city in northern Shanxi province, People's Republic of China, located in a basin at an elevation of 1040 m and bordering Inner Mongolia to the north and west and Hebei to the east. It had a population of 3,318,057 at the 2010 census of whom 1,629,035 lived in the built up area made of 3 out of 4 urban districts, namely Chengqu, Kuangqu and Nanjiao District.\nQuestion: At the 2010 census more than 2 million people lived in the built up area made of 3 out of 4 urban districts. True, False, or Neither?", "doc_id": 450, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24604, 32715, 43858, 6884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "No. 27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville.\nQuestion: No. 59 Squadron RAAF is also located at RAAF Base Townsville True, False, or Neither? Neither\n###\nThe R-33 (Russian: \u0412\u044b\u043c\u043f\u0435\u043b \u0420-33 , NATO reporting name: AA-9 Amos) is a long-range air-to-air missile developed by the Vympel. It is the primary armament of the MiG-31 interceptor, intended to attack large high-speed targets such as the SR-71 Blackbird, the B-1 Lancer bomber, and the B-52 Stratofortress.\nQuestion: It does not have a name that is called a reporting name True, False, or Neither? False\n###\nLive at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. 
It has only been made available at live Van Morrison concerts and at the Van Morrison Official website.\nQuestion: Van Morrison sells the album at his concerts. True, False, or Neither? True\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland.\nQuestion: Phacelia pedicellata is not native to the United States True, False, or Neither? False\n###\nGrantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station.\nQuestion: Grantham North Services has 3 parks True, False, or Neither?", "doc_id": 348, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34367, 25594, 34344, 22080], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gloria Marie Steinem (born March 25, 1934) is an American feminist, journalist, and social political activist, who became nationally recognized as a leader and a spokeswoman for the American feminist movement in the late 1960s and early 1970s. She is listed in Who's Who in America.\nQuestion: Gloria Marie Steinem was a lesbian True, False, or Neither? Neither\n###\nMichael Cunningham (born November 6, 1952) is a U.S. novelist and screenwriter. He is best known for his 1998 novel \"The Hours\", which won the Pulitzer Prize for Fiction and the PEN/Faulkner Award in 1999. Cunningham is a senior lecturer of creative writing at Yale University.\nQuestion: Michael Cunningham is over 50 years old today True, False, or Neither? True\n###\nThe 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The 315th Group controls all operational McDonnell Douglas C-17 Globemaster III flying squadrons of the 315th Airlift Wing. It was activated in 1992, when Air Force Reserve Command implemented the Objective Wing organization.\nQuestion: 315th Airlift Wing consists of Air force reserves and active-duty military personal True, False, or Neither? Neither\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008.\nQuestion: Laura Elena Z\u00fa\u00f1iga Huizar is smart. True, False, or Neither? Neither\n###\nThe 1972 Grantland Rice Bowl was an NCAA College Division game following the 1972 season, between the Louisiana Tech Bulldogs and the Tennessee Tech Golden Eagles. 
Louisiana Tech quarterback Denny Duron was named outstanding offensive player, while his teammate linebacker Joe McNeely was named outstanding defensive player.\nQuestion: Louisiana Tech had both outstanding offensive player and outstanding defensive player, Joe McNeeley and Denny Duron, repectively. True, False, or Neither?", "doc_id": 136, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8585, 34382, 13933, 9216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\".\nQuestion: Daraar was critically well-received. True, False, or Neither? Neither\n###\nGloria Marie Steinem (born March 25, 1934) is an American feminist, journalist, and social political activist, who became nationally recognized as a leader and a spokeswoman for the American feminist movement in the late 1960s and early 1970s. She is listed in Who's Who in America.\nQuestion: Gloria Marie Steinem has 3 children True, False, or Neither? Neither\n###\nThe Newtown Pippin, also known as Albemarle Pippin, is an American apple originated in the late 17th or early 18th century and still cultivated on a small scale. At one time there were two very similar apple cultivars known as the 'Yellow Newtown' ('Albermarle Pippin') and 'Green Newtown' ('Brooke Pippin'), one of which perhaps originated as a sport of the other.\nQuestion: Yellow and Green Pippins are still produced on a small scale though they go by different names. True, False, or Neither? True\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals.\nQuestion: Di Natale had 70 goals. True, False, or Neither? False\n###\nBouck's Island is a farm near Breakabeen, New York within the town of Fulton, Schoharie County, New York near Fultonham, New York. Bouck's Island was the home of former New York governor William C. Bouck. Congressman Joseph Bouck was born on Bouck's Island and Wisconsin Congressman Gabriel Bouck once lived there.\nQuestion: Congressman Joseph Bouck was raised on Bouck's Island. True, False, or Neither?", "doc_id": 128, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39233, 26645, 35527, 25207], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season.\nQuestion: The MVP get sponsorships True, False, or Neither? Neither\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo.\nQuestion: Celly Cel is a very good True, False, or Neither? Neither\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer.\nQuestion: He had more success when he was back in the group True, False, or Neither? Neither\n###\nThe R-33 (Russian: \u0412\u044b\u043c\u043f\u0435\u043b \u0420-33 , NATO reporting name: AA-9 Amos) is a long-range air-to-air missile developed by the Vympel. It is the primary armament of the MiG-31 interceptor, intended to attack large high-speed targets such as the SR-71 Blackbird, the B-1 Lancer bomber, and the B-52 Stratofortress.\nQuestion: The MiG-31 interceptor can attack large high-speed targets such as a Boeing 747. True, False, or Neither? Neither\n###\nGeorge Montgomery (born April 26, 1962) is a retired American basketball player. He played basketball at Illinois, and was a second-round draft selection of the Portland Trail Blazers in the 1985 NBA Draft, though he never played in the NBA. He is the biological father of Warriors center JaVale McGee, but did not raise his son.\nQuestion: George Montgomery can dribble a basketball True, False, or Neither?", "doc_id": 626, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16315, 39231, 44248, 34568], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films.\nQuestion: Jonathan Michael Lovitz met Trump. True, False, or Neither? 
Neither\n###\n\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione.\nQuestion: The Flirts split up in 1990. True, False, or Neither? Neither\n###\nRobert Cary Blanchard (November 5, 1968 \u2013 September 6, 2016) was an American football placekicker in the National Football League. He played eight years for five teams: the New York Jets for his first two years, the Indianapolis Colts after taking 1994 off, the Washington Redskins in 1998, the New York Giants in 1999, and the Arizona Cardinals in his final season.\nQuestion: Robert Blanchard's NFL career began in 1991 True, False, or Neither? Neither\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: In 2015, the City of Orlando tore down the grandstands at Tinker Field to build a new stadium. True, False, or Neither? Neither\n###\nNabokov's Congeries was a collection of work by Vladimir Nabokov published in 1968 and reprinted in 1971 as \"The Portable Nabokov\". Because Nabokov supervised its production less than a decade before he died, it is useful in attempting to identify which works Nabokov considered to be his best, especially among his short stories.\nQuestion: \"The Portable Nabokov\" was supervised by Nabokov 7 years before he died. True, False, or Neither?", "doc_id": 714, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27179, 18359, 1114, 30236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Liberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD).\nQuestion: Areilza had left Democratic Center Union. True, False, or Neither? True\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia.\nQuestion: Wallace Michael Ross died peacefully. True, False, or Neither? 
Neither\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points.\nQuestion: The Cornhuskers played 8 games in 1994 True, False, or Neither? Neither\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: The San Nicolao Tunnel took more than 10 years to construct True, False, or Neither? Neither\n###\n\"Legion\" is an American cable television series created for FX by Noah Hawley, based on the Marvel Comics character David Haller / Legion. It is connected to the \"X-Men\" film series, the first television series to do so. The first season, consisting of eight episodes, began airing on February 8, 2017. A second season was ordered in March 2017.\nQuestion: legion is an american series connected to x men and created for dx by noah hawley with 8 episode for season 1 True, False, or Neither?", "doc_id": 745, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3366, 9124, 5159, 25817], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pisnia zavzhdy z namy (Ukrainian: \u041f\u0456\u0441\u043d\u044f \u0437\u0430\u0432\u0436\u0434\u0438 \u0437 \u043d\u0430\u043c\u0438 ) is a 1975 Soviet Ukrainian musical film, produced by Viktor Storozhenko starring Sofia Rotaru in the main role, as well as Soviet Ukrainian Smerichka vocal-instrumental band. The movie features songs in Ukrainian, Moldovan and Russian of Sofia Rotaru filmed in the background of Ukrainian Carpathian mountains.\nQuestion: Pisnia zavzhdy z namy (Ukrainian: \u041f\u0456\u0441\u043d\u044f \u0437\u0430\u0432\u0436\u0434\u0438 \u0437 \u043d\u0430\u043c\u0438 ) was filmed in 1975 True, False, or Neither? True\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: Home Depot also sells plants. True, False, or Neither? Neither\n###\nThe 18th Street Expressway (often shortened to 18th Street in everyday speech) is a freeway in Kansas City, Kansas that runs from Interstate 35 north to Interstate 70/U.S. Route 24/U.S. Route 40. It carries the U.S. Route 69 designation its entire length.\nQuestion: The 18th Street Expressway is one of the longest freeways in Kansas City. True, False, or Neither? Neither\n###\nThomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. 
He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330.\nQuestion: Thomas Cooper was the best England international footballer. True, False, or Neither? Neither\n###\nEdward Gibbon FRS ( ; 8 May 173716 January 1794) was an English historian, writer and Member of Parliament. His most important work, \"The History of the Decline and Fall of the Roman Empire\", was published in six volumes between 1776 and 1788 and is known for the quality and irony of its prose, its use of primary sources, and its open criticism of organised religion.\nQuestion: \"The History of the Decline and Fall of the Roman Empire\" had six volumes published in four years. True, False, or Neither?", "doc_id": 375, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28631, 29826, 28521, 29347], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro.\nQuestion: Massaro's pupil Martoriello became a good painter. True, False, or Neither? Neither\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class.\nQuestion: Youth in Guatemala are redheads. True, False, or Neither? Neither\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\".\nQuestion: Idichapuli worked in multiple roles on films. True, False, or Neither? True\n###\nWilliam Irving Turner (1890\u20131950), commonly known as Tim Turner or W.I. Turner, was an American architect. He served as a U.S. Forest Service architect and is credited with much of the design of Timberline Lodge on Mount Hood in Oregon, an important and influential work.\nQuestion: William Irving Turner talked to Amy. True, False, or Neither? Neither\n###\nLouis Marx (August 11, 1896 \u2013 February 5, 1982) was an American toy maker and businessman whose company, Louis Marx and Company, was the largest toy company in the world in the 1950s. 
Described by many as an experienced businessman with the mind of child; Louis Marx\u2019s ability to see into the minds of children around the world guided his toy creations and advertising efforts.\nQuestion: Louis Marx lived through the Great Depression. True, False, or Neither?", "doc_id": 637, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26214, 39597, 44507, 31268], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster.\nQuestion: Mithun Chakraborty really likes being the Grandmaster True, False, or Neither? Neither\n###\nThe ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season.\nQuestion: The Adriatic ABA League started in 1998. True, False, or Neither? Neither\n###\nPrincess Amalie \"Auguste\" of Anhalt-Dessau (German: \"Prinzessin Amalie Auguste von Anhalt-Dessau\" ; 18 August 1793 \u2013 12 June 1854) was a German princess of Anhalt-Dessau who was Princess consort of Schwarzburg-Rudolstadt from 1816 to 1854 as the wife of Friedrich G\u00fcnther, Prince of Schwarzburg-Rudolstadt.\nQuestion: Gunther was born in 1793. True, False, or Neither? Neither\n###\nJames Bongani Kamte (born 20 July 1982), nicknamed \"Cobra\", is a South African professional golfer. He has played on the Sunshine Tour, Challenge Tour, European Tour, and Asian Tour. He earned his tour card for the 2008 European Tour season by finishing in the top 30 of the qualifying school.\nQuestion: He started golfing at age 15 True, False, or Neither? Neither\n###\nAmargosa is an unincorporated community and census-designated place in Jim Wells County, Texas, United States. Its population was 291 as of the 2010 census. Prior to 2010, the community was grouped with nearby Owl Ranch as part of the Owl Ranch-Amargosa census-designated place. The community is named for the Amargosa Creek that runs nearby. The word \"amargosa\" means \"bitter\" in Spanish.\nQuestion: Amargosa is a place in Jum Wells County, Texas that has a large population. True, False, or Neither?", "doc_id": 739, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10008, 23932, 43160, 16598], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Identification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\".\nQuestion: The first feature film directed by Skolimowski was called Identification Marks: None (Polish: Rysopsis) in 1964. True, False, or Neither? True\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005.\nQuestion: Samuel Eto'o Fils was born in the second month of the year. True, False, or Neither? False\n###\nThe 2016 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the second edition of the tournament which was part of the 2016 ATP Challenger Tour. It took place in Happy Valley, Australia between 2\u201310 January 2016.\nQuestion: Happy Valley is the host city to the ATP Tour. True, False, or Neither? True\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978.\nQuestion: Gary Sutherland will be inducted into the MLB Hall of Fame True, False, or Neither? Neither\n###\nKirill Olegovich Starkov (Russian:\u041a\u0438\u0440\u0438\u043b\u043b \u041e\u043b\u0435\u0433\u043e\u0432\u0438\u0447 \u0421\u0442\u0430\u0440\u043a\u043e\u0432, born March 31, 1987), is a professional Danish ice hockey player. He is playing for HC Red Ice in the Swiss National League B. He has previously played for CSKA Moscow, Syracuse Crunch, Youngstown Steelhounds, Red Deer Rebels, Fr\u00f6lunda HC, Timr\u00e5 IK, Esbjerg IK and IK Oskarshamn.\nQuestion: kirill olegovich is from russia True, False, or Neither?", "doc_id": 536, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34818, 38786, 18858, 34329], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Walking on Sunshine is a 2014 British romantic musical comedy-drama film directed by Max Giwa and Diana Pasquini. The film features covers of songs from the 1980s and was released on 27 June 2014. It is also a debut role for singer-songwriter Leona Lewis.\nQuestion: Walking on Sunshine was Leona Lewis first and last appearance True, False, or Neither? Neither\n###\nPrivate First Class Jose F. 
Valdez (January 3, 1925 - February 17, 1945) was a United States Army soldier who posthumously received the Medal of Honor \u2014 the United States' highest military decoration \u2014 for his actions near Rosenkranz, France, in the Battle of the Colmar Pocket during World War II.\nQuestion: Jose Valdez was in the army. True, False, or Neither? True\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists.\nQuestion: Contra Conspiracy is a 1999 action film True, False, or Neither? False\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall is in New Mexico True, False, or Neither? False\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\".\nQuestion: Never Be Rude to an Arab is a song by Monty Python from two of their albums True, False, or Neither?", "doc_id": 693, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22838, 4966, 19006, 19313], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wilson Dam is a dam spanning the Tennessee River between Lauderdale County and Colbert County in the U.S. state of Alabama. It impounds Wilson Lake. It is one of nine Tennessee Valley Authority (TVA) dams on the Tennessee River. The dam was declared a National Historic Landmark on November 13, 1966.\nQuestion: Wilson Dam is far from Atlanta. True, False, or Neither? Neither\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910.\nQuestion: Korea was annexed to the Empire of Japan. True, False, or Neither? True\n###\nSpecies III is a 2004 science fiction thriller television film. The film, directed by Brad Turner, is the third installment of the \"Species\" series, and stars Robin Dunne, Robert Knepper, Sunny Mabrey, Amelia Cooke and John Paul Pitoc. Natasha Henstridge, who was contracted to a trilogy commencing with the first \"Species\" film, briefly reprises the role of Eve in the opening scene.\nQuestion: Species III is not a crime drama. True, False, or Neither? 
True\n###\nRylstone was a former electoral district of the Legislative Assembly in the Australian state of New South Wales, created in 1894 from part of Mudgee and named after and including Rylstone. It was abolished in 1904, with the downsizing of the Legislative Assembly after Federation.\nQuestion: During its existence, Rylstone was the smallest electoral district in New South Wales True, False, or Neither? Neither\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944.\nQuestion: Spem in Alium was not one of Tippett's works True, False, or Neither?", "doc_id": 21, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [430, 33303, 16644, 33567], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dennis Princewell Stehr (born 15 May 1984), better known by his stage name Mr Probz, is a Dutch singer, musician and actor. In 2013, he released the song \"Waves\", which was remixed in 2014 by Robin Schulz, becoming an international hit. He has released one album and featured in the film Bolletjes Blues.\nQuestion: Robin internationalized Stehr's song True, False, or Neither? True\n###\nSanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\").\nQuestion: Sanation had support from all polish citizens. True, False, or Neither? Neither\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: Malone only played 1 year of professional football True, False, or Neither? False\n###\nRobert Mehrabian (born July 31, 1941, in Tehran, Iran) is an Iranian-American materials scientist and the Chair, President, and Chief Executive Officer of Teledyne Technologies Incorporated. During the 1990s he served as the seventh President of Carnegie Mellon University in Pittsburgh, Pennsylvania, United States.\nQuestion: Robert Mehrabian was born in Amerrica True, False, or Neither? False\n###\nJohn von Neumann's Universal Constructor is a self-replicating machine in a cellular automata (CA) environment. It was designed in the 1940s, without the use of a computer. The fundamental details of the machine were published in von Neumann's book \"Theory of Self-Reproducing Automata\", completed in 1966 by Arthur W. Burks after von Neumann's death.\nQuestion: John von Neumann's Universal Constructor was created in the early 20th century. 
True, False, or Neither?", "doc_id": 126, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4410, 22152, 23930, 26851], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hawthorne is a census-designated place (CDP) in Mineral County, Nevada, United States. At the 2010 census, the population was 3,269, a decrease since the 2000 census, when it was 3,311. It is the county seat of Mineral County. The nearby Hawthorne Army Depot is the primary economic base of the town.\nQuestion: Hawthorne has a population of 3,926 as of 2019. True, False, or Neither? Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: Russia was almost involved in the conflict. True, False, or Neither? Neither\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: Ralph D. Malone later became a sports announcer. True, False, or Neither? Neither\n###\nDavid Halberstam (April 10, 1934 \u2013 April 23, 2007) was an American journalist and historian, known for his work on the Vietnam War, politics, history, the Civil Rights Movement, business, media, American culture, and later, sports journalism. He won a Pulitzer Prize for International Reporting in 1964. In 2007, while doing research for a book, Halberstam was killed in a car crash.\nQuestion: David Halberstam's books on the Civil Rights movement have sold in excess of 50,000 copies. True, False, or Neither? Neither\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event.\nQuestion: The 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts, that continued for more than a week. True, False, or Neither?", "doc_id": 966, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13154, 4570, 37056, 21845], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sir Hugh Montgomery, 1st Viscount Montgomery of the Great Ards (c. 1560 \u2013 15 May 1636) was an aristocrat and a soldier, known as one of the \"founding fathers\" of the Ulster-Scots along with Sir James Hamilton, 1st Viscount Claneboye. Montgomery was born in Ayrshire at Broadstone Castle, near Beith. He was the son of Adam Montgomery, the 5th Laird of Braidstane, by his wife and cousin.\nQuestion: Sir Hugh Montgomery died in 1560 True, False, or Neither? False\n###\nSpring Fine Art Exhibition of Leningrad artists (Leningrad, 1954) (Russian: \"\"\u0412\u0435\u0441\u0435\u043d\u043d\u044f\u044f \u0432\u044b\u0441\u0442\u0430\u0432\u043a\u0430 \u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0435\u043d\u0438\u0439 \u043b\u0435\u043d\u0438\u043d\u0433\u0440\u0430\u0434\u0441\u043a\u0438\u0445 \u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a\u043e\u0432 1954 \u0433\u043e\u0434\u0430\"\" ) become one of the largest Soviet Art Exhibition of 1954 and one of the first Art Exhibitions after Stalin death. The Exhibition took place in Leningrad Union of Soviet Artists Exhibition Halls on Bolshaya Morskaya st. 38.\nQuestion: Spring Fine Art Exhibition of Leningrad artists was held in Leningrad in 1954. True, False, or Neither? True\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Sheree Victoria Murphy is older than 32 True, False, or Neither? True\n###\nThe Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site.\nQuestion: Cincinnati is 5 miles from Cleves. True, False, or Neither? Neither\n###\nVia Dante is an important and elegant pedestrian street in central Milan, Italy, connecting Piazzale Cordusio (Cordusio (Milan Metro)) and Largo Cairoli (Cairoli (Milan Metro)). It is very near to the city's Castello Sforzesco and is named after the Florentine poet Dante Alighieri. It is known for containing several theatres, shops, restaurants, caf\u00e9s, palaces and bars.\nQuestion: Via Dante starts with an A. True, False, or Neither?", "doc_id": 957, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2554, 10322, 38920, 26732], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Game Plan was a pinball manufacturer that produced pinball tables from 1978 to 1985. 
Game Plan was a subsidiary of AES Technology Systems and was located in Elk Grove Village, Illinois. Game Plan's president was former Chicago Coin table designer Wendell McAdams.\nQuestion: First Game Plan's president was Donald Trump and then Wendell McAdams. True, False, or Neither? Neither\n###\nMarco Masini (born September 18, 1964 in Florence), is an Italian singer-songwriter and musician. . One of his greatest virtues is his voice due to his vocal range, which reaches difficult musical notes, according to experts . . Accompanied by guitarist Riccardo Cherubini, .\nQuestion: Marco Masini has a voice that can reach difficult musical notes. True, False, or Neither? False\n###\nBusby is a census-designated place (CDP) in Big Horn County, Montana, United States. It is on the Northern Cheyenne reservation. The population was 745 at the 2010 census. The town is near the site of the Battle of the Rosebud and the associated Rosebud Battlefield State Park, where General George Custer forces encountered Sioux and Cheyenne forces led by Crazy Horse.\nQuestion: George Custer liked being a general. True, False, or Neither? Neither\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017.\nQuestion: The Overwatch World Cup 2017 took place in 2015 True, False, or Neither? False\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners.\nQuestion: Karan Johar's was in mumbai on february 15 1999 True, False, or Neither?", "doc_id": 457, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15655, 5291, 37484, 10690], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Edward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis.\nQuestion: Georgia Hart and B.J. Annis have at least one son True, False, or Neither? True\n###\nThe Pari Aike Formation is a Late Cretaceous geologic formation of Cenomanian (formerly believed to be Maastrichtian) age in southern Patagonia, Argentina. The giant titanosaur \"Puertasaurus\", the megaraptoran \"Orkoraptor\", and the ornithopod \"Talenkauen\" have been recovered from the formation alongside turtles and crocodiles.\nQuestion: The Pari Aike Formation is located in South America. True, False, or Neither? 
True\n###\nAdenanthos terminalis, commonly known as gland flower, yellow gland flower or adenanthos, is a one metre tall shrub in the family Proteaceae. It is found in south eastern regions of Australia, in the states of South Australia and Victoria, and is the most widespread of the two \"Adenanthos\" species occurring outside of Western Australia.\nQuestion: Adenanthos terminalis is not found in Australia. True, False, or Neither? False\n###\n\"Birds of a Feather\" is a 1998 song by the American band Phish. It is the second track from their 1998 album \"The Story of the Ghost\" and was released as their twelfth promotional single by Elektra Records. The song is a funk rock song written by the entire band and lyricist Tom Marshall.\nQuestion: Many people didn't like that song. True, False, or Neither? Neither\n###\nThis is a list of Japanese idols; a type of celebrity in Japan. The word \"idol\" is almost always used to refer to a young woman, although there a significant number of male idols. The following list includes both female and male idols as well as both solo idols and idol groups.\nQuestion: Japanese celebrities almost always prefer young women. True, False, or Neither?", "doc_id": 253, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29569, 10157, 6171, 4623], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals.\nQuestion: We're an American Band has a Z. True, False, or Neither? False\n###\nDelano Andre Howell (born November 17, 1989) is a former strong safety. He was originally signed by the Buffalo Bills as an undrafted free agent in 2012. Howell played his four years of college football at Stanford University first as a running back, before switching to safety. He is the younger brother of Dan Howell, who played football for the University of Washington.\nQuestion: Delano Andre Howell is an only child. True, False, or Neither? False\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners.\nQuestion: Karan Johar's reached the zenith of his career in 1999 True, False, or Neither? Neither\n###\n\"The Encounter\" is episode 151 of the American television series \"The Twilight Zone\". First broadcast on May 1, 1964, its racial overtones caused it to be withheld from syndication in the U.S. On January 3, 2016, the episode was finally reaired as part of Syfy's annual \"Twilight Zone\" New's Year Eve marathon.\nQuestion: \"The Encounter\" was the 151 episode of the American television series \"Black Mirror True, False, or Neither? 
False\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: True as a Turtle had Jim Smith in it. True, False, or Neither?", "doc_id": 54, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21936, 15417, 5393, 43529], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hell's Kitchen Australia is an Australian cooking reality competition television series which premiered on the Seven Network on 6 August 2017. The series is hosted by British chef Marco Pierre White, who previously hosted two seasons of the British version of the format and appeared in rival program \"MasterChef Australia\".\nQuestion: Marco White doesn't cook food. True, False, or Neither? False\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley.\nQuestion: Riley and Redman were good friends since they worked together. True, False, or Neither? Neither\n###\nMaris Soule (born June 19, 1939) is an American author of romance and romantic suspense novels, mysteries, and short stories. Her latest book, \"Eat Crow and Die\", is a mystery novel. Her books feature a variety of settings and situations, including the Iditarod Trail Sled Dog Race, Search and Rescue dogs, barrel racing, dressage, and a Rhodesian Ridgeback puppy.\nQuestion: Maris Soule writes commercial scripts. True, False, or Neither? False\n###\nLouis S. Peterson (June 17, 1922 \u2013 April 27, 1998) was a playwright, actor, screenwriter, and professor. He was an American playwright and the first African-American playwright to have a dramatic play produced on Broadway. He was also one of the first African-American writers to be nominated for an Emmy Award.\nQuestion: Louis S. Peterson was born in the US True, False, or Neither? True\n###\nLouis Armstrong (1901\u20131971), nicknamed Satchmo or Pops, was an American trumpeter, composer, singer and occasional actor who was one of the most influential figures in jazz. His career spanned five decades, from the 1920s to the 1960s, and different eras in jazz.\nQuestion: Louis Armstrong was born more than 9999 days ago. True, False, or Neither?", "doc_id": 549, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16020, 43622, 17789, 23741], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern.\nQuestion: McKinnon scored the most points in the league in 2014. True, False, or Neither? Neither\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry.\nQuestion: Chuck Berry's name is synonymous with \"throw fruit.\" True, False, or Neither? True\n###\nA conjectural portrait is a portrait made of a historical figure for whom no authentic contemporary portrait is available. The depiction, then, may be variously informed by written accounts of physical appearance, conjecture based on the subject's culture and background, and/or the artist's conception of the subject's inner essence.\nQuestion: A conjectural portrait is hard to identify. True, False, or Neither? Neither\n###\nBlack Wind, White Land is a 1993 documentary film, researched and produced by the founders of the Chernobyl Children's Project International and explores the Chernobyl nuclear disaster of 1986 and its consequences for the handicapped development of the people in Belarus, Russia and Ukraine. The film was directed by Gene Kerrigan and produced by Ali Hewson, the wife of U2's singer Bono.\nQuestion: Gene Kerrigan has directed at least one documentary film. True, False, or Neither? True\n###\nFast-moving consumer goods (FMCG) or consumer packaged goods (CPG) are products that are sold quickly and at relatively low cost. Examples include non-durable goods such as packaged foods, beverages, toiletries, over-the-counter drugs and many other consumables. In contrast, durable goods or major appliances such as kitchen appliances are generally replaced over a period of several years.\nQuestion: Durable items get changed every so often True, False, or Neither?", "doc_id": 426, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6900, 30668, 289, 38478], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Revisited is a 1960 album by Tom Lehrer, consisting of live recordings of all the songs from 1953's \"Songs by Tom Lehrer\". The CD reissue of the album contains two additional tracks that Lehrer wrote and performed for the PBS television show \"The Electric Company\" (and produced and conducted by Joe Raposo).\nQuestion: Revisited is a 1960 album contains songs sung by Tom Lehre True, False, or Neither? 
True\n###\nPearse Island is an island in western British Columbia, Canada, in the Portland Inlet, an inlet of the Pacific Ocean. The island was first charted in 1793 by George Vancouver during his 1791-95 expedition. It was named by George Henry Richards, captain of \"HMS Plumper\", circa 1860, in honour of William Alfred Rombulow Pearse of the Royal Navy, who had been commander of \"HMS Alert\".\nQuestion: The island is prone to bad weather True, False, or Neither? Neither\n###\nPietro Ferrero (2 September 1898 \u2013 2 March 1949) was the founder of Ferrero SpA, an Italian confectionery and chocolatier company. His company invented Nutella, a hazelnut-cream spread, which is now sold in over 160 countries. The famous Ferrero Rochers are also made by his company, Ferrero, as were Tic-Tacs and various Kinder chocolates.\nQuestion: company survives thanks to nutella True, False, or Neither? Neither\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street.\nQuestion: A village is better than a civil parish. True, False, or Neither? Neither\n###\nMark Baltz is a former official in the National Football League (NFL) from 1989 through 2013. He has worked as a head linesman throughout his entire career in the NFL and has been assigned to 21 post-season games, including five conference championship games (1998, 1999, 2000, 2001, 2004). He wore uniform number 26.\nQuestion: number 26 represents Mark Baltz True, False, or Neither?", "doc_id": 352, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25353, 18075, 14696, 11948], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Demoniac were a heavy metal band from New Zealand formed in Wellington in 1993 by singer and bass player Lindsay Dawson, guitarist Sam Totman and Drummer Steve Francis. They later moved to London, UK. Three of the members went on to form DragonForce. Their rather unusual musical style has often been labeled as \"blackened power metal\".\nQuestion: Demoniac were formed over 5 years ago True, False, or Neither? True\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state.\nQuestion: Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, US. The plant is on the Endangered species list and is the rarest known plant in the state. True, False, or Neither? True\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. 
He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium.\nQuestion: Moulton is 68 years old. True, False, or Neither? True\n###\nDeanne Olivia Bell is an American television personality currently hosting CNBC's reality docu-series \"Make Me a Millionaire Inventor.\" She has previously worked on PBS's \"Design Squad\", Discovery Channel's \"Smash Lab\", and National Geographic's \"The Egyptian Job\". She has also co-hosted DIY Network's \"Money Hunters\" and ESPN's \"Rise Up.\"\nQuestion: Deanne Olivia Bell ends with l. True, False, or Neither? True\n###\n\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough.\nQuestion: At least three dozen episodes of South Park aired in the United States before \"Merry Christmas, Charlie Manson!\" originally aired. True, False, or Neither?", "doc_id": 370, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14250, 31602, 24994, 7773], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Henry II (18 April 1503 \u2013 25 May 1555), nicknamed \"Sang\u00fcesino\" because he was born at Sang\u00fcesa, was the King of Navarre from 1517, although his kingdom had been reduced to a small territory north of the Pyrenees by the Spanish conquest of 1512. Henry succeeded his mother, Queen Catherine, upon her death. His father was her husband and co-ruler, King John III, who died in 1516.\nQuestion: Henry II breathed air. True, False, or Neither? True\n###\nJefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time.\nQuestion: The population of Fayette will increase. True, False, or Neither? Neither\n###\nThe 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament.\nQuestion: It was a warm day when Belmont won its first tournament championship in 2011. True, False, or Neither? Neither\n###\nThree Preludes is a ballet made for Mikhail Baryshnikov by Mark Morris to eponymous music by George Gershwin for his own company and presented as a piece d'occasion by the New York City Ballet. 
The performance took place June 16, 1992, at the New York State Theater, Lincoln Center.\nQuestion: The performance was held on the sixteenth day of the month of June True, False, or Neither? True\n###\nThe Coy C. Carpenter Library and Dorothy Carpenter Medical Archives, located at Wake Forest School of Medicine, is a library named after the first dean of the university's medical school, Coy Cornelius Carpenter, M.D., and his wife, Dorothy (Mitten) Carpenter.\nQuestion: The Coy C. Carpenter Library and Dorothy Carpenter Medical Archives is named after two men. True, False, or Neither?", "doc_id": 214, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4971, 38674, 6137, 1474], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Howe (October 14, 1754 \u2013 December 27, 1835) was a loyalist printer during the American Revolution, a printer and Postmaster in Halifax, the father of the famous Joseph Howe, a spy prior to the War of 1812, and eventually a Magistrate of the Colony of Nova Scotia. He was born in Boston, Massachusetts Bay colony, the son of Joseph Howe, a tin plate worker of Puritan ancestry, and Rebeccah Hart.\nQuestion: John Howe had no daughters True, False, or Neither? Neither\n###\nChristian Darcy Bisson (born August 14, 1989) is a Canadian professional baseball second baseman in minor league baseball organization of the San Diego Padres of Major League Baseball. Prior to beginning his professional career, he played college baseball at the University of Kentucky. Bisson has also competed for the Canadian national baseball team.\nQuestion: His family always knew he'd be big in baseball. True, False, or Neither? Neither\n###\nThe Argentine Grand Prix (Spanish: \"Gran Premio de Argentina\") was a round of the Formula One championship, held intermittently from to , all at the same autodrome in the Argentine national capital of Buenos Aires. Argentine president Juan Per\u00f3n was the driving force behind the creation of the circuit, after seeing the success of the country's own Juan Manuel Fangio.\nQuestion: The Argentine Grand Prix made up the whole Formula One Championship. True, False, or Neither? False\n###\nThe Secret Garden is the 1987 Hallmark Hall of Fame TV film adaptation of the novel \"The Secret Garden\", aired on CBS November 30, 1987 and produced by Rosemont Productions Limited, who also produced \"Back to the Secret Garden\". The film stars Barret Oliver, Jadrien Steele, Billie Whitelaw and Sir Derek Jacobi.\nQuestion: the film stars michael jordan True, False, or Neither? False\n###\n\"Makes No Difference\" is the first single by Canadian rock band Sum 41. It was released in June 2000 as the lead single from the band's extended play \"Half Hour of Power\". The song is featured on the soundtracks for \"Bring it On\", \"Out Cold\" and \"Van Wilder\". A new version of the song was featured on Sum 41's greatest hits compilation, \"All the Good Shit\".\nQuestion: Sum 41 is a band that comes from a country which has parts that speak both English and French. 
True, False, or Neither?", "doc_id": 37, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44725, 1486, 36218, 2079], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rudyard Kipling's The Jungle Book is a 1994 live-action American adventure film co-written and directed by Stephen Sommers, produced by Edward S. Feldman and Raju Patel, from a story by Ronald Yanover and Mark Geldman. It is the second film adaptation by The Walt Disney Company of the Mowgli stories from \"The Jungle Book\" and \"The Second Jungle Book\" by Rudyard Kipling.\nQuestion: The jungle book was written by Stephen Sommers. True, False, or Neither? False\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg.\nQuestion: Marie Hedwig Auguste of Sulzbach was only ever a Countess. True, False, or Neither? False\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games.\nQuestion: The Seattle SuperSonics didn't succeed against the Phoenix Suns in six games. True, False, or Neither? True\n###\nThe Circuit Gilles Villeneuve (also spelled Circuit Gilles-Villeneuve in French) is a motor racing circuit in Montreal, Quebec, Canada. It is the venue for the FIA Formula One Canadian Grand Prix. It has previously hosted the FIA World Sportscar Championship, the Champ Car World Series, the NASCAR Canadian Tire Series, the NASCAR Xfinity Series and the Grand-Am Rolex Sports Car Series.\nQuestion: Circuit Gilles Villeneuve is the only racing circuit in Montreal. True, False, or Neither? Neither\n###\nJos\u00e9 C. Vales (born 1965, Zamora) is a Spanish writer and translator of English literature. He studied in Salamanca and Madrid. He has translated numerous English and American authors into Spanish, including Dickens, Trollope, Austen, Wilkie Collins, Defoe, Mary Shelley, Arnold Bennett, Eudora Welty, Stella Gibbons, E.F. Benson, and Edmund Crispin.\nQuestion: Vales never met Dickens. True, False, or Neither?", "doc_id": 655, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38027, 25686, 4221, 21088], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Matthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season.\nQuestion: Mansfield played multiple positions. True, False, or Neither? Neither\n###\nThe United States Ambassador to Sweden (Swedish: \"USA:s ambassad\u00f6r i Sverige\" ) serves as the official diplomatic representative of the President and the Government of the United States of America to the King and the Government of the Kingdom of Sweden. Diplomatic relations between Sweden and the United States began with the signing of the Treaty of Amity and Commerce in 1783.\nQuestion: The United States Ambassador to Sweden is the diplomatic representative to the Prince of Sweden. True, False, or Neither? False\n###\nPanadol night is a GlaxoSmithKline painkiller intended for use at night. It consists of 500 milligrams of paracetamol, 25 milligrams of diphenhydramine hydrochloride (a sedating antihistamine) and other \"non-hazardous ingredients\" It is sold in Australia, Cyprus United Kingdom, Ireland, New Zealand and the Middle East. It became available as an over the counter medication in the UK in 1996.\nQuestion: Most painkillers are intended to be used in the evening. True, False, or Neither? Neither\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: Vaux said it was the best house he ever designed True, False, or Neither? Neither\n###\nMichelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California.\nQuestion: Michelle Do was born in California. True, False, or Neither?", "doc_id": 646, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42876, 40068, 44961, 13908], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bremen ( ) is a small town in Lincoln County, Maine, United States. The population was 806 at the 2010 census. Located on Muscongus Bay and the Gulf of Maine, it includes the villages of Broad Cove, Turners Corner, Bremen, Medomak and Muscongus. 
Hog Island is a center and camp for the Maine chapter of the National Audubon Society.\nQuestion: Bremen ( ) is a small town in Lincoln County, Maine, United States. It has many villages in it. True, False, or Neither? True\n###\nThe Agassiz Brewing Company was a Canadian brewing company, founded by former Fort Garry Brewing Company brewmaster Gary De Pape. The company was established in 1998 in Winnipeg, Manitoba and based there until 2010. It was named for the prehistoric glacial Lake Agassiz which once covered much of Manitoba. Agassiz beer was available in Manitoba, Saskatchewan, Ontario, and British Columbia.\nQuestion: the company was established in 2010 True, False, or Neither? False\n###\n3 Musketeers is a direct-to-video action film by The Asylum loosely based on \"The Three Musketeers\" by Alexandre Dumas. The film is directed by Cole McKay and is a mockbuster that was released shortly after the Paul W. S. Anderson film \"The Three Musketeers\". The film was released on DVD and Blu-ray disc on October 25, 2011.\nQuestion: 3 Musketeers was very popular in 2010 True, False, or Neither? False\n###\nBenny Bell (born Benjamin Samberg or Benjamin Zamberg, March 21, 1906 \u2013 July 6, 1999) was an American singer-songwriter who reached popularity in the 1940s, with a comeback in the 1970s. He is particularly remembered for his risqu\u00e9 but cheerfully optimistic songs.\nQuestion: Benny Bell's father was born on March 21, 1885. True, False, or Neither? Neither\n###\nHow Not to Die: Surprising Lessons on Living forever, Safer, and Healthier from America\u2019s Favorite Medical Examiner is a book about safe and healthy living written by Jan Garavaglia, aka \"Dr. G\", who is Chief Medical Examiner for the District Nine (Orange-Osceola) Medical Examiner's Office in Florida.\nQuestion: How Not to Die: Surprising Lessons on Living forever, Safer, and Healthier from America\u2019s Favorite Medical Examiner was written by Dr J True, False, or Neither?", "doc_id": 522, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37537, 15299, 20430, 26120], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season.\nQuestion: The ABA League Finals MVP award can be shiny True, False, or Neither? Neither\n###\nThe large intestine (Chinese: \u5927\u80a0/\u5927\u8178: pinyin: \"d\u00e0 ch\u00e1ng\") is one of the \"fu\" organs stipulated by traditional Chinese medicine (TCM). As distinct from the Western medical concept of large intestine, this concept from TCM is more a way of describing a set of interrelated parts than an anatomical organ. 
It is a functionally defined entity and not equivalent to the anatomical organ of the same name.\nQuestion: There are 3 \"fu\" organs stipulated by TCM. True, False, or Neither? Neither\n###\nThe 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau.\nQuestion: The 3rd Macau International Movie Festival is everyone's favorite version so far True, False, or Neither? Neither\n###\nSuper Show 6 - Super Junior World Tour Concert Album is Super Junior's sixth live recorded album, released on 6 November 2015. This album contains two CDs with 36 live recordings from the Super Show 6 concerts held on September 19\u201321, 2014 at the Olympic Gymnastics Arena located in Seoul, South Korea.\nQuestion: Super Show 6 - Super Junior World Tour Concert Album is an exciting album True, False, or Neither? Neither\n###\nHipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011.\nQuestion: Reddit founded hipmunk in 2010 True, False, or Neither?", "doc_id": 97, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41919, 34080, 35437, 32881], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Airline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya.\nQuestion: Kenya Airways is headquartered in Kenya True, False, or Neither? Neither\n###\nThe United States Ambassador to Sweden (Swedish: \"USA:s ambassad\u00f6r i Sverige\" ) serves as the official diplomatic representative of the President and the Government of the United States of America to the King and the Government of the Kingdom of Sweden. Diplomatic relations between Sweden and the United States began with the signing of the Treaty of Amity and Commerce in 1783.\nQuestion: Prior to 1783, there were no official diplomatic relations between the United States and Sweden. True, False, or Neither? True\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: Lloyd Cole was released by a band. True, False, or Neither? 
False\n###\nRAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family.\nQuestion: The station was renamed in 1928 True, False, or Neither? True\n###\nCity Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle.\nQuestion: City Mall has recently closed. True, False, or Neither?", "doc_id": 802, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31935, 11654, 13916, 32904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "For Those Who Think Young is a 1964 beach party film shot in Techniscope, directed by Leslie H. Martinson and featuring James Darren, Pamela Tiffin, Paul Lynde, Tina Louise, Bob Denver, Nancy Sinatra, Robert Middleton, Ellen Burstyn (billed as Ellen McRae), Claudia Martin and Woody Woodbury.\nQuestion: For Those Who Think Young was a film directed by James Darren. True, False, or Neither? False\n###\nThe Mercantil Tower (also known as the Mercantil Building) is a skyscraper located in the Venezuelan city of Caracas, is known for being the fourth tallest tower in the city and the country with 179 m in height and 40 floors, is located at Avenida Andres Bello, Candelaria Parish of Libertador municipality northwest of the capital.\nQuestion: The Mercantil Tower is 180m in height. True, False, or Neither? False\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o breathes air. True, False, or Neither? True\n###\nCarol Goodman, also known under the pseudonym Juliet Dark, is an American professor and author of gothic fiction. She has also written under the pseudonym Lee Carroll with her husband Lee Slominsky. Goodman currently serves as a creative writing professor at the State University of New York at New Paltz.\nQuestion: Carol Goodman has written under 2 pseudonyms. True, False, or Neither? True\n###\nMorgan\u2019s Wonderland is a purpose-built 25-acre theme park in San Antonio, Texas for individuals with special needs. The park, which opened in spring 2010 on the site of the former Longhorn Quarry, was developed by Gordon Hartman, a former homebuilder from San Antonio. 
He said his daughter, Morgan, who deals with cognitive and physical challenges, inspired creation of the park.\nQuestion: Morgan's Wonderland was closed down recently. True, False, or Neither?", "doc_id": 22, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26031, 13018, 17592, 42964], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\").\nQuestion: Sanacja was created at some point after the final guns blasted for the end of the Great War. True, False, or Neither? True\n###\nThe Sound and the Fury is an American drama film directed by James Franco. It is the second film version of the novel of the same name by William Faulkner. The previous adaptation, directed by Martin Ritt, was released in 1959. The film was released in a limited release and through video on demand on October 23, 2015, by New Films International.\nQuestion: The Sound and the Fury was James Franco's favorite film True, False, or Neither? Neither\n###\nThe Ravenswood City School District is a public school district headquartered in East Palo Alto, California, US. The district, in the San Francisco Bay Area, serves the communities of East Palo Alto and eastern Menlo Park. Students from this school district who continue on with public schooling matriculate to the Sequoia Union High School District. In 2008-09 it served over 4,500 students.\nQuestion: The Ravenswood City School District serves the communities of Orange Park and Hollywood True, False, or Neither? False\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films.\nQuestion: She is a well known porn actress. True, False, or Neither? True\n###\nJulian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality.\nQuestion: Julian was born before Lucy Pounder. True, False, or Neither?", "doc_id": 837, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38180, 38355, 78, 42292], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: Amy Timberlake feels sleepy when she gets up True, False, or Neither? Neither\n###\nAlexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia.\nQuestion: The Prosecutor General's Office is located within Moscow's city limits. True, False, or Neither? Neither\n###\n\"Trap Queen\" is the debut single by American rapper Fetty Wap from his self-titled debut album (2015). Following its online premiere in March 2014, it was released independently on April 22, 2014 before being re-released in conjunction with 300 Entertainment on December 15, 2014. The song was well received by critics who praised the vocals and production for being cheery and infectious.\nQuestion: The song was enjoyed by critics. True, False, or Neither? True\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi.\nQuestion: NASA John H. Glenn Research Center at Lewis Field is a NASA center located between the cities of Brook Park and Cleveland. True, False, or Neither? False\n###\nMajid (born 1975) is a Danish rapper of Moroccan-Berber origin. Residing in Aved\u00f8re near Copenhagen, Denmark he was a contributor to Danish act Outlandish, which also hails from Br\u00f8ndby Strand. Majid contributed to their tours and performed as a special guest in the warm-up for their acts.\nQuestion: Majid was not born in Denmark. True, False, or Neither?", "doc_id": 277, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6780, 31254, 19827, 27905], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. 
Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners.\nQuestion: Satya and Dil Se won at the 44th filmfare awards. True, False, or Neither? True\n###\nTOTO is a legalized form of lottery sold in Singapore, known by different names elsewhere. Singapore Pools is the only legal lottery operator in Singapore. It was established on 23 May 1968 to control widespread illegal gambling in Singapore during the 1960s.\nQuestion: Singapore had a illegal gambling problem in the 1960s True, False, or Neither? True\n###\nRoderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins.\nQuestion: Roderick Dwayne \"Rod\" Higgins was born in 1966 True, False, or Neither? False\n###\nGuns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin).\nQuestion: Guns of Diablo is a black and white film True, False, or Neither? False\n###\nDatong () is a prefecture-level city in northern Shanxi province, People's Republic of China, located in a basin at an elevation of 1040 m and bordering Inner Mongolia to the north and west and Hebei to the east. It had a population of 3,318,057 at the 2010 census of whom 1,629,035 lived in the built up area made of 3 out of 4 urban districts, namely Chengqu, Kuangqu and Nanjiao District.\nQuestion: More than 40 percent of the residents of Datong live in Urban districts True, False, or Neither?", "doc_id": 262, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32463, 26455, 7456, 42499], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Douglas Eric \"Doug\" Liman (born July 24, 1965) is an American film director and producer best known for \"Swingers\" (1996), \"Go\" (1999), \"The Bourne Identity\" (2002), \"Mr. & Mrs. Smith\" (2005), \"Jumper\" (2008), \"Fair Game\" (2010), and \"Edge of Tomorrow\" (2014).\nQuestion: Douglas Eric Liman has wanted to be a director and producer since childhood. True, False, or Neither? Neither\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: Ahmad Kemal Idris is from France True, False, or Neither? False\n###\nClub Deportivo Utiel is a football team based in Utiel in the autonomous community of Valencian Community. Founded in 1945, the team plays in Tercera Divisi\u00f3n \u2013 Group 6. 
The club's home ground is \"La Celadilla\", which has a capacity of 1,500 spectators.\nQuestion: Club Deportivo Utiel sells out every game True, False, or Neither? Neither\n###\nHungry for You is a 1996 American thriller and science fiction film directed by Dimitri Logothetis and produced by Gary Hudson. This film has been music composed by Joel Hirschhorn.The film starring Michael Phenicie, Rochelle Swanson, Gary Wood, Nancy Hochman and Ritchie Montgomery in the lead roles.\nQuestion: Movies are sometimes directed by and produced by different people. True, False, or Neither? True\n###\nJohn Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s.\nQuestion: There are currently 1890 Catholic cardinals. True, False, or Neither?", "doc_id": 793, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23711, 9614, 12729, 40200], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "David Thomas Bush (born November 9, 1979) is an American former professional baseball pitcher. He played in Major League Baseball (MLB) for the Toronto Blue Jays, Milwaukee Brewers, Texas Rangers, and Philadelphia Phillies. Bush also played for the SK Wyverns of the KBO League.\nQuestion: David Thomas Bush is still alive. True, False, or Neither? Neither\n###\nKali Michele Rocha (born December 5, 1971) is an American actress. She is known for portraying Karen Rooney, the mother of four Rooney children and school's vice principal, in the Disney Channel sitcom, \"Liv and Maddie\". She has also co-written four episodes of the show.\nQuestion: Kali Michele Rocha is an adult film star. True, False, or Neither? False\n###\nBallads of Sacco & Vanzetti is a set of ballad songs, written and performed by Woody Guthrie, related to the trial, conviction and execution of Sacco and Vanzetti. The series was commissioned by Moe Asch in 1945 and recorded in 1946 and 1947. Guthrie never completed the project and was unsatisfied by the result. The project was released later in its abandoned form by Asch.\nQuestion: Ballads of Sacco & Vanzetti is a set of rap songs True, False, or Neither? False\n###\n\"Sun Goes Down\" is a song by German DJ and record producer Robin Schulz. It features the vocals from British singer Jasmine Thompson. The song was released in Germany as a digital download on 24 October 2014. The song peaked at number two on the German Singles Chart.\nQuestion: Robin Schulz was born in Germany. True, False, or Neither? Neither\n###\nJames Hagan (21 January 1918 \u2013 26 February 1998), known as Jimmy Hagan, was an English football player and manager born in Washington, County Durham, England. He played between 1938 and 1958 for Sheffield United and once for England. As manager he had his greatest successes with S.L. Benfica in the early 1970s.\nQuestion: Jimmy Hagan played for Sheffield United for 30 years. 
True, False, or Neither?", "doc_id": 162, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11781, 38227, 29666, 15672], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cecilia Makiwane Hospital (CMH) is a large, provincial, government funded hospital situated in the Mdantsane township of East London, Eastern Cape in South Africa. It is a tertiary teaching hospital and forms part of the East London Hospital Complex with Frere Hospital. It is named after Cecilia Makiwane, the first African woman to become a professional nurse in South Africa.\nQuestion: There were no African women who became professional nurses before Cecilia. True, False, or Neither? True\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits.\nQuestion: Deep Purple released 4 songs in the year nineteen hundred eighty six. True, False, or Neither? Neither\n###\nAnalyze This is a 1999 gangster comedy film directed by Harold Ramis, who co-wrote the screenplay with playwright Kenneth Lonergan and Peter Tolan. The film stars Robert De Niro as a mafioso and Billy Crystal as his psychiatrist. A sequel, \"Analyze That\", was released in 2002.\nQuestion: Analyze This was seen by Homer. True, False, or Neither? Neither\n###\nSabrina Le Beauf (born March 21, 1958) is an American actress best known for her portrayal of Sondra Huxtable on the NBC situation comedy \"The Cosby Show\". She has voiced the character Norma Bindlebeep on the Nick at Nite animated series \"Fatherhood\", a show based on Bill Cosby's book of the same name.\nQuestion: Norma Bindlebeep had one episode where she pretended to be Sondra Huxtable. True, False, or Neither? Neither\n###\nJohn Howe (October 14, 1754 \u2013 December 27, 1835) was a loyalist printer during the American Revolution, a printer and Postmaster in Halifax, the father of the famous Joseph Howe, a spy prior to the War of 1812, and eventually a Magistrate of the Colony of Nova Scotia. He was born in Boston, Massachusetts Bay colony, the son of Joseph Howe, a tin plate worker of Puritan ancestry, and Rebeccah Hart.\nQuestion: Joseph Howe was born to a farming family in Boston. True, False, or Neither?", "doc_id": 764, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26973, 29029, 24144, 7750], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Beastie Boys were an American hip hop group from New York City, formed in 1981. 
For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar).\nQuestion: Beastie Boys were a great American hip hop group who got paid a lot True, False, or Neither? Neither\n###\nThe R-33 (Russian: \u0412\u044b\u043c\u043f\u0435\u043b \u0420-33 , NATO reporting name: AA-9 Amos) is a long-range air-to-air missile developed by the Vympel. It is the primary armament of the MiG-31 interceptor, intended to attack large high-speed targets such as the SR-71 Blackbird, the B-1 Lancer bomber, and the B-52 Stratofortress.\nQuestion: \u0412\u044b\u043c\u043f\u0435\u043b and Vympel are the same word. True, False, or Neither? True\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy.\nQuestion: The team hopes to make it into the playoffs the following season. True, False, or Neither? Neither\n###\nJuan Rivera is an American singer and actor of Mexican heritage. He is part of one of the most prominent families, leading in regional Mexican music in the United States. His family includes singers, Jenni Rivera, Lupillo Rivera, Chiquis Rivera, and businesswoman Rosie Rivera.. His songs \"El Ser Equivocado\" and \" La Lampara\" ranked on the Billboard Latin charts.\nQuestion: Juan Rivera was in a band with his family True, False, or Neither? Neither\n###\nTodd Strauss-Schulson (born June 24, 1980) is an American film director, screenwriter, producer, editor, and cinematographer, best known for directing the 2011 comedy film \"A Very Harold & Kumar 3D Christmas\", and the 2015 horror comedy film \"The Final Girls\". He has also directed episodes of the television series \"The Inbetweeners\" (2012) and \"Zach Stone Is Gonna Be Famous\" (2013).\nQuestion: Todd Strauss-Schulson is retired True, False, or Neither?", "doc_id": 885, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12272, 13516, 13322, 32089], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Joel Rueben Madden (born Joel Rueben Combs; March 11, 1979) is the lead vocalist for the American pop punk band Good Charlotte, as well as a record producer, actor, DJ, and UNICEF Goodwill Ambassador. He is also part of the pop rock collaboration The Madden Brothers with his twin brother Benji Madden.\nQuestion: Benji Madden's parents have one son True, False, or Neither? False\n###\nThe Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title.\nQuestion: Paul McNamee and Paul Kronk were good friends True, False, or Neither? 
Neither\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX.\nQuestion: High Noon Toons is an animated series True, False, or Neither? True\n###\nElmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County.\nQuestion: The county the city is in starts with a C True, False, or Neither? True\n###\nThe Middlewich Folk and Boat Festival takes place in June in Middlewich, Cheshire, England. The festival builds on the town's industrial heritage in which canal boats were used to move coal and other raw materials in the town for the production of salt, and then move the salt out of town, either for use directly, or as a raw material in the manufacture of chemicals such as chlorine and soda ash.\nQuestion: The Middlewich Folk and Boat Festival happens in the sixth month of the year. True, False, or Neither?", "doc_id": 908, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1215, 5270, 7949, 44063], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harold Buttleman, Daredevil Stuntman (also known as Buttleman) is a 2003 film written and directed by Francis Stokes; the only movie he has directed. It won the Jury Prize at the Deep Ellum Film Festival in 2003. It was awarded the audience award in the Had to Be Made Film Festival in 2005.\nQuestion: Daredevil Stuntman was awarded the audience award in the Had to Be Made Film Festival in 2005. True, False, or Neither? True\n###\nEugene Gearty is an American sound engineer. He was nominated for an Academy Award in the category Best Sound for the film \"Gangs of New York\". He has worked on over 80 films since 1983. At the 84th Academy Awards, Gearty won an Oscar for Best Sound Editing for his work on Martin Scorsese's \"Hugo\". He also won Emmy Award for Boardwalk Empire.\nQuestion: Eugene Gearty work with scorsesse True, False, or Neither? True\n###\nJack Tate is a Republican legislator in the U.S. State of Colorado. He represents Senate District 27 in the Denver Metro Area, which encompasses parts of unincorporated Arapahoe County, the City of Centennial, and the town of Foxfield. He serves on the Senate Local Government, the Senate Business, Labor & Technology, and Joint Technology committees.\nQuestion: Jack Tate supports conservative viewpoints. True, False, or Neither? Neither\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. 
In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Tinker Field is being rebuilt True, False, or Neither? Neither\n###\nBoleslav William Felix Robert Sienkiewicz ( ; born May 3, 1958), better known as Bill Sienkiewicz, is an Eisner Award-winning American artist who produces comic book art, primarily for Marvel Comics' \"The New Mutants\" and \"\". Sienkiewicz often utilizes oil painting, collage, mimeograph, and other forms generally uncommon in comic books.\nQuestion: Boleslav William Felix Robert Sienkiewicz was born the month before June True, False, or Neither?", "doc_id": 641, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6261, 1938, 8123, 25808], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Serial Killers Ink is a website dedicated to selling \"murderabilia\" (collectibles related to murders, murderers or other violent crimes) and serial killer art, interviewing convicted serial killers and also serves as a meeting place for those interested or involved in the murderabilia industry.\nQuestion: Murderabilia is not different from collectibles related to murders, murderers or other violent crimes. True, False, or Neither? True\n###\nHakea gibbosa, commonly known as hairy hakea or rock hakea, is a shrub of the family Proteaceae native to southeastern Australia. It has become an environmental weed in South Africa and New Zealand, where it had been introduced for use as a hedge plant.\nQuestion: The threat that Hakea gibbosa poses to South Africa and New Zealand will keep increasing. True, False, or Neither? Neither\n###\nThe Strangers is an American country band best known as the back-up band for singer-songwriter Merle Haggard. Formed in 1965 in Bakersfield, California, United States, the band continued to tour with original co-founding member Norman Hamlet, as well as Haggard's children Dana and Ben.\nQuestion: The Strangers band toured for a while after 1965 True, False, or Neither? True\n###\nFrank Vincent Ferrante (born April 26, 1963) is an American stage actor, comedian and director known for his stage portrayals of legendary American comedian Groucho Marx in the Arthur Marx/Robert Fisher play \"\" and in \"An Evening With Groucho\", which tours internationally.\nQuestion: Since childhood, the male subject of this context longed to make people laugh. True, False, or Neither? Neither\n###\nBarry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions.\nQuestion: Barry and Stuart are British funnymen who have performed their work around the world. 
True, False, or Neither?", "doc_id": 715, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10615, 10180, 30601, 23439], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Battle of Rio de Janeiro was a battle in 1558 on the French town at Rio de Janeiro, called Henriville. The Portuguese, though in far smaller numbers, defeated the French and made them flee to the jungle. The French town was then burnt by Mem de S\u00e1, the Portuguese governor.\nQuestion: Hennville is where a fight took place between the French and Portuguese True, False, or Neither? True\n###\nBellevue ( ) is a city in the Eastside region of King County, Washington, United States, across Lake Washington from Seattle. As the third-largest city in the Seattle metropolitan area, Bellevue has variously been characterized as an edge city, a boomburb, or satellite city. The population was 141,400 in a 2016 census estimate.\nQuestion: bellevue is home to the popular music festival brochella True, False, or Neither? Neither\n###\nHannah Kate Whelan (born 1 July 1992) is a retired British artistic gymnast who competed at the 2008 Summer Olympics and the 2012 Summer Olympics. Whelan won three European Championships medals and four British senior national titles, and was the bronze medallist in the all-around at the 2014 Commonwealth Games.\nQuestion: Hannah Kate Whelan was born on 1 July 1992. True, False, or Neither? True\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system.\nQuestion: ECSU is quite diverse in its population True, False, or Neither? Neither\n###\nThe Wolfsonian\u2013Florida International University or The Wolfsonian-FIU, located in the heart of the Art Deco District of Miami Beach, Florida, is a museum, library and research center that uses its collection to illustrate the persuasive power of art and design. For fifteen years, The Wolfsonian has been a division within Florida International University.\nQuestion: The Wolfsonian\u2013Florida International University is a school True, False, or Neither?", "doc_id": 228, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [769, 44102, 42238, 20959], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Loudest Engine is the third studio album of London-based Australian indie rock band Howling Bells. 
The album was released through Cooking Vinyl on 9\u00a0September 2011 to mostly positive reviews. It was produced by Mark Stoermer and recorded at Battle Born Studios, in Las Vegas from September to October 2010.\nQuestion: The Loudest Engine had exactly 17 negative reviews. True, False, or Neither? Neither\n###\nThe William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places.\nQuestion: The William Martin Armistead House wasn't considered Historic until 2009. True, False, or Neither? Neither\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden.\nQuestion: The Kyrkog\u00e5rden Runestone is a runestone located in Sweden. True, False, or Neither? True\n###\nPaul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas.\nQuestion: Paul Albert Raymond Barlatier de Mas was born in october True, False, or Neither? True\n###\nThe 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The 315th Group controls all operational McDonnell Douglas C-17 Globemaster III flying squadrons of the 315th Airlift Wing. It was activated in 1992, when Air Force Reserve Command implemented the Objective Wing organization.\nQuestion: The 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The capital of South Carolina is Detroit. True, False, or Neither?", "doc_id": 718, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2979, 27552, 13420, 26914], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1967 Senior League World Series took place from August 14\u201317 in Des Moines, Iowa, United States. Westbury, New York defeated West Des Moines, Iowa in the championship game. It was New York's second straight championship. This was the final SLWS held in Des Moines.\nQuestion: It was the final SLWS held in Des Moines. True, False, or Neither? True\n###\nThe 2017 Macanese general election took place on 17 September 2017 according to the provisions of the Basic Law of Macau. 
Out of a total of 33 seats, 14 were directly elected by universal suffrage under the highest averages method, while 12 were voted on from the Functional constituency, and 7 from nomination by the Chief Executive.\nQuestion: The 2017 Macanese general election had more voters than in 2016 True, False, or Neither? Neither\n###\nMariner Books, a division of Houghton Mifflin Harcourt, was established in 1997 as a publisher of fiction, non-fiction, and poetry in paperback. Mariner is also the publisher of the Harvest imprint backlist, formerly published by Harcourt Brace/Harcourt Brace Jovanovich.\nQuestion: Tens of thousands of books have been published by Mariner Books. True, False, or Neither? Neither\n###\nBeilin District () is one of nine districts of Xi'an, the capital of Shanxi province, China. The well-known Small Wild Goose Pagoda is also located in the district. The smallest, but most densely populated, of Xi'an's county-level divisions, it borders the districts of Xincheng to the northeast, Yanta to the south, and Lianhu to the northwest.\nQuestion: Beilin District has some dogs living in it True, False, or Neither? Neither\n###\nThe Pulitzer Prize for Photography was one of the American Pulitzer Prizes annually awarded for journalism. It was inaugurated in 1942 and replaced by two photojournalism prizes in 1968: the Pulitzer Prize for Feature Photography and \"Pulitzer Prize for Spot News Photography\". The latter was renamed for Pulitzer Prize for Breaking News Photography in 2000.\nQuestion: The Pulitzer Prize for Photography became 2 separate awards in the year 2000. True, False, or Neither?", "doc_id": 261, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28829, 26799, 39349, 16244], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015.\nQuestion: the winner of the 211 red bull manny mania amateur u.s. championship is sebo walker True, False, or Neither? True\n###\nAustin Bush McHenry (September 22, 1894 \u2013 November 27, 1922) was a professional baseball player who played outfielder in the Major Leagues from 1918 to 1922 for the St. Louis Cardinals. Before his major-league promotion, he spent three seasons with the Milwaukee Brewers of the American Association. His best season in the major leagues came in 1921, when he hit .350.\nQuestion: Austin Bush McHenry is a quick person. True, False, or Neither? Neither\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Broadway Rose has a short opening. True, False, or Neither? 
Neither\n###\nRichard Church Thompson (October 8, 1957 \u2013 July 27, 2016) was an American illustrator and cartoonist best known for his syndicated comic strip \"Cul de Sac\" and the illustrated poem \"Make the Pie Higher\". He was given the Reuben Award for Outstanding Cartoonist of the Year for 2010.\nQuestion: Richard Church Thompson never drew anything. True, False, or Neither? False\n###\nThe Cit\u00e9 du Cin\u00e9ma is a film studio complex supported by the film director and producer Luc Besson, located in Saint-Denis, north of Paris. The studio complex is intended to be a competitor of Cinecitt\u00e0 in Rome, Pinewood in London and Babelsberg in Berlin. It was inaugurated on 21 September 2012.\nQuestion: The studio complex was inaugurated less than 1000 days ago. True, False, or Neither?", "doc_id": 835, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19803, 12825, 41812, 28003], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Malloreon is a five-part fantasy book series written by David Eddings, which follows \"The Belgariad\". The Malloreon is set in the same world as The Belgariad, but expands on several aspects of the setting, especially the eastern continent of Mallorea.\nQuestion: The Malloreon is David Eddings' best selling book series. True, False, or Neither? Neither\n###\nMy Little Pony: The Movie is a 1986 American animated musical fantasy film based on the Hasbro toy line, My Little Pony. Theatrically released on June 20, 1986 by De Laurentiis Entertainment Group, the film features the voices of Danny DeVito, Madeline Kahn, Cloris Leachman, Rhea Perlman and Tony Randall.\nQuestion: Hasbro's My Little Pony toys had a movie made based on them. True, False, or Neither? True\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle.\nQuestion: Boon Brewery has sold millions. True, False, or Neither? Neither\n###\nThe Underground Man (1997) is a novel by Mick Jackson. Critically acclaimed, it was shortlisted for the Booker Prize for that year. It shows the life of an eccentric and reclusive Victorian Duke, loosely modelled on William Cavendish-Scott-Bentinck, 5th Duke of Portland. His latest scheme involves building a set of tunnels beneath his estate.\nQuestion: Mick Jackson started writing at the age of 10. True, False, or Neither? Neither\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949.\nQuestion: It took CUHK over a decade to get their charter from the Legislative Council of Hong Kong. 
True, False, or Neither?", "doc_id": 820, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14212, 11318, 39607, 39620], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station.\nQuestion: Grantham North Services has been seen by Brady. True, False, or Neither? Neither\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy.\nQuestion: Daniel Murphy was born February 24, 1996. True, False, or Neither? Neither\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: JCPenny is the most popular anchor in staughton mall. True, False, or Neither? Neither\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro.\nQuestion: Massaro died in 1703. True, False, or Neither? False\n###\nNomindsland is the debut album by the industrial metal band Red Harvest, released through Black Mark Production in 1992. It is notable for being their only release that could be described as thrash metal, before the band moved towards industrial metal.\nQuestion: Nomindsland is not industrial metal. True, False, or Neither?", "doc_id": 273, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13518, 44740, 20768, 2888], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. 
Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title.\nQuestion: Paul McNamee and Paul Kronk reputation as players improved after winning the double True, False, or Neither? Neither\n###\nSqueezing Out Sparks is the fourth studio album by English musician Graham Parker and his band the Rumour. It was voted album of the year in the 1979 \"Village Voice\" Pazz & Jop Critics Poll and later ranked number 334 on \"Rolling Stone\" magazine's list of the 500 greatest albums of all time. Although the Rumour were not credited on the cover, their name was included on the album label.\nQuestion: Squeezing Out Sparks by Rumour was ranked number 444 on the Billboard list of 500 greatest albums of all time. True, False, or Neither? False\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels.\nQuestion: Louis Glenn Marson played professional baseball for two MLB clubs. True, False, or Neither? True\n###\nThe International University of Rabat or IUR is a semi-public university founded in 2010 in Morocco. It delivers double-degrees, in collaboration with foreign universities, in law, engineering, aeronautics, energy engineering, architecture, business management and political sciences.\nQuestion: Poli Sci is offered as a Major at a university somewhere in Africa. True, False, or Neither? True\n###\nGun Bow (1960 \u2013 December 1979) was an American Thoroughbred racehorse. He was one of America's leading older male racehorses in 1964 and 1965 and was later inducted into the Hall of Fame. Gun Bow was noted for his rivalry with five-time American Horse of the Year Kelso.\nQuestion: Gun Bow lived to be 19 years old in human years. True, False, or Neither?", "doc_id": 975, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19046, 32616, 45118, 24383], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Species III is a 2004 science fiction thriller television film. The film, directed by Brad Turner, is the third installment of the \"Species\" series, and stars Robin Dunne, Robert Knepper, Sunny Mabrey, Amelia Cooke and John Paul Pitoc. Natasha Henstridge, who was contracted to a trilogy commencing with the first \"Species\" film, briefly reprises the role of Eve in the opening scene.\nQuestion: The film was not delayed to 2006. True, False, or Neither? True\n###\nDr. Jeckyll & Mr. Hyde was an American 1980s hip-hop group consisting of Andre \"Dr. Jeckyll\" Harrell and Alonzo \"Mr. Hyde\" Brown. The group was known for its corporate business image, wearing designer suits and ties while they rapped. The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records in 1980.\nQuestion: Harlem World Crew wasn't a good fit for the band True, False, or Neither? 
True\n###\n\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode.\nQuestion: Kristen Bell was in the pilot for Heroes True, False, or Neither? False\n###\nThe Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue.\nQuestion: The Sky Bar roof top patio and the Drake Underground basement nightclub are located at opposite ends of the building when considering the vertical axis. True, False, or Neither? True\n###\nThe Albany Great Danes men's lacrosse team represents the University at Albany in NCAA Division I men's college lacrosse. Albany currently competes in the America East Conference and plays its home games on John Fallon Field. The team has reached the NCAA Men's Lacrosse Championship tournament nine times. The Great Danes are currently coached by Scott Marr.\nQuestion: The Great Danes have beaten other lacrosse teams many times. True, False, or Neither?", "doc_id": 109, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22611, 36740, 8072, 31783], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Allen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: Allen S. Weiner used to work at Stanford. True, False, or Neither? True\n###\nThe McLaren MP4/1 (initially known as the MP4) was a Formula One racing car produced by the McLaren team. It was used during the 1981, 1982 and 1983 seasons. It was the first Formula One car to use a carbon fibre composite monocoque, a concept which is now ubiquitous.\nQuestion: The McLaren team were not proud of the McLaren MP4/1 as it lost races True, False, or Neither? Neither\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: The Home Depot first opened in a small town True, False, or Neither? Neither\n###\nBianca Gascoigne (born 28 October 1986) is a British glamour model and television personality. She is the daughter of Sheryl Gascoigne, and adopted daughter of Paul Gascoigne, a former footballer. She has a brother Mason and a half-brother Regan Gascoigne. 
She came sixth in the nineteenth series of Channel 5 reality show \"Celebrity Big Brother\".\nQuestion: Bianca Gascoigne came 1st in Celebrity Big Brother True, False, or Neither? False\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi.\nQuestion: NASA John H. Glenn Research Center is located near a large river. True, False, or Neither?", "doc_id": 472, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [946, 24728, 37846, 12285], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marvin Ivan \"Buck\" Barrow (March 14, 1903 \u2013 July 29, 1933) was a member of the Barrow Gang. He was the older brother of the gang's leader, Clyde Barrow. He and his wife Blanche were wounded in a gun battle with police four months after they joined up with Bonnie and Clyde. Marvin died of his wounds.\nQuestion: marvin buck was born in a wealthy family True, False, or Neither? Neither\n###\nTwo Men And A Truck is a franchised moving company, headquartered in Lansing, Michigan, with franchises in 41 U.S. states, as well as the United Kingdom, Canada, and Ireland. The company is the largest franchised moving company in the United States with more than 410 locations.\nQuestion: Two Men And A Truck owns at least one truck. True, False, or Neither? True\n###\nWilliston Municipal Airport (FAA LID: X60) is a city-owned, public-use airport located two\u00a0nautical miles (4\u00a0km) southwest of the central business district of Williston, a city in Levy County, Florida, United States. Commonly referred to as Williston Airport, it is located 23 mi southwest of Gainesville Regional Airport (GNV). Opened in 1974 for public use, it does not have a control tower.\nQuestion: Control towers are pleasant to look at. True, False, or Neither? Neither\n###\nThe Tesla Science Center at Wardenclyffe is a nonprofit organization established to develop a regional science and technology center at the site of Nikola Tesla's former Wardenclyffe laboratory on Long Island, New York. The center raised money through crowdfunding to purchase the property.\nQuestion: Nikola Tesla's former Wardenclyffe laboratory was the first laboratory in New York State True, False, or Neither? Neither\n###\nAlbert Woolley (26 September 1902 \u2013 5 January 1978) was an English cricketer active in 1926 who played for Lancashire. He was born in Salford and died in Doncaster. He appeared in seven first-class matches as a righthanded batsman who bowled right arm fast-medium pace. He scored 61 runs with a highest score of 24 and held nine catches. He took eleven wickets with a best analysis of four for 56.\nQuestion: Lancashire scored 26 runs in 1978. 
True, False, or Neither?", "doc_id": 317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34564, 44139, 34246, 3629], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sh\u0101h Mahm\u016bd Hotak, (Pashto, Dari: ), also known as Sh\u0101h Mahm\u016bd Ghilj\u012b (Pashto: \u0634\u0627\u0647 \u0645\u062d\u0645\u0648\u062f \u063a\u0644\u062c\u064a\u200e ) (lived 1697 \u2013 April 22, 1725), was an Afghan ruler of the Hotak dynasty who overthrew the heavily declined Safavid dynasty to briefly become the king of Persia from 1722 until his death in 1725.\nQuestion: Sh\u0101h Mahm\u016bd Hotak was born before 1700. True, False, or Neither? True\n###\nFoaly is a fictional character in the Artemis Fowl series written by Eoin Colfer. He is the technical consultant to the Lower Elements Police (LEP). He is the most intelligent centaur on and under the Earth, considers himself to be an unappreciated genius, and is the inventor of most of the advanced technology possessed by the fairy world, rivaled only by Opal Koboi.\nQuestion: LEP is based on a real task force True, False, or Neither? Neither\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships.\nQuestion: Angus Scott was born in Scotland True, False, or Neither? Neither\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898.\nQuestion: Grimsby Town Football Club changed its name due to a copyright issue True, False, or Neither? Neither\n###\nDaraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\".\nQuestion: Arbaaz Khan beat another actor for the Filmfare Best Villain Award for his debut film. True, False, or Neither?", "doc_id": 581, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22542, 44756, 15132, 5437], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "On 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The accused would never harm a dog-owner. True, False, or Neither? False\n###\nFast-moving consumer goods (FMCG) or consumer packaged goods (CPG) are products that are sold quickly and at relatively low cost. Examples include non-durable goods such as packaged foods, beverages, toiletries, over-the-counter drugs and many other consumables. In contrast, durable goods or major appliances such as kitchen appliances are generally replaced over a period of several years.\nQuestion: one example is a skateboard. True, False, or Neither? False\n###\nKlagenfurt am W\u00f6rthersee (] ; Slovene: \"Celovec ob Vrbskem jezeru\" , Italian: \"Clanforte\" , Friulian: \"Clanfurt\" ) is the capital of the federal state of Carinthia in Austria. With a population of 99,100, it is the sixth-largest city in the country. The city is the bishop's seat of the Roman Catholic Diocese of Gurk-Klagenfurt and home to the Alpen-Adria-Universit\u00e4t Klagenfurt.\nQuestion: Klagenfurt am Worthersee will eventually become the 8th largest city True, False, or Neither? Neither\n###\nStanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh.\nQuestion: Stanley Anthony Woods played football at a Pittsburgh college True, False, or Neither? True\n###\nRipponlea is an inner suburb of Melbourne, Victoria, Australia, named after the adjoining Rippon Lea Estate. It is 7\u00a0km south east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, Ripponlea had a population of 1,478.\nQuestion: 8 years ago, Ripponlea had a population of about seventeen hundred True, False, or Neither?", "doc_id": 360, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4342, 14804, 43060, 4919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Probert-Price Collection is a collection of items from the Probert-Price estate, primarily hundreds of vintage dresses which belonged to Renee Probert-Price, original It girl and well-known London socialite of her time (1917-2013). 
Renee died in 2013 aged 96, and left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great niece and goddaughter.\nQuestion: Renee's heirs got a lot of items after her passing. True, False, or Neither? True\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation.\nQuestion: Paul Hausser became a good person near the end of his life True, False, or Neither? Neither\n###\nJames Brandon (born 20 September 1980) is a British journalist, who was kidnapped in Iraq during 2004 while on assignment from the \"Sunday Telegraph\" and \"The Scotsman\", covering the occupation and insurgency. He was kidnapped by the Mahdi Army militia and was released after one day.\nQuestion: The Mahdi Army militia was part of the insurgency. True, False, or Neither? True\n###\nMike Hoffman (born September 20, 1980) is an American former professional ice hockey player. After leaving the University of Connecticut in 2003, he began his first pro season playing with the Worcester IceCats in the AHL and the Peoria Rivermen of the ECHL. He signed a professional contract with the Toronto Maple Leafs in 2005, but he has never played in the National Hockey League.\nQuestion: Mike Hoffman is a Spanish ice hockey player. True, False, or Neither? False\n###\nThe Samsung Galaxy Tab 7.7 is a tablet computer of a series of Android-based tablet computer produced by Samsung, introduced on 1 September 2011 at IFA in Berlin. Related models are the Galaxy Tab 7.0 Plus, Samsung Galaxy Tab 2 7.0, and Samsung Galaxy Tab 3 7.0.\nQuestion: The Samsung Galaxy 7.7 came out on the market following the release of related models Galaxy Tab 7.0 Plus, Samsung Galaxy Tab 2 7.0, and Samsung Galaxy Tab 3 7.0, all Android-based tablets. Samsung will introduce a Galaxy Tab in Berlin on 1 September, 2111. True, False, or Neither?", "doc_id": 480, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38715, 25282, 5974, 9280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christian Darcy Bisson (born August 14, 1989) is a Canadian professional baseball second baseman in minor league baseball organization of the San Diego Padres of Major League Baseball. Prior to beginning his professional career, he played college baseball at the University of Kentucky. Bisson has also competed for the Canadian national baseball team.\nQuestion: Bisson has fans all over the globe. True, False, or Neither? Neither\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons.\nQuestion: The Grand Prix des Fronti\u00e8res was held at a street circuit True, False, or Neither? True\n###\nThe Ghost & Mrs. 
Muir is an American sitcom based on the 1947 film of the same name, which was based on the 1945 novel by R. A. Dick. It premiered in September 1968 on NBC. After NBC canceled the series, it aired on ABC for one season before being canceled a final time. The program is currently seen weekday mornings on the digital subchannel \"GetTV.\"\nQuestion: The Ghost & Mrs. Muir movie was a failure True, False, or Neither? Neither\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts.\nQuestion: Phacelia mutabilis plants are everywhere in the western states True, False, or Neither? Neither\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song.\nQuestion: The cars had three albums. True, False, or Neither?", "doc_id": 295, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42740, 44190, 37797, 40796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Oxford Companion to Beer, abbreviated OCB, is a book in the series of Oxford Companions published by Oxford University Press. The book provides an alphabetically arranged reference to beer, compiled and edited by Garrett Oliver with a foreword by U.S. chef Tom Colicchio. Published in 2011, the work draws on 166 contributors from 24 countries to amass over 1,100 entries on beer.\nQuestion: Oxford University Press is a beer publisher True, False, or Neither? False\n###\nVitamin C, also known as ascorbic acid and -ascorbic acid, is a vitamin found in food and used as a dietary supplement. As a supplement it is used to treat and prevent scurvy. Evidence does not support use in the general population for the prevention of the common cold. It may be taken by mouth or by injection.\nQuestion: Vitamin C occurs naturally in the body True, False, or Neither? Neither\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery.\nQuestion: The Anchor Bankside tavern features fresh fish caught daily and flown in. True, False, or Neither? Neither\n###\nCarolyn Keene is the pseudonym of the authors of the Nancy Drew mystery stories and The Dana Girls mystery stories, both produced by the Stratemeyer Syndicate. In addition, the Keene pen name is credited with the Nancy Drew spin-off, \"River Heights and the Nancy Drew Notebooks.\nQuestion: Carolyn Keene is a writer that appeals to young men. True, False, or Neither? 
Neither\n###\nThe 1974 Atlanta Braves season was the ninth season in Atlanta along with the 104th season as a franchise overall. The team finished third in the National League West with a record of 88\u201374, 14 games behind the Los Angeles Dodgers. During the season, Braves outfielder Hank Aaron became the all-time career leader in home runs, surpassing Babe Ruth.\nQuestion: The Atlanta Braves began playing in Atlanta in 1973. True, False, or Neither?", "doc_id": 157, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24023, 28243, 4158, 12152], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Clay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan.\nQuestion: As of the 2011 census, the population was 7,861. True, False, or Neither? Neither\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee.\nQuestion: Circus Palestine is a music video. True, False, or Neither? False\n###\nMaya & Marty was an American television variety show that premiered on NBC on May 31, 2016 and lasted one season. The series was co-hosted by comedians Maya Rudolph and Martin Short, and produced by Lorne Michaels. The show features various comedy sketches, musical performances, and celebrity guests.\nQuestion: Martin Scorsese was a guest on Maya & Marty. True, False, or Neither? Neither\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release.\nQuestion: The Sloppy Maneater's were a popular band True, False, or Neither? Neither\n###\nJohn Gilbert (born John Cecil Pringle; July 10, 1899 \u2013 January 9, 1936) was an American actor, screenwriter and director. He rose to fame during the silent film era and became a popular leading man known as \"The Great Lover\". At the height of his career, Gilbert rivaled Rudolph Valentino, another silent film era leading man, as a box office draw.\nQuestion: Actor John Gilbert rose to fame during the first world war. 
True, False, or Neither?", "doc_id": 111, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42114, 29871, 28679, 932], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Thomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330.\nQuestion: Thomas Cooper played football for 25 years. True, False, or Neither? False\n###\nThe Outsiders are a professional wrestling tag team consisting of Kevin Nash and Scott Hall, best known for their first appearances in World Championship Wrestling (WCW) in 1996. They later teamed also in the World Wrestling Federation (WWF), Total Nonstop Action Wrestling (TNA), and Pro Wrestling ZERO1-MAX.\nQuestion: They competed in the WCW last century True, False, or Neither? True\n###\nSeaWorld Ohio was a park in the SeaWorld chain of marine animal theme parks. The park opened in 1970 directly across the lake and less than one mile from Geauga Lake Park in Aurora, Ohio, United States. The small lake separated the two parks. Wildwater Kingdom, a small waterpark built by Cedar Fair in 2005, occupied the property until it closed in 2016.\nQuestion: SeaWorld Ohio was more than a mile from Geauga Lake Park in Aurora, Ohio, United States. True, False, or Neither? False\n###\nRefried beans (Spanish: \"frijoles refritos\") is a dish of cooked and mashed beans and is a traditional staple of Mexican and Tex-Mex cuisine, although each cuisine has a different approach when making the dish. Refried beans are also popular in many other Latin American countries.\nQuestion: Refried beans are made with black beans. True, False, or Neither? Neither\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent.\nQuestion: Doomsday Device was creating by wreslter's True, False, or Neither?", "doc_id": 312, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16874, 13201, 36550, 40561], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Clay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. 
Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan.\nQuestion: Clay County has a football team. True, False, or Neither? Neither\n###\nEastland Mall is an enclosed shopping mall in Columbus, Ohio. Opened in 1968, it no longer has any open anchor stores. Its four vacant anchors were originally occupied by Lazarus, Kaufmann's (later Macy's), Sears, and JC Penney. The mall is managed by Woodmont Management.\nQuestion: The Westland Mall is managed by Woodmont Management True, False, or Neither? False\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse.\nQuestion: Once Upon a Time premiered over 6 years ago True, False, or Neither? True\n###\nDarrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include:\nQuestion: Darrell Abbott played guitar in 1975. True, False, or Neither? Neither\n###\nJ\u00e1nos G\u00e1lv\u00f6lgyi (born 26 May 1948) is a Hungarian actor and comedian. First appearing in 1968's Ki Mit Tud? talent show, he gained national fame for making numerous comedy sketches in the Hungarian National Television, becoming one of the best known comedy actors in the country.\nQuestion: J\u00e1nos G\u00e1lv\u00f6lgyi first appeared on a talent show twenty years after the year he was born. True, False, or Neither?", "doc_id": 780, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19900, 39722, 187, 36309], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Carrier Sekani Tribal Council (familiarly known as CSTC) is a tribal council representing eight First Nations in the Central Interior of British Columbia. It was originally known as the \"Lakes District Tribal Council\". The CSTC was incorporated in 1979 and is a registered non-profit society.\nQuestion: CSTC issues zoning laws. True, False, or Neither? Neither\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage.\nQuestion: The transmitter that broadcasts KDMD is located Eagle River, about 40 miles from Anchorage. True, False, or Neither? Neither\n###\nRohan Bopanna and Daniel Nestor were the defending champions, but chose not to compete together. 
Bopanna played alongside Florin Mergea, but lost in the first round to Nestor and Radek \u0160tep\u00e1nek. Nestor and \u0160tep\u00e1nek lost in the quarterfinals to \u0141ukasz Kubot and Marcin Matkowski.
\nQuestion: Nestor took the loos very poorly True, False, or Neither? Neither\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers.\nQuestion: Seven Ways from Sundown takes place in Texas. True, False, or Neither? Neither\n###\nWJMF-LP is a low-power television station in Jackson, Mississippi. The station, which currently operates on Channel 6, is owned by Rainey Radio. The station currently acts as a radio station broadcasting a Oldies & Classic Hits format as \"EZ 87.7\", taking advantage of that station's audio signal on 87.75 MHz FM.\nQuestion: WJMF-LP is in the Northern Hemisphere of the globe True, False, or Neither?", "doc_id": 803, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12898, 43754, 765, 7804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Terry Butler is an American bassist who currently performs with the death metal bands Obituary and Massacre. He was also a member of Six Feet Under and Death. He was credited on the Death album \"Spiritual Healing\", and band leader Chuck Schuldiner stated that on the latter Death album \"Terry contributed to the songwriting as well\".\nQuestion: Terry Butler loves dogs True, False, or Neither? Neither\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Chris McKendry is a member of Gen X. True, False, or Neither? True\n###\nEarly flying machines include all forms of aircraft studied or constructed before the development of the modern aeroplane by 1910. The story of modern flight begins more than a century before the first successful manned aeroplane, and the earliest aircraft thousands of years before.\nQuestion: The modern aeroplane is the earliest form of aircraft. True, False, or Neither? False\n###\nHudson Valley Community College, a SUNY associated two-year college, is located in Troy in Rensselaer County, New York. Although about eighty percent of the students are from the local area, the remainder are from other parts of New York, other states and from some 30 countries around the world.\nQuestion: About eighty percent of students at Hudson Valley Community College live within walking distance from the campus True, False, or Neither? Neither\n###\nAmanda Knox is a 2016 American documentary film about Amanda Knox, twice convicted and later acquitted of the 2007 murder of Meredith Kercher, directed by Rod Blackhurst and Brian McGinn. 
It premiered at the Toronto International Film Festival on September 10, 2016 and on Netflix on September 30, 2016.\nQuestion: Amanda got away with murder. True, False, or Neither?", "doc_id": 247, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3052, 32346, 7746, 6200], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Many science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list.\nQuestion: Comedy fiction works gives us many predictions about what will happen in the 21st century. True, False, or Neither? False\n###\nUtamaro and His Five Women or Five Women Around Utamaro (Japanese: \u6b4c\u9ebf\u3092\u3081\u3050\u308b\u4e94\u4eba\u306e\u5973 , Hepburn: Utamaro o meguru gonin no onna ) is a 1946 Japanese film directed by Kenji Mizoguchi. It is based on the novel of the same title by Kanji Kunieda, itself a fictionalized account of the life of printmaker Kitagawa Utamaro. It was Mizoguchi's first film made under the American occupation.\nQuestion: Kanji Kunieda novel was released in 1940 True, False, or Neither? Neither\n###\nHi! Pristin (stylized as HI! PRISTIN) is the debut mini-album by South Korean girl group Pristin. It was released on March 21, 2017, by Pledis Entertainment, and distributed by LOEN Entertainment. The EP consists of six songs, including the singles \"Wee Woo\" and \"Black Widow\". In order to promote the album, the group performed on several Korean music shows.\nQuestion: Pristin released their first album during the second decade of the 21st century True, False, or Neither? True\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999.\nQuestion: \"The Land of Fear\" was well received by critics. True, False, or Neither? True\n###\nMaros Water Park is one of water park existing in Maros district, South Sulawesi and was built and ready to soft launch on October 2009. It is planned to be opened in January 2013. It contains outbound area, cottages, restaurant, mini water park, semi olympic pool and body slide. It is surrounded with natural hills, fresh water on site, and with a couple of caves.\nQuestion: the park never opened True, False, or Neither?", "doc_id": 781, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9639, 19669, 37166, 17413], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bad Family () is a South Korean television series starring Kim Myung-min, Nam Sang-mi, Im Hyun-sik, Yeo Woon-kay, Kang Nam-gil, Geum Bo-ra, Kim Heechul and Lee Young-yoo. It aired on SBS from March 22 to May 11, 2006 on Wednesdays and Thursdays at 21:55 for 16 episodes.\nQuestion: Kim Myung-min was in 2 of the 16 episodes. True, False, or Neither? Neither\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: Greatest Hits Volume 1 was not released in 1969 True, False, or Neither? True\n###\nThe 2015\u201316 Dartmouth Big Green men's basketball team represented Dartmouth College during the 2015\u201316 NCAA Division I men's basketball season. The Big Green, led by sixth-year head coach Paul Cormier, played their home games at Leede Arena in Hanover, New Hampshire and were members of the Ivy League. The Big Green finished the season 10\u201318, 4\u201310 in Ivy League play to finish in sixth place.\nQuestion: Ivy League basketball teams are not as good as other basketball teams at the college level. True, False, or Neither? Neither\n###\nThis is a list of United States Air Force test squadrons. It covers units considered to be part of the Air Force and serves as a break out of the comprehensive List of United States Air Force squadrons. Most units in this list are assigned to Air Force Materiel Command, however, a few reside in other Major Commands of the United States Air Force.\nQuestion: The list is publicly available. True, False, or Neither? Neither\n###\nNigel Edward Povah (born 17 July 1952 in Wandworth, London) is a British chess player. He is an International Master at over-the-board chess and a grandmaster at correspondence chess. Povah is the author of \"Chess Training\". He is reckoned to be the UK's strongest correspondence chess player since Jonathan Penrose. Povah has one son, Jonathan Povah.\nQuestion: Povah is seen as a better player than Penrose. True, False, or Neither?", "doc_id": 378, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9392, 42729, 34305, 7075], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Girdler sulfide (GS) process, also known as the GeibSpevack (GS) process, is an industrial production method for filtering out of natural water the heavy water (deuterium oxide = DO) which is used in particle research, in Deuterium NMR spectroscopy, deuterated solvents for proton NMR spectroscopy, in heavy water nuclear reactors (as a coolant and moderator) and in deuterated drugs.\nQuestion: The Girdler sulfide process has made the filtering of heavy water profitable. True, False, or Neither? Neither\n###\nThe ECW World Tag Team Championship was a professional wrestling world tag team championship in Extreme Championship Wrestling (ECW). It was introduced in National Wrestling Alliance (NWA) affiliate and ECW precursor, Eastern Championship Wrestling in 1992, but was established under ECW in 1994.\nQuestion: The ECW World Tag Team Championship was a special innovation from the ECW True, False, or Neither? True\n###\nPassion Play is a 2010 American drama film written and directed by Mitch Glazer, executive produced by Rebecca Wang and starring Mickey Rourke, Megan Fox, Rhys Ifans and Bill Murray. Filming for the production began in December 2009 and is presented by Rebecca Wang Entertainment. It premiered at the 2010 Toronto International Film Festival.\nQuestion: Rhys Ifans movie premiered at the Film Festival. True, False, or Neither? True\n###\nJack Tate is a Republican legislator in the U.S. State of Colorado. He represents Senate District 27 in the Denver Metro Area, which encompasses parts of unincorporated Arapahoe County, the City of Centennial, and the town of Foxfield. He serves on the Senate Local Government, the Senate Business, Labor & Technology, and Joint Technology committees.\nQuestion: Jack Tate is a Republican legilator is the State of Kentucky. True, False, or Neither? False\n###\n\"It's Not Right but It's Okay\" is a song by American singer Whitney Houston, from her fourth studio album, \"My Love Is Your Love\". It was written by LaShawn Daniels, Rodney Jerkins, Fred Jerkins III, Isaac Phillips, Toni Estes, and produced by Darkchild. The song examines a woman confronting her lover about his infidelity.\nQuestion: Rodney Jerkins confronted his lover about their infidelity. True, False, or Neither?", "doc_id": 622, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15471, 44786, 4313, 25556], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ryman Auditorium (formerly Grand Ole Opry House and Union Gospel Tabernacle) is a 2,362-seat live performance venue, located at 116 5th Avenue North, in Nashville, Tennessee and is best known as the home of the \"Grand Ole Opry\" from 1943 to 1974. It is owned and operated by Ryman Hospitality Properties, Inc.\nQuestion: Ryman Auditorium is expected to exist in the year 2362. 
True, False, or Neither? Neither\n###\nJ\u00fcrgen Melzer (born 22 May 1981 in Vienna) is an Austrian tennis player. He reached a career-high singles ranking of world No. 8 in April 2011, and a doubles ranking of world No. 6 in September 2010. He is a left-handed tennis player, but is right-handed in everyday life. He has a younger brother, Gerald Melzer, with whom he has played doubles in several tournaments.\nQuestion: J\u00fcrgen Melzer starts with an A. True, False, or Neither? False\n###\nNew Hampshire Route 202A (abbreviated NH\u00a0202A) is a 14.639 mi east\u2013west state highway in Strafford and Rockingham counties in southeastern New Hampshire. The western terminus is in Northwood at U.S. Route\u00a0202 and New Hampshire\u00a09, near their intersection with U.S. Route\u00a04. Its eastern terminus is in downtown Rochester at New Hampshire Route\u00a0108 and New Hampshire Route\u00a0125.\nQuestion: NH 202A runs north south True, False, or Neither? False\n###\nThe Kid from Left Field is a 1953 baseball film starring Dan Dailey, Anne Bancroft, Lloyd Bridges, and Billy Chapin. The film marked the reunion of Dailey and director Harmon Jones who had teamed up at 20th Century Fox a year earlier in another baseball film, the biographical \"The Pride of St. Louis\".\nQuestion: Dan Daily was paid more than his coworkers in the movie True, False, or Neither? Neither\n###\nPhakisa Freeway is a motor racing circuit located in Odendaalsrus, South Africa. From 1999 to 2004, the venue hosted the South African motorcycle Grand Prix of the MotoGP championship. It has a capacity of 60,000 spectators and opened in 1999. The track has a 4.24\u00a0km road course and a 1.5 mi oval course. The oval track is an exact copy of Las Vegas Motor Speedway from 1997.\nQuestion: Phakisa Freeway opened before 1998. True, False, or Neither?", "doc_id": 15, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15250, 3413, 10160, 6276], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier.\nQuestion: Forest Friends has aired on multiple networks. True, False, or Neither? True\n###\nHomicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run.\nQuestion: While he worked on both, it is unknown if it was exclusive. True, False, or Neither? Neither\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. 
Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC.\nQuestion: SNOBOL5 was the final in a series of computer programming languages developed between 1962 and 1967. True, False, or Neither? False\n###\nYou Can Be Anyone This Time Around is an album by Timothy Leary released in 1970. The disc features three \"raps\" by Leary backed with psychedelic music. The purpose of the album was to raise funds for Leary's political candidacy for Governor of California.\nQuestion: You Can Be Anyone This Time Around was released more than 17 years ago. True, False, or Neither? True\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks.\nQuestion: \"Whatever the Case May Be\" first aired in a year that had the number 4 in it. True, False, or Neither?", "doc_id": 724, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3219, 35209, 6004, 17320], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Nigeria U-20 men's national soccer team, also known as the Nigeria Under-20s or nicknamed the \"Flying Eagles\", is the youth team for national soccer in Nigeria. It plays a large role in the development of Nigerian soccer, and is considered to be the feeder team for the Nigeria men's national soccer team and is controlled by the Nigeria Football Federation.\nQuestion: The Nigeria Under-20s have had superstar caliber players. True, False, or Neither? Neither\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter.\nQuestion: Colin Francis Weeber Isaacs represented the riding of Wentworth for two years True, False, or Neither? True\n###\nDeliver Us Tour was a concert tour by band Darkest Hour, taking place from late 2007, in support of their fifth studio album \"Deliver Us\" and finishing in December 2008. The tour started shortly after the Undoing Ruin Tour ended, earlier in December 2006.\nQuestion: The Undoing Ruin Tour was the previous tour to \"Deliver Us Tour\" True, False, or Neither? True\n###\nCarlyle Eubank is an American writer and screenwriter. His 2014 film \"The Signal\", starring Laurence Fishburne, Brenton Thwaites, and Olivia Cooke, premiered at the 2014 Sundance Film Festival and was released in US theaters on June 13 by Focus Features.\nQuestion: \"The Signal\" was shown at Sundance in advance of its theatrical debut. True, False, or Neither? True\n###\nOasis was a literary anthology published in Cairo during World War II. It was edited by Denis Saunders, David Burk, and Victor Selwyn. 
The introduction was written by General Henry Maitland Wilson, who was at this time Commander-in-Chief of the Middle East.\nQuestion: Oasis had two people whose names started with the letter D who had worked on editing it. True, False, or Neither?", "doc_id": 300, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1895, 41436, 33948, 25619], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Young Girl on a Chair is a 1955 bronze sculpture by Giacomo Manz\u00f9, installed at the Hirshhorn Museum and Sculpture Garden in Washington, D.C. The work measures 45 x 23\u00a03/8 x 43\u00a03/4 inches and depicts a nude young girl with her arms rested in her lap.\nQuestion: Giacomo Manzu was the first to create a scultpute depicting a naked young girl True, False, or Neither? Neither\n###\nJohn M. W. Moorlach (born December 21, 1955 in the Netherlands) is a Republican California State Senator representing 37th Senate district, which includes portions of Orange County, since March 22, 2015. He previously served on the Orange County Board of Supervisors from December 5, 2006 \u2013 January 5, 2015 and as Orange County Treasurer-Tax Collector from March 17, 1995 \u2013 December 5, 2006.\nQuestion: The senate gained a republican seat when John M. W. Moorlach became senator. True, False, or Neither? Neither\n###\nThe Combat Box was a tactical formation used by heavy (strategic) bombers of the U.S. Army Air Forces during World War II. The combat box was also referred to as a \"staggered formation\". Its defensive purpose was in massing the firepower of the bombers' guns, while offensively it concentrated the release of bombs on a target.\nQuestion: Its offensive purpose was in massing the firepower of the bombers' guns, while defensively it concentrated the release of bombs on a target. True, False, or Neither? False\n###\nAirline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya.\nQuestion: Irene Koki Mutungi will receive a Nobel Prize in 2020 True, False, or Neither? Neither\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life.\nQuestion: The Health For All program goal of the WHO was started in the 1970s. True, False, or Neither?", "doc_id": 445, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36010, 21457, 512, 5365], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Regent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day.\nQuestion: There are plans underway to produce oil by 2028. True, False, or Neither? Neither\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000.\nQuestion: The 2012 Sun Life Financial Players' Championship was seen by Trump. True, False, or Neither? Neither\n###\nThe Secret Garden is the 1987 Hallmark Hall of Fame TV film adaptation of the novel \"The Secret Garden\", aired on CBS November 30, 1987 and produced by Rosemont Productions Limited, who also produced \"Back to the Secret Garden\". The film stars Barret Oliver, Jadrien Steele, Billie Whitelaw and Sir Derek Jacobi.\nQuestion: the secret garden is a novel True, False, or Neither? True\n###\n\"Thank You\" is the third single by heavy metal band Hellyeah from their debut album \"Hellyeah\". The song is a tribute to all of the band's recently departed family members: Vinnie Paul's brother Dimebag Darrell, Tom Maxwell's mother, and Chad Gray's grandmother. The song reached #37 on the \"Billboard\" Hot Mainstream Rock Tracks chart.\nQuestion: The single Thank You reached into the top 50 on the Billboard Chart True, False, or Neither? True\n###\nSt. Petersburg is a city in Pinellas County, Florida, United States. As of the 2015 census estimate, the population was 257,083, making it the fifth-most populous city in Florida and the largest in the state that is not a county seat (the city of Clearwater is the seat of Pinellas County).\nQuestion: St. Petersburg is not a city in the Central United States. True, False, or Neither?", "doc_id": 328, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43405, 41522, 33098, 34165], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Science in History is a four-volume book by scientist and historian John Desmond Bernal, published in 1954. It was the first comprehensive attempt to analyse the reciprocal relations of science and society throughout history. It was originally published in London by Watts. There were three editions up to 1969 an. It was republished by MIT Press in 1971 and is still in print.\nQuestion: Science in History has three words. True, False, or Neither? 
True\n###\nAlex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\".\nQuestion: Alex Rider was not featuring short stories written by Horowitz. True, False, or Neither? False\n###\nMike Cvik (born July 6, 1962) is a Canadian former National Hockey League linesman, who wore uniform number #88. At 6 foot, 9 Inches, Cvik is as tall as the NHL's tallest player, Zdeno Ch\u00e1ra. He has worked more than 1800 NHL games, including his highlights such as the gold medal game at the 2002 Winter Olympics, the NHL All-Star Game and the Stanley Cup Playoffs.\nQuestion: While Zdeno Chara is the tallest NHL player, Mike Cvik is not one of the taller players. True, False, or Neither? False\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks.\nQuestion: Jack Bender did not write the 12th episode of lost, that was Damon Lindelof, Bender only directed. True, False, or Neither? True\n###\nQueen Mother Dorji Wangmo (Dzongkha: \u0f62\u0fa1\u0f7c\u0f0b\u0f62\u0f97\u0f7a\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f58\u0f7c\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f55\u0fb1\u0f74\u0f42\u0f0b; Wylie: \"Rdo-rje Dbang-mo Dbang-phyug\") (December 29, 1955, second daughter of \"Yab\" Ugyen Dorji and \"Yum\" Thuji Zam) is a former queen of Bhutan and first wife of former King Jigme Singye Wangchuck, who is married to four sisters all of whom were entitled to be called queen.\nQuestion: Queen Mother Dorji Wangmo was born more than 3334 days ago. True, False, or Neither?", "doc_id": 24, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22666, 34102, 878, 16069], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\".\nQuestion: Alex Rider showed three short stories written by Horowitz. True, False, or Neither? True\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Tinker Field honored a player True, False, or Neither? 
True\n###\nWinnie the Pooh and a Day for Eeyore is a 1983 Disney Winnie the Pooh animated featurette, based on two chapters from the books \"Winnie-the-Pooh\" and \"The House at Pooh Corner\", originally released theatrically on March 25, 1983, with the 1983 re-issue of \"The Sword in the Stone\". It is the fourth and final of Disney's original theatrical featurettes adapted from the Pooh books by A. A. Milne.\nQuestion: The Sword and the Stone was more popular. True, False, or Neither? Neither\n###\nInternational Cycling Classic, also known as the Point Premium Root Beer or simply SuperWeek, was a 17-race series over 17 days open to licensed amateur and professional cyclists. The series took place primarily in the area surrounding Milwaukee, Wisconsin.\nQuestion: There were more than 17 races in the International Cycling Classic True, False, or Neither? False\n###\nThe Tragedy of Julius Caesar is a tragedy by William Shakespeare, believed to have been written in 1599. It is one of several plays written by Shakespeare based on true events from Roman history, which also include \"Coriolanus\" and \"Antony and Cleopatra\".\nQuestion: Julius Cesar was written was written after the birth of Christ True, False, or Neither?", "doc_id": 474, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41410, 13625, 11337, 11466], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\".\nQuestion: James Wyatt designed the roleplaying game \"Dungeons & Dragons\". True, False, or Neither? True\n###\nJonathan Erlich and Andy Ram were the defending champions, but Erlich chose not to participate due to an elbow injury, and only Ram competed that year.Ram partnered with Max Mirnyi, but lost to Feliciano L\u00f3pez and Fernando Verdasco in the second round.\nQuestion: Max Mirnyi and Andy Ram have partnered before this. True, False, or Neither? Neither\n###\nThe European Association of Science Editors (EASE ) is a non-profit membership organisation for people interested in science communication and editing. Founded in 1982, in France, EASE now has an international membership from diverse backgrounds, professional experiences, and job titles.\nQuestion: EASE is one of 3 major organizations founded in 1982 True, False, or Neither? Neither\n###\nPhacelia coerulea is a species of phacelia known by the common name skyblue phacelia. It is native to the California and the Southwestern United States and northern Mexico, where it grows in desert and plateau habitat types, such as scrub and woodland.\nQuestion: Phacelia coerulea can only be found in Los Angeles True, False, or Neither? 
Neither\n###\nDaniel James Shellabarger (known as Daniel Suelo, or simply Suelo, and The Man Who Quit Money, born 1961) is an American simple living adherent who stopped using money in the autumn of 2000. He was born in Arvada, Colorado, a suburb of Denver, and currently lives part-time in a cave near Moab, Utah when he is not wandering the country.\nQuestion: Suelo was born in a cave. True, False, or Neither?", "doc_id": 451, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25453, 4448, 6478, 25359], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Moody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group.\nQuestion: Moody 4B is not an instrumental album. True, False, or Neither? False\n###\nThe Peoria Rivermen was a professional ice hockey team in the American Hockey League. They played in Peoria, Illinois, USA at the Carver Arena. On June 14, 2013, it was announced that the team would relocate to Utica, New York after the 2012\u201313 AHL season, and be known as the Utica Comets.\nQuestion: The Peoria Rivermen had a total of 23 hockey players on it. True, False, or Neither? Neither\n###\nJulian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality.\nQuestion: Julian Marley is the son of Bob Marley. True, False, or Neither? True\n###\nThe San Pablo Reservoir is an open cut terminal water storage reservoir owned and operated by the East Bay Municipal Utility District (EBMUD). It is located in the valley of San Pablo Creek, north of Orinda, California and south of El Sobrante and Richmond, east of the Berkeley Hills between San Pablo Ridge and Sobrante Ridge.\nQuestion: The reservoir is for oil storage. True, False, or Neither? False\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: Malone is less than 53 years old True, False, or Neither?", "doc_id": 279, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [367, 34486, 22097, 10160], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1902\u201303 Ottawa Hockey Club season was the club's 18th season of play. The club would win the CAHL championship in a playoff with the Montreal Victorias to win the Club's first Stanley Cup. For their win, the players would each be given a silver nugget. From that day forward, the club was nicknamed the \"Silver Seven.\"\nQuestion: the players were given a silver ngget cause that was the award that year True, False, or Neither? Neither\n###\nThe 711 Squadron \"\"Albatrozes\"\" (\"Esquadra 711\") was a flying squadron of the Portuguese Air Force. Its primary mission was Search and Rescue and it has had secondary missions tactical air transport and general air transport in the Azores archipelago. During the time it was active it was the only operational squadron in the Portuguese military to operate both rotary- and fixed-wing aircraft.\nQuestion: The 711 Squadron ran air based mission for the Portuguese Air Force. True, False, or Neither? True\n###\nA conjectural portrait is a portrait made of a historical figure for whom no authentic contemporary portrait is available. The depiction, then, may be variously informed by written accounts of physical appearance, conjecture based on the subject's culture and background, and/or the artist's conception of the subject's inner essence.\nQuestion: A conjectural portrait requires lots of skills to make True, False, or Neither? Neither\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC.\nQuestion: SNOBOL5 was the final in a series of computer programming languages developed between 1962 and 1967. True, False, or Neither? False\n###\nRJ Rockers Brewing Company is a beer brewing company based in Spartanburg, South Carolina, founded in 1997 by current owner/brewer, Mark R. Johnsen. The company is considered a microbrewery meaning it has an annual production of less than 15,000 barrels.\nQuestion: RJ Rockers Brewing Company produces more than 10,000 barrels of beer True, False, or Neither?", "doc_id": 216, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41305, 37155, 41531, 8082], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Debra Hayward is a British film producer. As Head of Film at Working Title Films, Hayward frequently served as an executive producer for the company's feature films, working alongside fellow Working Title executive Liza Chasin. 
After producing \"Les Mis\u00e9rables\", she started her own production company; Monumental Pictures.\nQuestion: Debra Hayward speaks four languages. True, False, or Neither? Neither\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\nQuestion: Norwegian black metal bands are not popular these days. True, False, or Neither? Neither\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system.\nQuestion: Each program in ECSU has 2,500 seats. True, False, or Neither? False\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: The Home Depot has changed ownership many times in the past True, False, or Neither? Neither\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships.\nQuestion: He represented Great Britain at the 1900 + 50 + 2 Summer Olympics True, False, or Neither?", "doc_id": 322, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7138, 18275, 5461, 37584], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "New Hampshire Route 27 (abbreviated NH 27) is a 37.621 mi long east\u2013west highway in southeastern New Hampshire. The western terminus of NH 27 is in Hooksett at U.S. Route 3 and New Hampshire Route 28 north of Manchester. The eastern terminus is in Hampton Beach at New Hampshire Route 1A, which runs along the New Hampshire coastline adjacent to the Atlantic Ocean.\nQuestion: New Hampshire Route 27 is a busy highway. True, False, or Neither? Neither\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani.\nQuestion: Dariush examined the bourgeoisie of Iran using a film, but Homayoun lead the film. True, False, or Neither? 
True\n###\nWar/Dance is a 2007 American documentary film written and directed by Sean Fine and Andrea Nix Fine and produced by Shine Global's Susan MacLaury, a professor at Kean University, and Albie Hecht. It was nominated for the 2008 Academy Award for Best Documentary Feature and received the Emmy Award for Best Documentary and Best Cinematography in 2010.\nQuestion: War/Dance is a documentary that won a Tony Award in 2007. True, False, or Neither? False\n###\n\"Aster\" (M915) is a Tripartite-class minehunter of the Belgian Naval Component, launched on 16 December 1985 at the Mercantile-Belyard shipyard in Rupelmonde and christened by Queen Paola of Belgium. The patronage of \"Aster\" was accepted by the city of Blankenberge. \"Aster\" was the first of the Belgian Tripartite-class minehunters.\nQuestion: \"Aster\" (M915) is a helicopter True, False, or Neither? False\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th).\nQuestion: Alice Sue Claeys never finished in the top 3. True, False, or Neither?", "doc_id": 362, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35686, 15515, 39943, 35072], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Art of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley.\nQuestion: Art of Dying does not have a front man True, False, or Neither? False\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock.\nQuestion: The Friant-Kern Canal is a 152 mi Central Valley Project aqueduct delivers water south to Bakersfield. True, False, or Neither? Neither\n###\nSaat Din Mohabbat In (English: \"Seven days in love\" ) is an upcoming Pakistani romantic drama film directed by Meenu-Farjad, produced by Dawn Films and IMGC Global Entertainment and written by Fasih Bari Khan. The film features Mahira Khan and Sheheryar Munawar in lead roles and is also their second mutual film after \"Ho Mann Jahaan\".\nQuestion: Saat Din Mohabbat will screen in Pakistani theaters True, False, or Neither? Neither\n###\nThe Sea Wall (French: Un barrage contre le Pacifique ) is a 2008 film by Cambodian director Rithy Panh in a French/Cambodian/Belgian co-production. The film opened on 7 January 2009 in France. 
It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. The novel had previously been adapted as \"This Angry Age\" by Ren\u00e9 Cl\u00e9ment in 1958.\nQuestion: It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras but it wasn't a very good movie. True, False, or Neither? Neither\n###\nAfter Dark is a brand of Indian whisky, manufactured by Radico Khaitan. The whisky was test marketed in 2010, and rolled out nationwide in India by September 2011. It is a 100% grain-based whisky manufactured at Radico's Rampur distillery. It is available in 750ml, 375ml and 180ml bottles. The brand's tagline is \"One Life, Many Passions...Why wait\".\nQuestion: After Dark will make you tipsy. True, False, or Neither?", "doc_id": 151, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22310, 39002, 17473, 34046], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House.\nQuestion: Pata Nahi Rabb Kehdeyan Rangan Ch Raazi was translated in 5 different languages True, False, or Neither? Neither\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: They became a well known band in England. True, False, or Neither? Neither\n###\nSpittal is a hamlet or small village in East Lothian, Scotland, UK, on the B1377, east of Longniddry, south-south-west of Aberlady and to the west of Garleton and north of Gladsmuir. It is close to both Redhouse Castle, Gosford House and Spittal House.\nQuestion: Spittal is a very large village in Scotland. True, False, or Neither? False\n###\nDavid Gregory \"Dave\" Smith (born 24 July 1955) is a retired male race walker from Australia, who represented his native country at two consecutive Summer Olympics, starting in 1980 (Moscow). His best Olympic result was finishing in tenth place in the men's 20\u00a0km race at the 1984 Summer Olympics.\nQuestion: A racer born in 1975 finished tenth in the 1984 olympic 20 km men's race. True, False, or Neither? False\n###\nAhmad Jovdat Ismayil oglu Hajiyev (June 18, 1917 - January 18, 2002) was one of the major Azerbaijani composers of the Soviet period. He is remembered for his monumental orchestral works, having been the first Azerbaijani to compose a symphony (1936). 
He studied under Azerbaijan's Founder of Composed Music, Uzeyir Hajibeyov and under Russian composer Dmitri Shostakovich.\nQuestion: Uzeyir Hajibeyov was the first composer to compose a musical in Russia. True, False, or Neither?", "doc_id": 983, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15250, 23821, 32564, 13231], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier.\nQuestion: Forest Friends has aired on multiple networks. True, False, or Neither? True\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s.\nQuestion: The private, commercial structure was placed on the National Register of Historic Places more than 1980 days ago. True, False, or Neither? True\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name.\nQuestion: Benedict is referenced as Jehovah blessed the Egyptian\u2019s house for Joseph\u2019s sake True, False, or Neither? Neither\n###\nKim Won-sik (Hangul:\u00a0\uae40\uc6d0\uc2dd , born February 15, 1993) better known by his stage name Ravi (Hangul:\u00a0\ub77c\ube44 ), is a South Korean rapper, singer-songwriter, producer, signed under Jellyfish Entertainment. He is a member of the South Korean boy group VIXX and VIXX sub-unit VIXX LR. He debuted as a solo artist on January 9, 2017, with the release of his debut mini album \"R.EAL1ZE\".\nQuestion: Kim Won-sik is a 50s baby True, False, or Neither? False\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer.\nQuestion: In 2002, Capello produced music in the bad Los Fabulosos. True, False, or Neither?", "doc_id": 332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6012, 19479, 11180, 5380], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Painted Thin was a Canadian hardcore punk band, formed in Winnipeg, and active from 1993 to 1999. The core of the band consisted of vocalist and guitarist Stephen Carroll and bassist and vocalist Paul Furgale, with a variety of guest musicians, including James Ash, Dan McCafferty and Jason Tait, on individual recordings.\nQuestion: Painted Thin played internationally True, False, or Neither? Neither\n###\nA sodium bicarbonate rocket (sometimes called an Alka-Seltzer rocket) is a model rocket fashioned from a 35mm film canister and propelled by the pressure of a gas, often carbon dioxide, generated from the reaction of an acid with sodium bicarbonate. Sodium bicarbonate rockets are often used in science classes to demonstrate principles of chemistry and physics.\nQuestion: Sodium bicarbonate are also as model to show to students some principle. True, False, or Neither? True\n###\nHyde, Jekyll, Me () is a 2015 South Korean television series starring Hyun Bin and Han Ji-min. It is based on Lee Choong-ho's webtoon \"Dr. Jekyll Is Mr. Hyde\" (), which gave a romantic comedy spin on the literary character. The series aired on SBS from January 21 to March 26, 2015 on Wednesdays and Thursdays at 21:55 for 20 episodes.\nQuestion: People in South Korea watch more television during the winter months. True, False, or Neither? Neither\n###\nLiving on the Edge (full title Living on the Edge, the Poetic Works of G\u00e9rald Leblanc also known by its French language title L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc) is a 2005 documentary film by Canadian director of Acadian origin Rodrigue Jean. In this documentary, Rodrigue Jean pays tribute to his Acadian roots, focussing on the poetry of G\u00e9rald Leblanc.\nQuestion: L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc is the french title of Living On The Edge. True, False, or Neither? True\n###\nStannis Baratheon is a fictional character in the \"A Song of Ice and Fire\" series of epic fantasy novels by American author George R. R. Martin, and its television adaptation \"Game of Thrones\". He is the second son of Steffon Baratheon, the lord of Storm's End, and his wife Lady Cassana Estermont, and brother to Robert and Renly.\nQuestion: Stannis Baratheon is the son of Cassana Estermont True, False, or Neither?", "doc_id": 704, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27478, 35670, 39555, 19967], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gilford is a town in Belknap County, New Hampshire, United States. The population was 7,126 at the 2010 census. 
Situated on Lake Winnipesaukee, Gilford is home to Governors Island, Ellacoya State Beach, Belknap Mountain State Forest, Gunstock Mountain Ski Resort, and Bank of New Hampshire Pavilion at Meadowbrook, a seasonal outdoor concert venue.\nQuestion: Gilford is a town in Belknap County, New Hampshire, United States had four people less than 7130 living there according to the the census taken at the end of the first decade of the twenty-first century. True, False, or Neither? True\n###\nAnime Speed is a megamix compilation album of \"Dancemania\"'s \"Speed\" series, released by EMI Music Japan in 2005. The album features uptempo cover remixes of popular theme songs for various anime works such as \"Dragon Ball Z\", \"Slam Dunk\" and \"Neon Genesis Evangelion\". The successor, \"Anime Speed Newtype Edition\", was released in 2006.\nQuestion: There have been several more releases by EMI Music Japan of Dancemania's anime remix albums since 2006. True, False, or Neither? Neither\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Edmund Golding lived in New York City. True, False, or Neither? Neither\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013.\nQuestion: Brandon Tyler McManus is over 20 years old True, False, or Neither? True\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall.\nQuestion: Tuancheng Fortress is in the Northern Hemisphere. True, False, or Neither?", "doc_id": 330, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17630, 40941, 44063, 22639], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Manos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Greek modeling agency since 2004. 
He participated in several international film projects as a lead main actor.\nQuestion: Manos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Americanmodeling agency since 2004. True, False, or Neither? False\n###\nDonald Joseph Stanhouse (born February 12, 1951 in Du Quoin, Illinois) is a retired baseball pitcher who had a ten-year major league career from 1972 to 1980, 1982. He played for the Texas Rangers and Baltimore Orioles of the American League and the Montreal Expos and Los Angeles Dodgers of the National League.\nQuestion: Donald Joseph Stanhouse has never worn cleats True, False, or Neither? Neither\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Tinker Field is being rebuilt True, False, or Neither? Neither\n###\nRoland Buerk (born 1973), was a journalist working for the BBC. He was the Tokyo Correspondent for BBC News and is best known for his coverage of the 2011 T\u014dhoku earthquake and tsunami. He is the son of former BBC newsreader and current BBC Radio 4 presenter Michael Buerk. He left the BBC in mid-2012, to work for Nissan in the United Arab Emirates.\nQuestion: Roland Buerk was a journalist working and typing for the BBC. True, False, or Neither? Neither\n###\nPeter Joseph Wysocki (October 3, 1948 \u2013 June 14, 2003) was an American football linebacker who played his entire six-year career with the Washington Redskins from 1975 to 1980 in the National Football League (NFL). Wysocki previously played four seasons in the Canadian Football League (CFL) for the Hamilton Tiger-Cats, Toronto Argonauts and Saskatchewan Roughriders.\nQuestion: Peter Joseph Wysocki played for more than 3 teams True, False, or Neither?", "doc_id": 990, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7612, 34409, 13574, 20173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999.\nQuestion: Daoud Abdel Sayed is an American director True, False, or Neither? False\n###\nMelinda Heather \"Mindy\" Cohn (born May 20, 1966) is an American actress, voice actress, comedian and singer. 
She is known for her role as Natalie Green, the student of Edna Garrett (played by Charlotte Rae) in the long-running sitcom \"The Facts of Life\", and for being the voice of Velma Dinkley in the \"Scooby-Doo\" franchise from 2002 to 2015.\nQuestion: Melinda heather is an actress who was born in America before May 20,1966. True, False, or Neither? False\n###\nThe 1934 Australian Grand Prix was a motor race held at the Phillip Island circuit in Victoria, Australia on 19 March 1934. The 200 mile race, which was organised by the Light Car Club of Australia, was the seventh Australian Grand Prix. Contested as a handicap race, it was won by Bob Lea-Wright, driving a Singer 9 Le Mans.\nQuestion: The 1934 Australian Grand Prix was less than 50 miles long. True, False, or Neither? False\n###\nA Merry Friggin' Christmas is a 2014 American black comedy film directed by Tristram Shapeero and written by Phil Johnston. The film stars an ensemble cast featuring Joel McHale, Lauren Graham, Clark Duke, Oliver Platt, Wendi McLendon-Covey, Tim Heidecker, Candice Bergen and Robin Williams. The film was released on November 7, 2014, by Phase 4 Films.\nQuestion: A Merry Friggin Christmas has an all star cast. True, False, or Neither? Neither\n###\nThe Angel on the Roof: The Stories of Russell Banks (2000) is a collection of short stories by Russell Banks. It consists of a total of thirty-one previously published stories, including twenty-two stories that appeared in earlier short story collections, along with nine that were previously uncollected.\nQuestion: The Angel on the Roof consists of 30 published stories True, False, or Neither?", "doc_id": 60, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31694, 22957, 17503, 27154], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall is very expensive to maintain True, False, or Neither? Neither\n###\nI Am That Change is a 2014, Telugu short film directed by Sukumar and produced by actor Allu Arjun on Geetha Arts. Apart from Allu Arjun, the short film features an ensemble cast of Tanisshq Reddy, Vikram Chaitanya, Surya Ashrith, Trisha, Sri Varshini, Bharath Reddy and Sathish. Sai Karthik is the music director and Amol Rathod is the cinematographer while Prawin Pudi is the editor.\nQuestion: I Am That Change has been seen by Zack. True, False, or Neither? Neither\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues.\nQuestion: Opinions on politics are dangerous. True, False, or Neither? 
Neither\n###\nLiberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD).\nQuestion: The president of ACL was Jose Maria de Areilza. True, False, or Neither? True\n###\nThe Enlistment Act 1970 is a statute of the Parliament of Singapore that caters for the enlistment of persons in the Singapore Armed Forces. The law repeals the Singapore Army Act and People\u2019s Defence Force Act of 1965 and is designed specifically to subject enlisted personnel under military law during the period of enlistment and service.\nQuestion: The Enlistment Act 1970 repeals the previous law and enables the Singapore army to breathe underwater True, False, or Neither?", "doc_id": 501, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23148, 39923, 22477, 30919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH).\nQuestion: a lot of people think the IDH is bogus True, False, or Neither? Neither\n###\n\"I'm Living in Two Worlds\" is a song written by Jan Crutchfield, which was recorded and released by American country artist Bonnie Guitar. The song reached number nine on the \"Billboard\" Hot Country Singles chart and number ninety-nine on the \"Billboard\" Hot 100 in early 1966. \"I'm Living in Two Worlds\" became Guitar's first Country top-ten single and her first charting single since 1959.\nQuestion: jan clutchfield song was recorded,edited and released by guitar True, False, or Neither? Neither\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani.\nQuestion: The Pear Tree is a movie about the Iranian bourgeoisie True, False, or Neither? True\n###\nKJEF-CA, channel 13, was a class A television station in Jennings, Louisiana. Owned by Townsquare Media, the station was an independent station. It was the only television station owned by Townsquare, a company that otherwise specializes exclusively in radio.\nQuestion: KJEF-CA had 500 employees True, False, or Neither? Neither\n###\nJames Conroy (born February 6, 1977) is an American voice actor, television writer and actor. 
He is known for appearing on television shows, such as \"Celebrity Deathmatch\", \"Kenny the Shark\" and \"Fetch! with Ruff Ruffman\", radio commercials and video games. He worked for companies such as WGBH, The Walt Disney Company and Discovery Channel.\nQuestion: James Conroy is currently dead. True, False, or Neither?", "doc_id": 374, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2848, 45319, 21012, 6090], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "St. Ives Town F.C. is a football club based in St Ives, Cambridgeshire, England. They play in the Southern League Premier Division. This St Ives Town should not be confused with the Cornwall Combination team playing in St Ives, Cornwall, which is also called St Ives Town F.C.\nQuestion: St Ives is a popular name for football. True, False, or Neither? Neither\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers.\nQuestion: McColo was open in 2009. True, False, or Neither? False\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: nobody lives in st clements True, False, or Neither? False\n###\nThe 1997 Indian vice-presidential election was held on 16 August 1997 to elect Vice-President of India. Krishan Kant defeated Surjit Singh Barnala to become 10th Vice-President of India. At the time of the election, VP office was vacant since the incumbent, K. R. Narayanan, had already inaugurated as President following his victory in the presidential election.\nQuestion: The 1997 Indian vice-presidential election elected the 8th vp True, False, or Neither? False\n###\nThe 2007 Internazionali BNL d'Italia was the 2007 edition of the Rome Masters tennis tournament. The men's tournament was part of the 2007 ATP Masters Series and was held on May 5-13. The women's event was a 2007 WTA Tier I Series event and was held on May 13-20.\nQuestion: The 2007 Internazionali BNL d'Italia occurred in the Southern hemisphere True, False, or Neither?", "doc_id": 989, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41171, 5197, 29888, 14468], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Eglinton Castle estate was situated at Irvine, on the outskirts of Kilwinning, North Ayrshire, Scotland (map reference NS 3227 42200) in the former district of Cunninghame. Eglinton Castle, was once home to the Montgomeries, Earls of Eglinton and chiefs of the Clan Montgomery. Eglinton Country Park now occupies part of the site.\nQuestion: The Eglinton Castle estate was once home to the famous Montgomeries. True, False, or Neither? True\n###\nAndrea Louise Riseborough (born 20 November 1981) is an English stage and film actress. Her film appearances include \"Birdman or (The Unexpected Virtue of Ignorance)\", \"Oblivion\", \"Welcome to the Punch\", \"Disconnect\", \"Shadow Dancer\", \"W.E.\", \"Brighton Rock\", \"Made in Dagenham\", \"Never Let Me Go\", \"Happy-Go-Lucky\", and \"Venus\".\nQuestion: Andrea Louise Riseborough has played in less than 8 films. True, False, or Neither? False\n###\nEllon Castle is a scheduled monument within the town of Ellon, Aberdeenshire. Only ruins survive of the 16th-century structure that may incorporate sections from the 15th century together with 18th-century renovations. The ruins form a focal point in a formal 6 acre garden planted in 1745; an older Category A listed sundial dating from c. 1700 forms the centrepiece to the garden.\nQuestion: Ellon Castle has been seen by Trump. True, False, or Neither? Neither\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel.\nQuestion: Kasey Peters was also a member of Tr-Cities Fever. True, False, or Neither? True\n###\nFifth Harmony is the self-titled third studio album by American girl group Fifth Harmony, released on August 25, 2017, through Syco Music and Epic Records. Its lead single, \"Down\", which features rapper Gucci Mane, was released on June 2, 2017. It is the group's first album following the departure of Camila Cabello in December 2016.\nQuestion: The girl group Fifth Harmony formed in two thousand eleven. True, False, or Neither?", "doc_id": 612, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22322, 6391, 18290, 23586], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "On 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The man had depression. True, False, or Neither? 
Neither\n###\nState Route 360 (SR 360) is a state highway in the southern portion of Mineral County, Nevada, United States. The route connects the former town of Basalt to the rest of Mineral County. A road has been in the place of SR 360 since 1919, and became State Route 10 by 1929.\nQuestion: Basalt is a populous town True, False, or Neither? Neither\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc.\nQuestion: Ranila is a city in the Charkhi Dadri district of the Indian state of Haryana. True, False, or Neither? False\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day.\nQuestion: Tunnel Vision has been read by Carla. True, False, or Neither? Neither\n###\nTakeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and EMI Music Publishing. It was founded by English rapper Kwasi Danquah III (commonly known as Tinchy Stryder) and EMI Music Publishing\u2019s UK president and EMI European creative president, Guy Moot, as a publishing arm solely for Danquah's music in December 2008.\nQuestion: Takeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and one other subsidiary. True, False, or Neither?", "doc_id": 87, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11265, 13422, 27289, 25960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Matoian (born 1949) is a businessman and television industry executive. He was a vice-president of the CBS Entertainment division. He later became the president of Entertainment at Fox Broadcasting in September 1995. In 1996 he became the president of HBO.\nQuestion: John Matoian became vice-president at HBO in 1996. True, False, or Neither? False\n###\nCastle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel.\nQuestion: Castle Wolfenstein was released during the 20th century True, False, or Neither? True\n###\nO'Donnell High School is a 1A high school located in O'Donnell, Texas (USA). It is part of the O'Donnell Independent School District located in southeast Lynn County. In 2011, the school was rated \"Academically Acceptable\" by the Texas Education Agency.\nQuestion: O'Donnell High School is a 1A school. 
True, False, or Neither? True\n###\nIn poker the term Triple Crown is used for winning a poker title on the three major poker tours: The World Series of Poker (WSOP), World Poker Tour (WPT) and up to 2016 the European Poker Tour (EPT). Since the EPT has been discontinued and rebranded as the PokerStars Championship, those wins are considered part of the crown.\nQuestion: Triple Crown is a poker term referred to winning the title in two major tours True, False, or Neither? False\n###\n\"White as Snow\" is a song by Irish rock band U2 and the ninth track on their 2009 album \"No Line on the Horizon\". It was written from the perspective of a dying soldier serving in Afghanistan, and lasts the length of time it takes him to die. The track is based on the hymn \"Veni, veni Emmanuel\", and is the only political song on the album.\nQuestion: \"White as Snow\" is the only political song by U2. True, False, or Neither?", "doc_id": 393, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41305, 15863, 25902, 11312], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Debra Hayward is a British film producer. As Head of Film at Working Title Films, Hayward frequently served as an executive producer for the company's feature films, working alongside fellow Working Title executive Liza Chasin. After producing \"Les Mis\u00e9rables\", she started her own production company; Monumental Pictures.\nQuestion: Debra Hayward speaks four languages. True, False, or Neither? Neither\n###\nGrantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station.\nQuestion: BP is the only business near Grantham North Services. True, False, or Neither? Neither\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships.\nQuestion: Angus Scott (16 August 127 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. True, False, or Neither? False\n###\nBuilth Castle (Welsh: \"Castell Llanfair-ym-Muallt\" ) was a castle built under King Edward I, just outside Builth Wells, Powys, Wales. At one time it was an impressive stone-built castle but all the masonry has been removed over the years and all that remains are the mound on which it stood, the ditches and embankments.\nQuestion: Builth Castle is a tourist attraction True, False, or Neither? Neither\n###\nThe discography of Death, a metal band, consists of seven studio albums and four live albums. Death was an American metal band founded in 1983. The band's founder, Chuck Schuldiner, is considered \"a pioneering force in death metal and grindcore\". 
The band ceased to exist after Schuldiner died of brain cancer in 2001, though it remains an enduring metal brand.\nQuestion: Schuldiner died from complications of chemotherapy. True, False, or Neither?", "doc_id": 243, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42711, 9211, 17890, 1774], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Timothy Donald Cook (born November 1, 1960) is an American business executive, industrial engineer, and developer. Cook is the Chief Executive Officer of Apple Inc., previously serving as the company's Chief Operating Officer, under its founder Steve Jobs.\nQuestion: Timothy Donald Cook passed away last year. True, False, or Neither? False\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521.\nQuestion: The City of Port Phillip is in the south and in the south-west is Port Melbourne. True, False, or Neither? True\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris.\nQuestion: Vampire Diaries had an episode written by a woman. True, False, or Neither? True\n###\n\"Touch Me With Your Love\" is a song by Beth Orton, released as the fourth single from 1996 album \"Trailer Park\". It contains 4 songs, and was released on C.D. and vinyl. The release peaked at #60 in the UK official singles chart. It was also released in Australia with a different track listing, and was the first release by Orton to have a promotional video made for it.\nQuestion: trailer park was only released on vinyl True, False, or Neither? False\n###\nThe position of South African ambassador to the United States is the most prestigious and top diplomatic post in South Africa. The position was first held in March 1949, following the upgrade of South Africa's diplomatic mission to an embassy. The post has been held by many important politicians and is currently held by M. J. Mahlangu.\nQuestion: South African Ambassador was first held while Taft was president of the United States. True, False, or Neither?", "doc_id": 268, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32658, 40886, 42473, 25256], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Williston Municipal Airport (FAA LID: X60) is a city-owned, public-use airport located two\u00a0nautical miles (4\u00a0km) southwest of the central business district of Williston, a city in Levy County, Florida, United States. Commonly referred to as Williston Airport, it is located 23 mi southwest of Gainesville Regional Airport (GNV). Opened in 1974 for public use, it does not have a control tower.\nQuestion: Williston Municipal Airport is in a state of disrepair True, False, or Neither? Neither\n###\nCarl Filip Anton Forsberg (] ; born 13 August 1994) is a Swedish professional ice hockey player. He is an alternate captain for the Nashville Predators of the National Hockey League (NHL). Forsberg was selected by the Washington Capitals in the first round (11th overall) of the 2012 NHL Entry Draft.\nQuestion: Carl Filip Anton Forsberg was selected 3rd overall in the 2011 NHL Entry Draft, but didn't sign a contract with the team that drafted him True, False, or Neither? False\n###\nThe Little Girl Next Door is a 1912 American silent short drama directed by Lucius Henderson and written by Philip Lonergan. The film starred William Garwood and Marguerite Snow in the lead roles. Prints of the film are in the Library of Congress and other collections.\nQuestion: Many prints of the Little Girl Next Door are in the Library of Congress. True, False, or Neither? True\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017.\nQuestion: King Gizzard & the Lizard Wizard were great friends True, False, or Neither? Neither\n###\nZero to One: Notes on Startups, or How to Build the Future is a 2014 book (release date September 16, 2014) by venture capitalist, PayPal co-founder, and early Facebook investor Peter Thiel along with Blake Masters. It is a condensed and updated version of a highly popular set of online notes taken by Masters for the CS183 class on startups taught by Thiel at Stanford University in Spring 2012.\nQuestion: The full name of the 2014 book by Peter Thiel and Blake Masters is called Notes on Startups, or How to Build the Future. True, False, or Neither?", "doc_id": 987, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8172, 4418, 26078, 26820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maris Soule (born June 19, 1939) is an American author of romance and romantic suspense novels, mysteries, and short stories. Her latest book, \"Eat Crow and Die\", is a mystery novel. 
Her books feature a variety of settings and situations, including the Iditarod Trail Sled Dog Race, Search and Rescue dogs, barrel racing, dressage, and a Rhodesian Ridgeback puppy.\nQuestion: Maris Soule was a South American author. True, False, or Neither? False\n###\nThe Emami Kolkata Open ATP Challenger Tour (formerly known as State Bank of India ATP Challenger Tour) is a professional tennis tournament played on outdoor hard courts. It is currently part of the Association of Tennis Professionals (ATP) Challenger Tour. It is held annually at the Bengal Tennis Association Stadium in Kolkata, India since 2014.\nQuestion: The tour was played in 2018. True, False, or Neither? True\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event.\nQuestion: The 1997 Porsche Tennis Grand Prix took place in 1995 True, False, or Neither? False\n###\nThe UK Overseas Territories Conservation Forum (UKOTCF) is a UK-based non-governmental organisation which promotes coordinated conservation in the UK Overseas Territories and Crown Dependencies (UKOTs and CDs). It is a not-for-profit organisation supported by grants, donations and subscriptions, and a registered charity and company.\nQuestion: UKOTCF is not based in france. True, False, or Neither? True\n###\nBest of 4Minute is the first Japanese compilation album by the South Korean girl group 4Minute. It is composed of all the Japanese tracks released by the group since their debut in Japan. It was released on September 26, 2012 in three different editions: 2 limited CD+DVD (Type A with a live event and Type B with all Japanese music videos) and a Regular edition.\nQuestion: Subsequent to their debut in Japan, 4Minute eventually released three editions of their Japanese compilation album. True, False, or Neither?", "doc_id": 819, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8790, 26659, 42824, 3383], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Canyons is a 2013 American erotic thriller-drama film directed by Paul Schrader and written by Bret Easton Ellis. The film is set in Los Angeles and stars Lindsay Lohan, James Deen, Nolan Funk, Amanda Brooks, and Gus Van Sant. It received a limited release on August 2, 2013 at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms.\nQuestion: People at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms watched a limited release of a film called The canyons. True, False, or Neither? True\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. 
This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo.\nQuestion: Celly Cel is a very proliffic rapper True, False, or Neither? Neither\n###\nThe NBA Finals is the championship series of the National Basketball Association (NBA). The entrants are determined by the victors of the Eastern and Western conferences, who engage in a best-of-seven game series to determine the league champion. The winners of the Finals are awarded the Larry O'Brien Championship Trophy, which replaced the Walter A. Brown Trophy in 1983.\nQuestion: the entrants will play 7 games True, False, or Neither? True\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird.\nQuestion: The North African ostrict or red-necked ostrich is the largest bird in Noth Africa that can fly. True, False, or Neither? Neither\n###\nArthur Charles Valerian Wellesley, 9th Duke of Wellington, (born 19 August 1945) is a British aristocrat and politician. He has served as Conservative Party Member of the European Parliament (1984\u20131989) for Surrey West and currently sits as a hereditary peer in the House of Lords since 2015.\nQuestion: The House of Lords has been around since 1945. True, False, or Neither?", "doc_id": 331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28918, 32239, 17823, 17708], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frederick Wiseman (born January 1, 1930) is an American filmmaker, documentarian, and theatre director. His work is \"devoted primarily to exploring American institutions\". He has been called \"one of the most important and original filmmakers working today\".\nQuestion: Frederick Wiseman is disliked by some people True, False, or Neither? Neither\n###\nJunoon (Hindi: \u091c\u0941\u0928\u0942\u0928, translation: \"The Obsession\") is a 1978 Indian Hindi language film produced by Shashi Kapoor and directed by Shyam Benegal. The film is based on Ruskin Bond's fictional novella, \"A Flight of Pigeons\", set around the Indian Rebellion of 1857. The film's soundtrac was composed by Vanraj Bhatia, and cinematography by Govind Nihalani.\nQuestion: A flight of pigeons is a multi part novella True, False, or Neither? Neither\n###\nCari Elizabeth Roccaro (born July 18, 1994) is an American soccer defender from East Islip, New York. She currently plays for the United States under-20 women's national soccer team and helped the team win the 2012 FIFA Under-20 Women's World Cup held in Tokyo, Japan. She previously played for the New York Fury in the WPSL Elite.\nQuestion: Cari Elizabeth Roccaro is 30 years old this year. True, False, or Neither? False\n###\nGreat Balls of Fire! is a 1989 American biographical film directed by Jim McBride and starring Dennis Quaid as rockabilly pioneer Jerry Lee Lewis. Based on a biography by Myra Lewis and Murray M. 
Silver Jr., the screenplay is written by McBride and Jack Baran. The film is produced by Adam Fields, with executive producers credited as Michael Grais, Mark Victor, and Art Levinson.\nQuestion: Great Balls of Fire! is a film. True, False, or Neither? True\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels.\nQuestion: Louis Marson played in baseball legal as an outfielder. True, False, or Neither?", "doc_id": 961, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2752, 15267, 40427, 25846], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Valentine is a 2001 American slasher film directed by Jamie Blanks, and starring Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. Loosely based on the novel of the same name by Tom Savage, the film follows a group of women in San Francisco who are stalked by a man whom they tormented during their childhood.\nQuestion: Valentine will have a remake. True, False, or Neither? Neither\n###\nHolly Weber (born September 20, 1984) is an American glamour model and actress. As a model, she has appeared in \"Maxim\", \"FHM\", \"Muscle & Fitness\", \"Glamour\", and as no. 66 on AskMen's Top 99 Most Desirable Women of 2009. She has made uncredited appearances in a number of movies and TV series.\nQuestion: Holly appeared in other shows that were similar to \"Maxim\" True, False, or Neither? Neither\n###\nStephen Tyrone Colbert ( , ; born May 13, 1964) is an American comedian, television host, actor, and writer. He is best known for hosting the satirical Comedy Central program \"The Colbert Report\" from 2005 to 2014, and hosting the CBS talk program \"The Late Show with Stephen Colbert\" beginning in September 2015.\nQuestion: Stephen Colbert, born May 13, 1954, became famous as a result of hosting the satirical ABC talk program, \"The Colbert Report\". True, False, or Neither? False\n###\nVampire Vs Vampire (\u4e00\u7709\u9053\u4eba) is a 1989 Hong Kong comedy horror film directed by and starring Lam Ching-ying. The title references the interaction in the film between a jiangshi child, a creature from Chinese \"hopping\" corpse fiction, and a British vampire based on Western vampire fiction.\nQuestion: Vampire Vs Vampire is a film made in 1345 True, False, or Neither? False\n###\nThe Oakland County Child Killer (OCCK) is an unidentified serial killer responsible for the murders of four or more children, two girls and two boys, in Oakland County, Michigan, United States in 1976 and 1977. Several theories and suspects have been named in the case, but despite all these theories, the cases remain unsolved and the killer(s) have never been identified.\nQuestion: There was more than one person involved in the OCCK murders. 
True, False, or Neither?", "doc_id": 145, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15186, 14417, 5420, 3076], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi.\nQuestion: An Indian crime film was inspired by the movie, \"The Godfather\", which was re-released 5 years later, but dubbed in Tamil. True, False, or Neither? True\n###\nBen Barzman (October 12, 1910 \u2013 December 15, 1989) was a Canadian journalist, screenwriter, and novelist, blacklisted during the McCarthy Era and best known for his screenplays for the films \"Back to Bataan\" (1945), \"El Cid\" (1961), and \"The Blue Max\" (1966).\nQuestion: Ben Barzman was born more than 200 years ago. True, False, or Neither? False\n###\nOgallala is a city in Keith County, Nebraska, United States. The population was 4,737 at the 2010 census. It is the county seat of Keith County. In the days of the Nebraska Territory, the city was a stop on the Pony Express and later along the transcontinental railroad. The Ogallala Aquifer was named after the city.\nQuestion: Ogallala is a city that was once a stop for the transcontinental railroad. True, False, or Neither? True\n###\nAdam Best is a fictional character from the BBC soap opera \"EastEnders\", played by David Proud, the first adult actor with a visible disability to appear regularly in the soap. Both Proud and his character live with spina bifida. The character made his first appearance in the episode broadcast on 10 September 2009 and his last in the one broadcast on 19 July 2010.\nQuestion: david proud have a spina bifida True, False, or Neither? True\n###\nTaki's Magazine, called \"Takimag\" for short, is an online magazine of politics and culture published by the Greek paleoconservative journalist and socialite Taki Theodoracopulos and edited by his daughter Mandolyna Theodoracopulos. Initially called Taki's Top Drawer, the site was redesigned and relaunched under its current title in March 2008 with a subsequent redesign in 2010.\nQuestion: Takimag was redesigned in the fourth month of 2008. True, False, or Neither?", "doc_id": 207, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19237, 1440, 2457, 38838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "After the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal.\nQuestion: The Empire of Japan invaded and occupied the Northeast over 10 Years ago. True, False, or Neither? True\n###\nConcrete Sky was the second EP released by Beth Orton, with the lead track taken from her 2002 album \"Daybreaker\". It contains four songs, and was released on CD. \"Concrete Sky\" features vocals and guitar from Ryan Adams, and was written by Beth Orton and Johnny Marr.\nQuestion: Ryan Adams is a musician. True, False, or Neither? True\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg.\nQuestion: Marie Hedwig Auguste of Sulzbach was forced by her family to marry the Archduke of Austria thereby becoming an archduchess and cementing German ties with Austria. True, False, or Neither? Neither\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: Jim Mooty was named Most Valuable Player along with Maxie Baughan but people thought it should have been someone else. True, False, or Neither? Neither\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\".\nQuestion: The Brown Spectator is a student-run journal that is printed on brightly colored paper. True, False, or Neither?", "doc_id": 455, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4120, 27675, 13765, 2305], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Reckless is the third book in the The It Girl novels by the German American author Cecily von Ziegesar. The series is ghostwritten from the original idea by Ziegesar. The series, aimed toward young adults, is a spin-off from the bestselling \"Gossip Girl\" series. 
It was released in 2006 by Little, Brown.\nQuestion: Cecily von Ziegesar did not write Reckless. True, False, or Neither? True\n###\nAllen West (born October 17, 1967, Brandon, Florida) is an American death metal guitarist who has been a member of Massacre, Obituary, Six Feet Under, Lowbrow, and Southwicked. He is considered to be a pioneering figure of the death metal genre in the 1980s.\nQuestion: Allen West is not well known today True, False, or Neither? Neither\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home.\nQuestion: Netflix will make an adaptation for Coriolano: eroe senza patria in the future True, False, or Neither? Neither\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart.\nQuestion: The song \"Look at My Dab\" originally had another name. True, False, or Neither? True\n###\nThe San Diego Chargers announced their 40th Anniversary Team in 2000 to honor the top players and coaches in the history of the National Football League team. The Chargers began play in 1960 as part of the American Football League. The anniversary team included 31 players and coaches voted on by fans and a media panel. The team became the Los Angeles Chargers after relocating in 2017.\nQuestion: The 40th anniversary team mostly lived in Los Angeles. True, False, or Neither?", "doc_id": 84, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18543, 18760, 22691, 27597], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerry Kupcinet is a five-time Emmy winning director and producer. Kupcinet has directed shows such as \"Judge Judy\", \"Judge Joe Brown\", \"20/20\", \"Home\", \"That's Incredible!\", The Live Aid concert, \"The Dating Game\", \"The Richard Simmons Show\", \"Entertainment Tonight\" and many others.\nQuestion: Jerry Kupcinet has directed \"The Dating Game\" True, False, or Neither? True\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups.\nQuestion: Mohamed Nauter has scored 59 goals in his career. True, False, or Neither? Neither\n###\nState Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. 
It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long.\nQuestion: the route used to be shorter but was doubled out of necessity True, False, or Neither? Neither\n###\nThe 2016 MBC Entertainment Awards () presented by Munhwa Broadcasting Corporation (MBC), took place on December 29, 2016 at MBC Public Hall in Sangam-dong, Mapo-gu, Seoul. It was hosted by Kim Sung-joo, Jun Hyun-moo and Lee Sung-kyung. The nominees were chosen from MBC variety, talk and comedy shows that aired from December 2015 to November 2016.\nQuestion: The 2016 MBC Entertainment Awards () presented by Munhwa Broadcasting Corporation (MBC), took place on november 29, 2016 True, False, or Neither? False\n###\nThe third season of \"Next Great Baker\" aired from November 26, 2012 to February 11, 2013. Like the previous season, this season was set at the Carlo's Bake Shop facility at Lackawanna Center in Jersey City, New Jersey. Unlike the previous two seasons, the finale for this season took place outside of the greater New York City area \u2013 in this case, in Las Vegas, Nevada at The Venetian Las Vegas.\nQuestion: Next show can be done outside of the US True, False, or Neither?", "doc_id": 122, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27351, 25179, 11591, 10796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park.\nQuestion: Newnes railway line has never been closed. True, False, or Neither? False\n###\nDonaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville.\nQuestion: Donaldson Center Airport is in Canada. True, False, or Neither? False\n###\nDickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority.\nQuestion: Dickinson owns an airport in North Dakota True, False, or Neither? True\n###\nNeil Sedaka: Italiano is a 1964 compilation album containing twelve of Neil Sedaka's Italian-language recordings. It was released in Italy by RCA Victor's Italiana studios. Of the twelve songs on the album, six were recorded by Sedaka in English. 
A seventh song on the album, \"A 16 Anni Tu Vuoi Amare\", is an Italian-language version of Andrea Carroll's 1963 hit, \"It Hurts To Be Sixteen\".\nQuestion: Neil Sedaka speaks Italian. True, False, or Neither? True\n###\nNeilson Hubbard is an American singer-songwriter, musician and producer. His first band was called This Living Hand formed with Clay Jones. They signed to Adam Duritz's label, E Pluribus Unum. After the band split up, Hubbard went on to record three solo albums, \"The Slide Project\", \"Why Men Fail\" and \"Sing Into Me\". He also collaborated with Matthew Ryan to form the band Strays Don't Sleep.\nQuestion: \"The Slide Project\" was released under the \"E Pluribus Unum\" label. True, False, or Neither?", "doc_id": 321, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40049, 37429, 32636, 4881], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Love's Labour's Lost is a 2000 adaptation of the comic play of the same name by William Shakespeare, directed by and starring Kenneth Branagh. It was the first feature film to be made of this lesser-known comedy. Branagh's fourth film of a Shakespeare play (he did not direct the 1995 \"Othello\", although he did play Iago), \"Love's Labour's Lost\" was a box-office and critical disappointment.\nQuestion: Kenneth Branagh has directed other film that was adapted after Williams Shakespeare. True, False, or Neither? True\n###\nHomebrew is a free and open-source software package management system that simplifies the installation of software on Apple's macOS operating system. Originally written by Max Howell, the package manager has gained popularity in the Ruby on Rails community and earned praise for its extensibility. Homebrew has been recommended for its ease of use as well as its integration into the command line.\nQuestion: There are other open-source software package management systems that are cheaper True, False, or Neither? Neither\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: Amy Timberlake wrote the Art of the Deal True, False, or Neither? False\n###\nPeter L. N. Padfield (born 1932) is a British author, biographer, historian, and journalist who specializes in naval history and in the Second World War period. His early journalism appeared under the name P. L. N. Padfield. As well as his non-fiction work, he has also published four novels.\nQuestion: Peter L.N. Padfield was an astute observer of literary trends. True, False, or Neither? Neither\n###\nDavid K. Lam is a Chinese-born American technology entrepreneur. He founded Lam Research Corporation in 1980. He presently serves as Chairman of Multibeam Corporation (Santa Clara, CA), which manufactures complementary electron beam lithography (CEBL) systems. 
He also heads the David Lam Group, an investor and business advisor for high-growth technology companies.\nQuestion: The David Lam Group is headed by an American born man True, False, or Neither?", "doc_id": 47, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26610, 12308, 16951, 31142], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming.\nQuestion: Cape Vakop was chartered over 60 years ago True, False, or Neither? True\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release.\nQuestion: Shameless Self-Promotion is the first album True, False, or Neither? True\n###\nMetal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed.\nQuestion: Kojima directed Metal Gear. True, False, or Neither? True\n###\nOlivia Genevieve Wells (born 29 April 1994 in Melbourne) is an Australian charity worker and beauty pageant titleholder who was crowned Miss Universe Australia 2013 and represented Australia at Miss Universe 2013 in Moscow, Russia on 9 November 2013. She is the first woman to be crowned Miss Universe Australia from the state of Victoria.\nQuestion: Olivia Genevieve Wells was from Moscow. True, False, or Neither? False\n###\nBilly Jacques was a rugby union and professional rugby league footballer who played in the 1890s, and 1900s, playing representative level rugby union (RU) for Yorkshire, and at club level for Hull F.C. (Prior to the 1895\u201396 Northern Rugby Football Union season, Hull F.C. was a rugby union club), and playing club level rugby league (RL) for St. Helens, and Hull F.C.\nQuestion: he was the most successful rugby player in the 80's True, False, or Neither?", "doc_id": 856, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40316, 25337, 608, 43953], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rutgers University (officially known as Rutgers, The State University of New Jersey) is an institution of higher learning with campuses across the State of New Jersey its main flagship campus in New Brunswick and Piscataway, and two other campuses in the cities of Newark and Camden, New Jersey.\nQuestion: Rutgers University has only four campuses True, False, or Neither? True\n###\nThe 1941 U.S. Open was the 45th U.S. Open, held June 5\u20137 at Colonial Country Club in Fort Worth, Texas. Craig Wood, who had lost in a playoff at the U.S. Open two years earlier, finally broke through and claimed his first U.S. Open title, three strokes ahead of runner-up Denny Shute in sweltering heat. Eight years earlier, Shute had defeated him in a playoff at the 1933 British Open.\nQuestion: The 1941 U.S. Open was held on the 5th True, False, or Neither? True\n###\nLena \"Lenny\" Kaligaris is a fictional character in \"The Sisterhood of the Traveling Pants\", a best-selling series of young adult novels by Ann Brashares. In the 2005 film version of the first book, and the 2008 sequel, \"The Sisterhood of the Traveling Pants 2\", she is portrayed by Alexis Bledel.\nQuestion: Ann Brashares doesn't know how to read. True, False, or Neither? False\n###\n\"You'll Be Back\" is the seventh song from Act 1 of the musical \"Hamilton\", based on the life of Alexander Hamilton, which premiered on Broadway in 2015. Lin-Manuel Miranda wrote both the music and lyrics to the song. It is sung by Jonathan Groff in the show's original cast recording.\nQuestion: The song was first performed by Groff. True, False, or Neither? True\n###\nKirill Olegovich Starkov (Russian:\u041a\u0438\u0440\u0438\u043b\u043b \u041e\u043b\u0435\u0433\u043e\u0432\u0438\u0447 \u0421\u0442\u0430\u0440\u043a\u043e\u0432, born March 31, 1987), is a professional Danish ice hockey player. He is playing for HC Red Ice in the Swiss National League B. He has previously played for CSKA Moscow, Syracuse Crunch, Youngstown Steelhounds, Red Deer Rebels, Fr\u00f6lunda HC, Timr\u00e5 IK, Esbjerg IK and IK Oskarshamn.\nQuestion: Youngstown Steelhounds is a hockey team in the Swiss National League. True, False, or Neither?", "doc_id": 248, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35648, 34069, 28829, 41098], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "weRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo.\nQuestion: weRead is on the internet True, False, or Neither? 
True\n###\nThe London Saturday Journal was a general interest magazine publishing short fiction and nonfiction pieces published in London, England in the Victorian era. The magazine was published by William Smith. During its existence the magazine had four volumes the last of which was issued in 1842.\nQuestion: The London Saturday Journal was the best selling magazine in London, England in 1841 True, False, or Neither? Neither\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015.\nQuestion: the winner of the 211 red bull manny mania amateur u.s. championship is sebo walker True, False, or Neither? True\n###\nLA1:TV (strictly the Lancaster University Student Television Station, often simply LA1) is a non-profit student television station at Lancaster University. It is a constitutional part of the Lancaster University Students' Union (LUSU) but is run as an independent student society. Some of LA1\u2019s current programmes include \"Good Morning Lancaster\" (GML), \"Sugar TV\", and \"Sound Booth\".\nQuestion: Good Morning Lancaster has received several primetime Emmy awards. True, False, or Neither? Neither\n###\nThe Highway of Hope is a 1917 American Western silent film directed by Howard Estabrook and written by Harvey Gates and Willard Mack. The film stars House Peters, Sr., Kathlyn Williams, Jim Farley and Harry De Vere. The film was released on May 17, 1917, by Paramount Pictures.\nQuestion: The Highway of Hope had five actors in it. True, False, or Neither?", "doc_id": 864, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36152, 26174, 35530, 35480], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stuart is a city in, and the seat of, Martin County, Florida, United States. Located on Florida's Treasure Coast, Stuart is the largest of four incorporated municipalities in Martin County. The population was 15,593 in the 2010 census. It is part of the Port St. Lucie, Florida Metropolitan Statistical Area.\nQuestion: Stuart is very rich in history. True, False, or Neither? Neither\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla.\nQuestion: The icon is normally orange, with hex code #FA9B32. The original icon was created by Stephen Horlander, a designer at Mozilla. True, False, or Neither? False\n###\nUSS \"Christopher\" (DE-100) was a Cannon class destroyer escort built for the U.S. Navy during World War II. She served in the Atlantic Ocean and provided escort service against submarine and air attack for Navy vessels and convoys. 
She was named for a Navy Cross recipient, Harold Jensen Christopher, who was killed at Pearl Harbor aboard on 7 December 1941.\nQuestion: US CHRISTOPER was built for world war I True, False, or Neither? False\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines is a for-profit organization True, False, or Neither? False\n###\nThe 2015 Latrobe City Traralgon ATP Challenger was a professional tennis tournament played on outdoor hard court. It was the fourth edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Traralgon, Australia between 26 October \u2013 1 November 2015.\nQuestion: The 2015 Latrobe City Traralgon ATP Challenger had a different name True, False, or Neither?", "doc_id": 687, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13925, 21128, 42139, 16695], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mannlicher\u2013Sch\u00f6nauer (sometimes Anglicized as \"Mannlicher Schoenauer,\" Hellenized as \u03a4\u03c5\u03c6\u03ad\u03ba\u03b9\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1 or \u038c\u03c0\u03bb\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1-\u03a3\u03b5\u03bd\u03ac\u03bf\u03c5\u03b5\u03c1) is a type of rotary-magazine bolt-action rifle produced by Steyr Mannlicher for the Greek Army in 1903 and later was also used in small numbers by the Austro-Hungarian armies. Post war use was for civilian use such as hunting and target practice.\nQuestion: The Mannlicher\u2013Sch\u00f6nauer killed the most people. True, False, or Neither? Neither\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\".\nQuestion: Linkin Park is from America. True, False, or Neither? True\n###\nCity Hall Station is a station on Seoul Subway lines 1 (Blue Line) and 2 (Green Line). As its name suggests, Seoul City Hall is located right next to the station. Deoksugung, a historic palace of the Joseon dynasty, is on the other side of the boulevard named Taepyeongno.\nQuestion: You must take the train to get from City Hall Station to Seoul City Hall True, False, or Neither? Neither\n###\nFS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani.\nQuestion: Kozani Stadium is not located in Athens. True, False, or Neither? 
True\n###\n\"Sultans of Swing\" is a song by British rock band Dire Straits from their eponymous debut album, which band frontman Mark Knopfler wrote and composed. Although it was first released in 1978, it was its 1979 re-release that caused it to become a hit in both the UK and U.S.\nQuestion: \"Sultans of Swing\" is a song by British rock band Dire Straits that was released and re-released only one years apart. True, False, or Neither?", "doc_id": 427, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44037, 7452, 16967, 16407], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Lonely Londoners is a 1956 novel by Tamil Trinidadian author Samuel Selvon. Its publication marked the first literary work focusing on poor, working-class blacks in the beat writer tradition following the enactment of the British Nationality Act 1948.\nQuestion: The author was African American. True, False, or Neither? Neither\n###\nRodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets.\nQuestion: Rodrequis La'Vant Stephens lived in Georgia, and played in the NFL for multiple teams. True, False, or Neither? True\n###\nThe Eolica Sarichioi Wind Farm is a proposed wind power project in Sarichioi, Tulcea County, Romania. It will consist of eight individual wind farms connected together. It will have 51 individual wind turbines with a nominal output of around 2 MW which will deliver up to 102 MW of power, enough to power over 66,700 homes, with a capital investment required of approximately US$110 million.\nQuestion: The Eolica Sarichioi Wind Farm will be profitable. True, False, or Neither? Neither\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis.\nQuestion: Escape from Suburbia: Beyond the American Dream made millions. True, False, or Neither? Neither\n###\nEuroprop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas.\nQuestion: Snecma and Rolls-Royce are two European aircraft engine manufacturers. True, False, or Neither?", "doc_id": 172, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29104, 21399, 5993, 24962], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time.\nQuestion: Thomas Jefferson was the governor of Virginia True, False, or Neither? True\n###\nRoderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins.\nQuestion: Roderick Dwayne \"Rod\" Higgins is fat. True, False, or Neither? Neither\n###\nMike Hoffman (born September 20, 1980) is an American former professional ice hockey player. After leaving the University of Connecticut in 2003, he began his first pro season playing with the Worcester IceCats in the AHL and the Peoria Rivermen of the ECHL. He signed a professional contract with the Toronto Maple Leafs in 2005, but he has never played in the National Hockey League.\nQuestion: Mike Hoffman is still as good at hockey today as he was 2 years ago. True, False, or Neither? Neither\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line.\nQuestion: The Takoma Langley Crossroads Transit Center is a favourite of bus drivers True, False, or Neither? Neither\n###\nThe Eolica Sarichioi Wind Farm is a proposed wind power project in Sarichioi, Tulcea County, Romania. It will consist of eight individual wind farms connected together. It will have 51 individual wind turbines with a nominal output of around 2 MW which will deliver up to 102 MW of power, enough to power over 66,700 homes, with a capital investment required of approximately US$110 million.\nQuestion: Sarichioi is a rural city. True, False, or Neither?", "doc_id": 879, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22316, 36920, 42636, 7567], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hell's Kitchen Australia is an Australian cooking reality competition television series which premiered on the Seven Network on 6 August 2017. 
The series is hosted by British chef Marco Pierre White, who previously hosted two seasons of the British version of the format and appeared in rival program \"MasterChef Australia\".\nQuestion: \"MasterChef Australia\" is better than hell's kitchen True, False, or Neither? Neither\n###\nUniversity of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund.\nQuestion: University of Maryland Eastern Shore is a rubbish university True, False, or Neither? Neither\n###\nMaurice Anthony Foley (9 October 1925 \u2013 8 February 2002) was a British Labour Party politician. He was elected as Member of Parliament (MP) for West Bromwich at a by-election in 1963, and represented the constituency until his resignation from the House of Commons in 1973. His successor in the resulting by-election was the future Speaker of the British House of Commons, Betty Boothroyd.\nQuestion: Maurice Anthony Foley currently has a long list of likes. True, False, or Neither? Neither\n###\nLeonard \"Boogie\" Weinglass (born 1941) is a charismatic American businessman who founded retailer Merry-Go-Round, a chain of restaurants named Boogie\u2019s Diner, and whose early life was portrayed by actor Mickey Rourke in the 1982 classic American film \"Diner\".\nQuestion: Leonard Wineglass's popularity as a Google search term rose when the movie with Mickey Rourke was released. True, False, or Neither? Neither\n###\nBallads of Sacco & Vanzetti is a set of ballad songs, written and performed by Woody Guthrie, related to the trial, conviction and execution of Sacco and Vanzetti. The series was commissioned by Moe Asch in 1945 and recorded in 1946 and 1947. Guthrie never completed the project and was unsatisfied by the result. The project was released later in its abandoned form by Asch.\nQuestion: Moe Asch completed the Ballads of Sacco & Vanzetti after the writer, Woody Guthrie, abandoned the project. True, False, or Neither?", "doc_id": 865, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43421, 42124, 9876, 18694], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia.\nQuestion: The Head of The Investigative Committee of Russia was born in the summertime, True, False, or Neither? 
True\n###\nALGOL 68 (short for ALGOrithmic Language 1968) is an imperative computer programming language that was conceived as a successor to the ALGOL 60 programming language, designed with the goal of a much wider scope of application and more rigorously defined syntax and semantics.\nQuestion: ALGOL 68 is important. True, False, or Neither? Neither\n###\nA governorate is an administrative division of a country. It is headed by a governor. As English-speaking nations tend to call regions administered by governors either states, provinces, or colonies, the term \"governorate\" is often used in translation from non-English-speaking administrations.\nQuestion: A governorate is made up of over 1000 people. True, False, or Neither? Neither\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister.\nQuestion: Lou Gosset was not involved in J.D.'s Revenge True, False, or Neither? False\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla.\nQuestion: The Feed icon was invited for the use of RSS. True, False, or Neither?", "doc_id": 487, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35870, 32093, 5218, 21447], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nicotiana langsdorffii, Langsdorff's tobacco, is a species of the \"Nicotiana\" genus (tobacco). It is an annual plant with large leaves (up to 10 inches long) with tall 2 inch nodding long tubular bell shaped flowers that are apple green in colour, with blue anthers. \"N. langsdorfii\" lacks fragrance unlike some of the other tall species. It is grown as an ornamental garden plant.\nQuestion: Langsdorff's tobacco is a not a very popular green plant True, False, or Neither? Neither\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: Tasmania was made popular due to a television cartoon series. True, False, or Neither? Neither\n###\nMount Doom is a fictional volcano in J.R.R. Tolkien's Middle-earth legendarium. It is located in the northwest of the Black Land of Mordor and close to Barad-d\u00fbr. Alternative names, in Tolkien's invented language of Sindarin, include Orodruin (\"fiery mountain\") and Amon Amarth (\"mountain of fate\").\nQuestion: Mount Doom is located in Mordor True, False, or Neither? True\n###\nFinniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. 
Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla.\nQuestion: There are more than 5 towns in South Australia. True, False, or Neither? True\n###\nLuke Strong is a fictional character from the British ITV soap opera, \"Coronation Street\". Portrayed by Craig Kelly, the character appeared throughout 2009. Luke took over Carla Connor's share of the Underworld factory with Tony Gordon. He knew Carla's deceased husband, Paul Connor.\nQuestion: Luke Strong and Carla Connor worked together True, False, or Neither?", "doc_id": 323, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28511, 21532, 16538, 39097], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vampire Vs Vampire (\u4e00\u7709\u9053\u4eba) is a 1989 Hong Kong comedy horror film directed by and starring Lam Ching-ying. The title references the interaction in the film between a jiangshi child, a creature from Chinese \"hopping\" corpse fiction, and a British vampire based on Western vampire fiction.\nQuestion: Lam Ching-ying's favorite film is Vampire Vs Vampire True, False, or Neither? Neither\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex.\nQuestion: Croton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It has an ugly red and thick latex. True, False, or Neither? Neither\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister.\nQuestion: Revenge is a common theme in blaxploitation movies. True, False, or Neither? Neither\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene.\nQuestion: \"Something from Nothing\" was the lead single from the Foo Fighters' eponymous studio album. True, False, or Neither? False\n###\nThe third season of \"Gossip Girl,\" an American teen drama based upon the book series by Cecily von Ziegesar. Developed for television by Josh Schwartz and Stephanie Savage. Airing on The CW from September 14, 2009 to May 17, 2010 with 22 episodes. 
The season premiered 2.55 million viewers and a 1.4 Adults 18-49 rating, up 14% in viewers from its season two finale.\nQuestion: \"Gossip Girl\" is a teen drama that attracts adult audience as well. True, False, or Neither?", "doc_id": 839, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32474, 25085, 16764, 20238], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada.\nQuestion: The ACT has events in New York City. True, False, or Neither? Neither\n###\nKaalamellam Kaathiruppen (Tamil: \u0b95\u0bbe\u0bb2\u0bae\u0bc6\u0bb2\u0bcd\u0bb2\u0bbe\u0bae\u0bcd \u0b95\u0bbe\u0ba4\u0bcd\u0ba4\u0bbf\u0bb0\u0bc1\u0baa\u0bcd\u0baa\u0bc7\u0ba9\u0bcd ; English: I Will Wait Forever ) is 1997 Tamil romance film directed by R. Sundarrajan. The film stars Vijay and Dimple in the lead roles, while R. Sundarrajan, Jaishankar, Srividya, Karan, Manivannan play other pivotal roles. The music for the film was composed by Deva and the film released on 14 January 1997.\nQuestion: Deva was one of the lead roles in the movie. True, False, or Neither? False\n###\n\"Outro\" is a song by French electronic music artist M83, released as the final track on the group's sixth studio album, \"Hurry Up, We're Dreaming\" (2011). It is a dramatic, symphonic rock song which has evoked \"heartbreak, nostalgia, anticipation, jubilation and triumph\".\nQuestion: Outro was sung by Obama. True, False, or Neither? Neither\n###\nErnest Guiraud (] ; 26 June 1837 \u2013 6 May 1892) was a French composer and music teacher born in New Orleans, Louisiana. He is best known for writing the traditional orchestral recitatives used for Bizet's opera \"Carmen\" and for Offenbach's opera \"Les contes d'Hoffmann\" (\"The Tales of Hoffmann\").\nQuestion: Ernest Guiraud wrote \"Carmen\" True, False, or Neither? False\n###\nFerry County is a county located in the U.S. state of Washington. As of the 2010 census, the population was 7,551, making it the fourth-least populous county in Washington. The county seat and largest city is Republic. The county was created out of Stevens County on February 21, 1899 and is named for Elisha P. Ferry, the state's first governor.\nQuestion: U.S. state of Washington has a population bigger 7,556. True, False, or Neither?", "doc_id": 826, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27954, 16281, 24885, 593], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: Joseph Smith had a Mother. True, False, or Neither? True\n###\nDual-role transvestism is the formal diagnosis used by psychologists and physicians to describe people who wear clothes of the opposite sex to experience being the opposite sex temporarily, but don't have a sexual motive or want gender reassignment surgery. The International Classification of Diseases (ICD-10) list three diagnostic criteria for \"Dual-role transvestism\" (F64.1):\nQuestion: Dual-role transvestism is what Bowie has. True, False, or Neither? Neither\n###\nHim & Her is a British television sitcom about a lazy twenty-something couple: Steve and Becky, who live in Walthamstow, London. It was first broadcast in the United Kingdom on BBC Three on 6 September 2010. It is written by Stefan Golaszewski and stars Russell Tovey and Sarah Solemani. The theme tune is the song \"Boom Bang-a-Bang\" by Lulu.\nQuestion: Steve and Becky combined age is 60 True, False, or Neither? False\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others.\nQuestion: Joseph Lovett won an academy award for this documentary. True, False, or Neither? Neither\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla.\nQuestion: The Feed icon was created by a team of 3 people. True, False, or Neither?", "doc_id": 79, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13904, 32942, 21610, 44948], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o starts with a J. 
True, False, or Neither? True\n###\nThe United Nations Peacekeepers Medal (Irish: \"An Bonn Chosant\u00f3ir\u00ed Sioch\u00e1na na N\u00e1isi\u00fan Aontaithe\" ) is awarded to those members of the Irish Defence Forces or Chaplaincy Service who have served overseas on a United Nation Mission or United Nations Mandated Mission.\nQuestion: The United Nations Peacekeepers Medal is awarded every December. True, False, or Neither? Neither\n###\nSaiyuki (\u6700\u904a\u8a18 , Saiy\u016bki ) is a manga series by Kazuya Minekura which was serialized in \"G-Fantasy\" from 1997 to 2002. It spawned multiple manga sequels, anime adaptations, video games and other media. The story is loosely based on the Chinese novel \"Journey to the West\".\nQuestion: Saiyuki was born to fantasy in the chinese novel\\ True, False, or Neither? Neither\n###\nYear 493 BC was a year of the pre-Julian Roman calendar. At the time, it was known as the Year of the Consulship of Auruncus and Viscellinus (or, less frequently, year 261 \"Ab urbe condita\"). The denomination 493 BC for this year has been used since the early medieval period, when the Anno Domini calendar era became the prevalent method in Europe for naming years.\nQuestion: 493 BC was 100 years ago True, False, or Neither? False\n###\nPhichai Railway Station is a railway station located in Nai Mueang Subdistrict, Phichai District, Uttaradit. It is located 447.553\u00a0km from Bangkok Railway Station and is a class 2 railway station. It is on the Northern Line of the State Railway of Thailand. Phichai Railway Station opened as part of the Northern Line extension from Phitsanulok to Ban Dara Junction in November 1908.\nQuestion: Phichai Railway Station was under construction in 1906. True, False, or Neither?", "doc_id": 596, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17566, 38842, 19983, 17386], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Neilson Hubbard is an American singer-songwriter, musician and producer. His first band was called This Living Hand formed with Clay Jones. They signed to Adam Duritz's label, E Pluribus Unum. After the band split up, Hubbard went on to record three solo albums, \"The Slide Project\", \"Why Men Fail\" and \"Sing Into Me\". He also collaborated with Matthew Ryan to form the band Strays Don't Sleep.\nQuestion: Hubbard created his own three solo albums prior to being in Strays Don't Sleep. True, False, or Neither? True\n###\nJohnny Kidd (born Frederick Albert Heath, 23 December 1935 \u2013 7 October 1966) was an English singer and songwriter, best remembered as the lead vocalist for the rock and roll band Johnny Kidd & the Pirates. He was one of the few pre-Beatles British rockers to achieve worldwide fame, mainly for his 1960 hit, \"Shakin' All Over\".\nQuestion: Johnny Kidd also sang in German. True, False, or Neither? Neither\n###\nMarwin Javier Gonz\u00e1lez (born March 14, 1989) is a Venezuelan professional baseball infielder with the Houston Astros of Major League Baseball (MLB). 
Primarily a shortstop, Gonz\u00e1lez has appeared at every position except for pitcher and catcher for the Astros.\nQuestion: He is in his forties. True, False, or Neither? False\n###\nMargarita la tornera (Margarita the Gatekeeper) is an opera in three acts composed by Ruperto Chap\u00ed to a libretto by Carlos Fern\u00e1ndez Shaw, based on a dramatic poem by Jos\u00e9 Zorrilla. It premiered on February 24, 1909 at the Teatro Real in Madrid in a performance conducted by the composer. An acclaimed recording of the opera came out in 1999 with Pl\u00e1cido Domingo and Elisabete Matos.\nQuestion: The inspiration behind Margarita la tornera was a poem True, False, or Neither? True\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound.\nQuestion: The Bavarian Mountain Hound is found in London. True, False, or Neither?", "doc_id": 731, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25165, 28380, 33774, 30082], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\".\nQuestion: \"The Call\" charted in the UK. True, False, or Neither? Neither\n###\nGuns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin).\nQuestion: Guns of Diablo won many awards. True, False, or Neither? Neither\n###\nWenham Parva is a village and a civil parish in Suffolk, England. It covers the village of Little Wenham (whose ancient name it takes) and the hamlet of Wenham Grange. Located in Babergh district, it had a population of 20 in 2005, making it the joint-least populated parish in Suffolk alongside South Cove, Wangford and Wordwell. At the 2011 Census the population had increased to 185.\nQuestion: In 2005 South Cove and Wenham Parva both had small populations. True, False, or Neither? True\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers.\nQuestion: McColo was founded in the 20th century. True, False, or Neither? Neither\n###\nSt Kilda is an inner suburb (neighbourhood) of the metropolitan area of Melbourne, Victoria, Australia, 6 km south-east of Melbourne's Central Business District. 
Its local government area is the City of Port Phillip. At the 2011 Census, St Kilda had a population of 17,795.\nQuestion: The City of Port Philip has a population of 17,795 True, False, or Neither?", "doc_id": 569, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23743, 45143, 3530, 24235], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Hold Me Tight\" is a rock and roll song by English rock group the Beatles from their 1963 album \"With the Beatles\". It was first recorded during the \"Please Please Me\" album session, but not selected for inclusion and re-recorded for their second album.\nQuestion: The \"Please Please Me\" album recording session lasted for a few months in 1961 True, False, or Neither? Neither\n###\nJaron Long (born August 28, 1991) is an American professional baseball pitcher who is with the Washington Nationals organization. Prior to playing professionally, Long played college baseball for Chandler-Gilbert Community College and Ohio State University. His father, Kevin Long, is the current hitting coach of the New York Mets and former hitting coach of the New York Yankees.\nQuestion: Long is the first in his family to go into baseball. True, False, or Neither? False\n###\nLes Sept Paroles du Christ sur la Croix (composed 1859) is a musical setting of The Seven Last Words of Christ by C\u00e9sar Franck, though the name in French often refers to an equally well or better known homonymous work by Charles Gounod \"Les sept paroles de N.S. Jesus-Christ sur la croix\".\nQuestion: Les Sept Paroles du Christ sur la Croix was a successful musical written and performed in France True, False, or Neither? Neither\n###\nFatsia japonica(syn. \"Aralia japonica\" Thunb., \"A. sieboldii\" Hort. ex K.Koch), also glossy-leaf paper plant, fatsi, paperplant or Japanese aralia, is a species of flowering plant in the family Araliaceae, native to southern Japan, southern Korea, and Taiwan.\nQuestion: Fatsia Japonica is native to the US True, False, or Neither? False\n###\nWalking on Sunshine is a 2014 British romantic musical comedy-drama film directed by Max Giwa and Diana Pasquini. The film features covers of songs from the 1980s and was released on 27 June 2014. It is also a debut role for singer-songwriter Leona Lewis.\nQuestion: Walking on Sunshine was the debut film for two people. True, False, or Neither?", "doc_id": 991, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15683, 36201, 11321, 34518], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Builth Castle (Welsh: \"Castell Llanfair-ym-Muallt\" ) was a castle built under King Edward I, just outside Builth Wells, Powys, Wales. At one time it was an impressive stone-built castle but all the masonry has been removed over the years and all that remains are the mound on which it stood, the ditches and embankments.\nQuestion: The castle stood on a mound True, False, or Neither? True\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city.\nQuestion: There was a census in 2010 True, False, or Neither? True\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy.\nQuestion: Daniel Murphy was born February 24, 1994. True, False, or Neither? Neither\n###\n\"Pour Me\" is a debut song recorded by American country music group Trick Pony. It was released in October 2000 as the first single from their debut album \"Trick Pony\". The song was written by group members Keith Burns, Ira Dean and Heidi Newfield with Rory Waters Beighley and Sammy Harp Wedlock.\nQuestion: \"Pour Me\" sold 500000 copies True, False, or Neither? Neither\n###\nThe Castaways Hotel and Casino, formerly the Showboat Hotel and Casino was a hotel and casino located at the north end of the Boulder Strip in Las Vegas, Nevada. The hotel consisted of a 19 story tower containing 445 rooms, a casino and an adjacent RV park. The Castaways hotel was demolished on January 11, 2006 to make way for an unknown project.\nQuestion: In the adjacent RV park there are at least 1 RV's. True, False, or Neither?", "doc_id": 866, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34, 6247, 43439, 18874], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Joan Ganz Cooney Center (informally, the Cooney Center) is an independent, non-profit, non-partisan research and innovation group founded by Sesame Workshop in order to advance children\u2019s literacy skills and foster innovation in children\u2019s learning through digital media.\nQuestion: Sesame Workshop has put millions into the Joan Ganz Cooney Center for further development of digital media learning. True, False, or Neither? 
Neither\n###\nTango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center.\nQuestion: George Balanchine was a choreographer of ballet. True, False, or Neither? True\n###\nThe Altar Valley is a 45-mile (72\u00a0km) long north-south valley, trending slightly northeast from Sasabe, Arizona on the Mexico border to the Avra Valley west of the Tucson Mountains. It is delimited by Arizona State Route 86, from east-to-west on the north separating it from the Avra Valley which then trends \"northwesterly\", merging into the plains and drainage of the Santa Cruz River.\nQuestion: Altar extends over 7900 yards across the valley True, False, or Neither? True\n###\nManos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Greek modeling agency since 2004. He participated in several international film projects as a lead main actor.\nQuestion: Krystalis is engaged to a model. True, False, or Neither? Neither\n###\nThe Sound of Waves (\u6f6e\u9a12 , Shiosai ) is a 1954 novel by the Japanese author Yukio Mishima. It is a coming-of-age story of the protagonist Shinji and his romance with Hatsue, the beautiful daughter of the wealthy ship owner Terukichi. For this book Mishima was awarded the Shincho Prize from Shinchosha Publishing in 1954. It has been adapted for film five times.\nQuestion: The book was awarded a prize within 12 months of being published. True, False, or Neither?", "doc_id": 968, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20485, 42362, 36698, 18987], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "George White's Scandals is a 1934 American musical film directed by George White and written by Jack Yellen. The film stars Rudy Vall\u00e9e, Jimmy Durante, Alice Faye, Adrienne Ames, Gregory Ratoff, Cliff Edwards and Dixie Dunbar. The film was released on March 16, 1934, by Fox Film Corporation.\nQuestion: George White is a big man True, False, or Neither? Neither\n###\n\"Inbetweener\" is a song by English Britpop band Sleeper, written by the band's vocalist and guitarist Louise Wener. It was the third single to be released from their debut album \"Smart\" in 1995 (see 1995 in British music). It was their breakthrough single,\nQuestion: The song Inbetweener by English Britpop band Sleeper is over 100 years old True, False, or Neither? False\n###\nFaer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. 
Around a hundred novels and several computer and video games use the Faer\u00fbn setting.\nQuestion: Dungeons and dragons is a boring game True, False, or Neither? Neither\n###\nThe Prime Minister's XI or PM's XI (formerly Australian Prime Minister's Invitation XI) is an invitational cricket team picked by the Prime Minister of Australia for an annual match held at the Manuka Oval in Canberra against an overseas touring team. The Australian team usually includes up and coming players.\nQuestion: Cricket is a difficult sport. True, False, or Neither? Neither\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall is a mall in , Virginia, United States True, False, or Neither?", "doc_id": 130, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34240, 15817, 13344, 14565], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Smithereens was produced by a member of another band. True, False, or Neither? True\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: The Pursuit of Happyness flopped at the box office True, False, or Neither? Neither\n###\n51829 Williemccool (2001 OD ) is an asteroid named for astronaut Willie McCool, who was killed in the STS-107 (\"Columbia\") space shuttle reentry disaster on February 1, 2003. 51829 Williemccool was discovered on July 21, 2001 at Palomar Observatory by the JPL Near Earth Asteroid Tracking Program.\nQuestion: \"Wherever US is, We are.\" was a slogan from 1965 True, False, or Neither? Neither\n###\nThe Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II.\nQuestion: Astra Modelo 400 was the sidearm standard in the army of the Spanish. True, False, or Neither? True\n###\nBig Bad Voodoo Daddy is a contemporary swing revival band from Southern California. Their notable singles include \"Go Daddy-O\", \"You & Me & the Bottle Makes 3 Tonight (Baby)\", and \"Mr. Pinstripe Suit\". 
The band played at the Super Bowl XXXIII half-time show in 1999.\nQuestion: The band played at The band played at the Super Bowl XXXII True, False, or Neither?", "doc_id": 578, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39771, 7334, 14759, 40419], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: He attended the University of Virginia and was a leading member of three NCAA Men's Tennis Championship winning teams. True, False, or Neither? Neither\n###\nThe 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017.\nQuestion: The 2017 City of Onkaparinga ATP Challenger is a tennis tournament True, False, or Neither? True\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer.\nQuestion: von Houwald loved his mother. True, False, or Neither? Neither\n###\nMars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added.\nQuestion: Mars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". Members of the band decided to replace Sean O'Hagan with keyboardist Katharine Gifford changing the guitar sounds for the keyboard. True, False, or Neither? Neither\n###\nInferno (also released with the title, Operation Cobra) is a 1997 feature film directed by Fred Olen Ray starring Don Wilson, Deepti Bhatnagar and R. Madhavan. Evan Lurie, Michael Cavanaugh and Tan\u00e9 McClure appear in other pivotal roles. Wilson plays the role of Interpol agent Kyle Connors on a mission set in India.\nQuestion: Inferno has no plot. True, False, or Neither?", "doc_id": 174, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10237, 22357, 15259, 14194], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grotto Geyser is a fountain-type geyser located in the Upper Geyser Basin in Yellowstone National Park in the United States. Grotto Geyser is the namesake for the group of geysers that includes Grotto Fountain Geyser, South Grotto Fountain Geyser, Indicator Spring, Spa Geyser, and Rocket Geyser.\nQuestion: Rocket Geyser is part of a group of five geysers. True, False, or Neither? True\n###\nSwift Rivers is a children's historical novel by Cornelia Meigs. Set initially in 1835 in Minnesota, it is a story of the early days of the logging industry, when logs were floated down the Mississippi to St. Louis. The novel, illustrated by Forrest W. Orr, was first published in 1931 and was a Newbery Honor recipient in 1933.\nQuestion: The novel was set in 1835 True, False, or Neither? True\n###\nCastle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel.\nQuestion: Castle Wolfenstein was first a Playstation game. True, False, or Neither? False\n###\nThe San Diego Chargers announced their 40th Anniversary Team in 2000 to honor the top players and coaches in the history of the National Football League team. The Chargers began play in 1960 as part of the American Football League. The anniversary team included 31 players and coaches voted on by fans and a media panel. The team became the Los Angeles Chargers after relocating in 2017.\nQuestion: The San Diego Chargers has been visited by Clinton. True, False, or Neither? Neither\n###\nHere is a list of all of KF Tirana's Cup seasons from 1939 till end of most recent season. This list shows where they finished the season, how many ties won or lost, how many goals they scored and conceded, how many wins draws and losses they had throughout the season, goal difference, winning difference and number of matches played.\nQuestion: The implied list shows KF Tirana's Cup season from at least the past 90 years. True, False, or Neither?", "doc_id": 925, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33074, 24286, 9449, 24009], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The UK Overseas Territories Conservation Forum (UKOTCF) is a UK-based non-governmental organisation which promotes coordinated conservation in the UK Overseas Territories and Crown Dependencies (UKOTs and CDs). It is a not-for-profit organisation supported by grants, donations and subscriptions, and a registered charity and company.\nQuestion: UKOTCF helps other countries with environment issues as well. True, False, or Neither? 
Neither\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards.\nQuestion: Many engineers prefer the Proteus Design Suite over other software available in the design automation world. True, False, or Neither? Neither\n###\nSt. Petersburg is a city in Pinellas County, Florida, United States. As of the 2015 census estimate, the population was 257,083, making it the fifth-most populous city in Florida and the largest in the state that is not a county seat (the city of Clearwater is the seat of Pinellas County).\nQuestion: St. Petersburg is a city in Alabama, United States. True, False, or Neither? False\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards.\nQuestion: The Proteus Design Suite is used by the US army True, False, or Neither? Neither\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani.\nQuestion: Golshifteh Farahani was a famous actress when The Pear Tree was released. True, False, or Neither?", "doc_id": 179, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38543, 44864, 19240, 40788], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Midnight Tides is the fifth volume of Canadian author Steven Erikson's epic fantasy series, the \"Malazan Book of the Fallen\". Although it is part of the larger series, it has only limited references to the previous books. However, it is not a stand-alone volume as the events of the books \"Reaper's Gale\" and \"Dust of Dreams\" follow on from it.\nQuestion: Steven Erickson is a citizen of a country that is north of Brazil. True, False, or Neither? True\n###\nBride of Chucky is a 1998 American supernatural comedy slasher film, the fourth installment of the \"Child's Play\" franchise and sequel to 1991's \"Child's Play 3\". The film is written by Don Mancini and directed by Ronny Yu, and stars Jennifer Tilly (who plays and voices the title character Tiffany) and Brad Dourif (who voices Chucky), as well as John Ritter, Katherine Heigl and Nick Stabile.\nQuestion: There were 4 installments of the \"Child's Play\" franchise before Bride of Chucky in 1998 True, False, or Neither? False\n###\nThe following are lists of the most populous fully defined incorporated settlements in Nigeria by population. 
This page consists three different tables, with different kinds of settlements; a list for \"defined cities\", listing the population, strictly within the defined city limits, a list for \"urban area\" population, and another list for the population within metropolitan areas.\nQuestion: This page consists of three of the same tables. True, False, or Neither? False\n###\nBath Salt Zombies is a 2013 American horror comedy directed by Dustin Mills, written by Mills and Clint Weller, and starring Josh Eal, Ethan Holey, Jackie McKown, Dave Parker, and Brandon Salkil. It is about zombie attacks brought on by concentrated bath salts.\nQuestion: Bath Salt Zombies is a comedy True, False, or Neither? True\n###\nAz-Zahir Ali Hakim (born June 3, 1977) is a former American football wide receiver. He played college football at San Diego State. He was drafted by the St. Louis Rams in the fourth round (96th overall) of the 1998 NFL Draft. He also was a member of the Detroit Lions, New Orleans Saints, San Diego Chargers, Miami Dolphins, and Las Vegas Locomotives.\nQuestion: Hakim was drafted by both the army and the St. Louis Rams. True, False, or Neither?", "doc_id": 339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44079, 34590, 18303, 32683], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zaiga Jansone-Ivanova (born (1951--)24 1951 ) is a former Soviet Latvian tennis player and tennis coach. She was a five-time Soviet champion in women's doubles, 1973 Summer Universiade champion in women's doubles (all with Olga Morozova) and winner of the exhibition tennis event of 1968 Olympics in mixed doubles (with (Vladimir Korotkov).\nQuestion: Zaiga Jansone-Ivanova will neither or always be remembered as winner of the exhibition tennis event of 1968 Olympics. True, False, or Neither? Neither\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season.\nQuestion: Their 2016 season was a losing season. True, False, or Neither? Neither\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system.\nQuestion: Elizabeth City State University is mostly a caucasian school True, False, or Neither? False\n###\nAz-Zahir Ali Hakim (born June 3, 1977) is a former American football wide receiver. He played college football at San Diego State. He was drafted by the St. Louis Rams in the fourth round (96th overall) of the 1998 NFL Draft. 
He also was a member of the Detroit Lions, New Orleans Saints, San Diego Chargers, Miami Dolphins, and Las Vegas Locomotives.\nQuestion: azzahir played wideouts in football True, False, or Neither? True\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Weltenbrand was a commercial failure True, False, or Neither?", "doc_id": 386, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37520, 17085, 44769, 1995], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "NASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi.\nQuestion: NASA John H. Glenn Research Center at Lewis Field is a NASA center between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River. True, False, or Neither? True\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television.\nQuestion: Sebastian Philip Bierk was a Canadian hip hop artist True, False, or Neither? False\n###\nJohn Garman \"J. G.\" Hertzler Jr. (born March 18, 1950) is an American actor, author, screenwriter, and activist best known for his role on \"\" as the Klingon General (and later Chancellor) Martok, whom he portrayed from 1995 until the series' end in 1999.\nQuestion: John Garman \"J. G.\" Hertzler Jr. doesn't speak the official language of the United States. True, False, or Neither? False\n###\nThe \"Minas Geraes\" class, spelled \"Minas Gerais\" in some sources, consisted of two battleships built for the Brazilian Navy in the early twentieth century. Named \"Minas Geraes\" and \"S\u00e3o Paulo\" , the ships were intended to be Brazil's first step towards becoming an international power, and they consequently initiated a South American naval arms race.\nQuestion: There were two battleships made for a navy in the early twentieth century. True, False, or Neither? True\n###\nG.I. Joe: Ninja Battles is a film that was released on DVD in late 2004, as part of the Ninja Battles set of figures. In it, the history of the Arashikage Clan, as well as the history of Snake Eyes and Storm Shadow's rivalry, are examined through a series of trials. Scenes from both \"\" and \"\" are used, with a brief period of new animation at the end of the movie.\nQuestion: The film was released in the cinema. 
True, False, or Neither?", "doc_id": 497, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1404, 20968, 39523, 6755], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Concrete Sky was the second EP released by Beth Orton, with the lead track taken from her 2002 album \"Daybreaker\". It contains four songs, and was released on CD. \"Concrete Sky\" features vocals and guitar from Ryan Adams, and was written by Beth Orton and Johnny Marr.\nQuestion: The lead track for \"Concrete Sky\" is not an original song made for the album. True, False, or Neither? True\n###\nPaul Revere ( ; December 21, 1734 O.S.May 10, 1818) was an American silversmith, engraver, early industrialist, and Patriot in the American Revolution. He is best known for his midnight ride to alert the colonial militia in April 1775 to the approach of British forces before the battles of Lexington and Concord, as dramatized in Henry Wadsworth Longfellow's poem, \"Paul Revere's Ride\" (1861).\nQuestion: Revere took years to become an accomplished silversmith. True, False, or Neither? Neither\n###\nThe Boulton Paul Balliol and Sea Balliol were monoplane military advanced trainer aircraft built for the Royal Air Force (RAF) and the Royal Navy Fleet Air Arm (FAA) by Boulton Paul Aircraft. Developed in the late 1940s the Balliol was designed to replace the North American Harvard trainer and used the Rolls-Royce Merlin engine, with the Sea Balliol a naval version for deck landing training.\nQuestion: The Boulton Paul Balliol and Sea Balliol were expensive aircraft True, False, or Neither? Neither\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg.\nQuestion: Australia had a great depression in the 20th century. True, False, or Neither? True\n###\nDallas was a town in Ouray County, Colorado, United States. It lay about 3 miles (5\u00a0km) north of the present town of Ridgway at the confluence of Dallas Creek and the Uncompahgre River. A community named in tribute to the historic town bearing the name Dallas Meadows now exists at its historic location.\nQuestion: Dallas lies over 5000 yards north of Ridgway True, False, or Neither?", "doc_id": 106, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43851, 29207, 29309, 14706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Wire is an American crime drama television series set and produced in Baltimore, Maryland. 
Created and primarily written by author and former police reporter David Simon, the series was broadcast by the cable network HBO in the United States. \"The Wire\" premiered on June 2, 2002, and ended on March 9, 2008, comprising 60 episodes over five seasons.\nQuestion: David Simon was a police reporter in Baltimore. True, False, or Neither? True\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant.\nQuestion: The USS Voyager does not have many aliens on it True, False, or Neither? Neither\n###\nWalking on Sunshine is a 2014 British romantic musical comedy-drama film directed by Max Giwa and Diana Pasquini. The film features covers of songs from the 1980s and was released on 27 June 2014. It is also a debut role for singer-songwriter Leona Lewis.\nQuestion: The film Walking on Sunshine was released in Los Angeles, California on June 27, 2014. True, False, or Neither? Neither\n###\nAnsar al-Sharia in Tunisia (\"\"Supporters of Islamic Law in Tunisia\"\") is a radical Islamist group that operates in Tunisia. It has around 1,000 people as part of the movement. It has been listed as a terrorist group by the Tunisian government as well by the United Nations, the UAE, the United Kingdom and the United States. Some of its members may be linked to the 2015 Sousse attacks.\nQuestion: Sharia Law will become a dominate force withing twenty years! True, False, or Neither? Neither\n###\nG.I. Joe: Ninja Battles is a film that was released on DVD in late 2004, as part of the Ninja Battles set of figures. In it, the history of the Arashikage Clan, as well as the history of Snake Eyes and Storm Shadow's rivalry, are examined through a series of trials. Scenes from both \"\" and \"\" are used, with a brief period of new animation at the end of the movie.\nQuestion: G.I. Joe: Ninja Battles was released on DVD more than 2002 years ago. True, False, or Neither?", "doc_id": 923, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1855, 13658, 6688, 31193], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Trois sonneries de la Rose+Croix (\"Three Sonneries of the Rose+Cross\") is a piano composition by Erik Satie, first published in 1892, while he was composer and chapel-master of the Rosicrucian \"Ordre de la Rose-Croix Catholique, du Temple et du Graal \", led by S\u00e2r Jos\u00e9phin P\u00e9ladan.\nQuestion: S\u00e2r Jos\u00e9phin P\u00e9ladan was not a rosicrucian.\n True, False, or Neither? False\n###\nAm\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics.\nQuestion: Am\u00e9lie Simone Mauresmo won two Grand Slam singles titles at age of 17. 
True, False, or Neither? Neither\n###\nSc\u00e8nes de ballet is a ballet made by New York City Ballet balletmaster John Taras to Stravinsky's eponymous music from 1944. The premiere took place June 22, 1972, as part of the City Ballet's Stravinsky Festival at the New York State Theater, Lincoln Center.\nQuestion: Scenes de ballet is the only ballet made by New York City balletmaster John Taras True, False, or Neither? Neither\n###\nThe AIR Charts are the official sales charts for Australian independent music released by Australian owned, independent record labels. Presented by AIR, the Australian Independent Record Labels Association, the charts are calculated according to official sales figures provided by the ARIA Charts, which includes legal MP3 download sales.\nQuestion: The AIR Charts has verified sales data. True, False, or Neither? True\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro.\nQuestion: Salvatore Rosa started painting before Nicola. True, False, or Neither?", "doc_id": 639, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31953, 30838, 5175, 26861], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character.\nQuestion: Adrianne Palicki looked beautiful in the wonder woman costume. True, False, or Neither? Neither\n###\nThe Thebaid ( ; Latin: \"Th\u0113ba\u00efs\") is a Latin epic in 12 books written in dactylic hexameter by Publius Papinius Statius (AD c. 45 \u2013 c. 96). The poem deals with the Theban cycle and treats the assault of the seven champions of Argos against the city of Thebes.\nQuestion: Statius wrote at least one book True, False, or Neither? True\n###\nThe Multiwavelength Atlas of Galaxies is a textbook and atlas of 35 well studied galaxies (including our Galaxy) authored by Glen Mackie of the Centre for Astrophysics & Supercomputing, Swinburne University of Technology. It was originally published in 2011 by Cambridge University Press.\nQuestion: The Multiwavelength Atlas of Galaxies studies more galaxies than any other book. True, False, or Neither? Neither\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups.\nQuestion: Corn smut is actually a disease of a corn plant, a disease that is eaten in Mexico True, False, or Neither? 
True\n###\nMaurice Anthony Foley (9 October 1925 \u2013 8 February 2002) was a British Labour Party politician. He was elected as Member of Parliament (MP) for West Bromwich at a by-election in 1963, and represented the constituency until his resignation from the House of Commons in 1973. His successor in the resulting by-election was the future Speaker of the British House of Commons, Betty Boothroyd.\nQuestion: Maurice Foley was the Speaker of the British house of commons before Betty Boothroyd. True, False, or Neither?", "doc_id": 884, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40701, 7036, 4580, 26029], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael Tunn (born 18 January 1974) is an Australian radio announcer and television presenter. He was hired by Australia's national youth station Triple J in 1990 at the age of 17, making him Australia's youngest professional radio presenter at the time.\nQuestion: Michael Tunn wasn't the youngest professional radio presenter in 1909. True, False, or Neither? False\n###\nThe Arboretum Curie, also known as the Arboretum du Col des Trois Soeurs, is a small arboretum located at 1470 metres altitude in the Col des Trois Soeurs near La Panouse, Loz\u00e8re, Languedoc-Roussillon, France. It was created circa 1975 to study conifers suitable for reforestation, and according to Arbez et al., now contains 77 taxa (primarily conifers).\nQuestion: The Arboretum Curie contains over 77 taxa at an elevation of 1470 metres in the country of France. Its purpose is to research a number of conifers for their possible reforestation properties as of the year 1975. True, False, or Neither? True\n###\nThe Fondation Prince Pierre was established by Prince Rainier III of Monaco in February 1966 to promote culture and the arts through the creation and the awarding of prizes. Prince Rainier III created the foundation in tribute to his father, Pierre de Polignac a great patron of the arts.\nQuestion: Prince Rainier III created the foundation in tribute to a great patron of the arts. True, False, or Neither? True\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\"\nQuestion: People really like using Flatbush Avenue to get out of queens\n True, False, or Neither? Neither\n###\nCharles Farrell (August 9, 1900 \u2013 May 6, 1990) was an American film actor of the 1920s silent era and into the 1930s, and later a television actor. Farrell is probably best recalled for his onscreen romances with actress Janet Gaynor in more than a dozen films, including \"7th Heaven\", \"Street Angel\", and \"Lucky Star\".\nQuestion: Farrell and Gaynor were romantically involved. 
True, False, or Neither?", "doc_id": 697, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17163, 31232, 2099, 31480], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Geoffrey Zakarian (born July 25, 1959) is an American Iron Chef, restaurateur, television personality and author. He is the executive chef of several restaurants in New York City, Atlantic City and Miami. He is featured on several television programs on the Food Network, including \"Chopped\" and in 2011, \"The Next Iron Chef\", where he won the right to join \"Iron Chef America\".\nQuestion: Geoffrey Zakarian was born July 1, 1955 True, False, or Neither? False\n###\nCity Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle.\nQuestion: City Mall is a large mall in Jordan that has been open for over a decade. There are many films shown in the cinema here. Some of the films are American. True, False, or Neither? True\n###\nThe Whitechapel murders were committed in or near the impoverished Whitechapel district in the East End of London between 3 April 1888 and 13 February 1891. At various points some or all of these eleven unsolved murders of women have been ascribed to the notorious unidentified serial killer known as Jack the Ripper.\nQuestion: Jack the Ripper was famous for robbing people True, False, or Neither? False\n###\nAlbert Levitt (March 14, 1887 \u2013 June 18, 1968) was a judge, law professor, attorney, and candidate for political office. While he was a memorable teacher at Washington and Lee University, and as judge of the United States District Court for the Virgin Islands ordered that woman voters must be registered, he later came to hold what some thought were eccentric views on religion.\nQuestion: The United States influences judicial rulings in the Virgin Islands True, False, or Neither? True\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide.\nQuestion: The show was released the year after 2006. True, False, or Neither?", "doc_id": 369, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31425, 12540, 8939, 14427], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The shooting of John Crawford III occurred on August 5, 2014. Crawford was a 22-year-old African-American man shot to death by Beavercreek police officer Sean Williams, in a Walmart store in Beavercreek, Ohio, near Dayton, while holding a toy BB gun.\nQuestion: The shooting of John Crawford III did not occur in Dayton, Ohio. True, False, or Neither? True\n###\n\"Eve\" is an American television sitcom that was broadcast on United Paramount Network (UPN) from September 15, 2003, to May 11, 2006. A total of 66 episodes of \"Eve\" were broadcast over three seasons. Created by Meg DeLoatch, the series follows New York City fashion designer Shelly Williams (Eve) through her relationship with physical therapist J.T. Hunter (Jason George).\nQuestion: Episodes of Eve aired in the month of December. True, False, or Neither? Neither\n###\nDeliver Us Tour was a concert tour by band Darkest Hour, taking place from late 2007, in support of their fifth studio album \"Deliver Us\" and finishing in December 2008. The tour started shortly after the Undoing Ruin Tour ended, earlier in December 2006.\nQuestion: Deliver Us Tour was performed in large venues. True, False, or Neither? Neither\n###\nCoraz\u00f3n Valiente (\"Fearless Heart\"), originally known as \"Ca\u00eddas del Cielo\", is a Spanish-language telenovela produced by United States-based television network Telemundo Studios, Miami, featuring an ensemble cast. Adriana Fonseca, Ximena Duque, Jos\u00e9 Luis Res\u00e9ndez and Fabi\u00e1n R\u00edos starred as the main protagonists, with Aylin Mujica and Manuel Landeta starred as the main antagonists.\nQuestion: Coraz\u00f3n Valiente setting was in a Hospital. True, False, or Neither? Neither\n###\nThe 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament.\nQuestion: In the year that equals one thousand times two plus ten plus one, the Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium. True, False, or Neither?", "doc_id": 529, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41227, 4851, 31612, 31804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. 
Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: Chris Gardner conceived a child with a woman before the events of The Pursuit of Happyness. True, False, or Neither? Neither\n###\nJon L. Luther is an American foodservice industry executive. He was the chairman and chief executive officer of Dunkin' Brands. Luther is the Chairman of the Board of the Culinary Institute of America and Arby's Restaurant Group, and a director at Six Flags Entertainment Corporation, Wingstop Restaurants, and Tempur Sealy International.\nQuestion: Jon Luther is the Chairman of the Board of many companies, he may be a shareholder in others as well True, False, or Neither? Neither\n###\nBig Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism.\nQuestion: The population is now over 5 thousand True, False, or Neither? Neither\n###\nRubicon Drinks Ltd (formerly Rubicon Products Ltd) is a soft drink manufacturer based in Milton Keynes, UK. The company specialises in exotic soft drinks, and produces a variety of different flavours of drink, many of which contain sugar or artificial sweeteners, and which come in both still and sparkling varieties. In the United Kingdom, their drinks are bottled and distributed by A.G. Barr plc.\nQuestion: Rubicon Drinks is the 2nd largest soda distributor in the UK. True, False, or Neither? Neither\n###\nJoe Fryer is an American journalist and storyteller working for NBC News as a west coast correspondent based at the NBC News West Coast Bureau in Universal City, California. Fryer joined NBC News in 2013 as a part-time correspondent and officially joined NBC News as a full-time correspondent on October 21, 2013.\nQuestion: ABC News West Coast Bureau is located in Universal City, CA. True, False, or Neither?", "doc_id": 42, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32839, 6917, 36712, 33989], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "RAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family.\nQuestion: The station was renamed less than 100 years ago True, False, or Neither? True\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. 
In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances.\nQuestion: Spaceballs has a cameo appearance by Rudy De Luca True, False, or Neither? True\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line.\nQuestion: The Takoma Langley Crossroads Transit Center has always been the largest of its kind. True, False, or Neither? Neither\n###\nThe Program in Creative Writing, more commonly known as the Iowa Writers' Workshop, at the University of Iowa in Iowa City, Iowa, is a much-celebrated graduate-level creative writing program in the United States. Writer Lan Samantha Chang is its director. Graduates earn a Master of Fine Arts (MFA) degree in Creative Writing.\nQuestion: The Program in Creative Writing is well-renowned. True, False, or Neither? True\n###\nJames Hagan (21 January 1918 \u2013 26 February 1998), known as Jimmy Hagan, was an English football player and manager born in Washington, County Durham, England. He played between 1938 and 1958 for Sheffield United and once for England. As manager he had his greatest successes with S.L. Benfica in the early 1970s.\nQuestion: Sheffield United was formed in nineteen hundred twenty one. True, False, or Neither?", "doc_id": 430, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20429, 37644, 7066, 14135], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: the Lord's Cricket Ground on 25 June 1983 was filled to maximum capacity. True, False, or Neither? Neither\n###\nSuper Show 6 - Super Junior World Tour Concert Album is Super Junior's sixth live recorded album, released on 6 November 2015. This album contains two CDs with 36 live recordings from the Super Show 6 concerts held on September 19\u201321, 2014 at the Olympic Gymnastics Arena located in Seoul, South Korea.\nQuestion: the Albums was recorded during the 2014 olympics True, False, or Neither? Neither\n###\nJulian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality.\nQuestion: Julian Ricardo Marley is a well-known musician. True, False, or Neither? Neither\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. 
state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing.\nQuestion: The Charter Township of Lansing is only a movie. True, False, or Neither? False\n###\nThe Whitechapel murders were committed in or near the impoverished Whitechapel district in the East End of London between 3 April 1888 and 13 February 1891. At various points some or all of these eleven unsolved murders of women have been ascribed to the notorious unidentified serial killer known as Jack the Ripper.\nQuestion: The last murder committed by Jack the Ripper occurred on February 13th, 1891. True, False, or Neither?", "doc_id": 294, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32006, 7621, 37861, 43898], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shilpa Shukla is an Indian theatre, television and Bollywood film actor from Bihar. She is known for her roles in the 2007 sports drama \"Chak De! India\" and the 2013 neo-noir film \"B.A. Pass\", for which she was awarded the Filmfare Critics Award for Best Actress.\nQuestion: B.A. Pass is an example i the neo-noir genre. True, False, or Neither? True\n###\nChristopher Seton Abele (born January 28, 1967) is an American businessman and Democratic Party politician. He is the current Milwaukee County Executive. Abele is the son of American businessman John Abele, the co-founder of Boston Scientific. Abele serves as a trustee of the Argosy Foundation, a charitable trust established with an endowment from his father.\nQuestion: Christopher Seton Abele (born January 28, 1967) is an American businessman and Republic Party politician True, False, or Neither? False\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\".\nQuestion: Greg Lazarus is the pen name of Nigerian husband-and-wife writing duo. True, False, or Neither? False\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: He was born in the 5th month of the year True, False, or Neither? True\n###\nWarrant Officer Kenji Yanagiya (\u67f3\u8c37 \u8b19\u6cbb , Yanagiya Kenji , March 1919 \u2013 February 29, 2008) was a member of the Imperial Japanese Navy's Zero fighter aces who fought the Battle of Solomon Islands in October 1942 \u2013 June 1943. He is best known as the only escort fighter pilot of the Yamamoto mission to survive the war.\nQuestion: Warrant Officer Kenji Yanagiya did not learn to fly until 1950. 
True, False, or Neither?", "doc_id": 402, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18907, 24813, 37729, 26916], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mary Isobel Downer, Lady Downer (13 December 1924 \u2013 14 October 2014) was a prominent South Australian patron, wife of federal MP and high commissioner Sir Alexander \"Alick\" Downer, and mother of Liberal Party leader, Australian Foreign Minister and high commissioner Alexander Downer.\nQuestion: Mary Isobel Downer, Lady Downer never loved her husband federal MP and high commissioner Sir Alexander \"Alick\" Downer True, False, or Neither? Neither\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina.\nQuestion: Osella is a millionaire. True, False, or Neither? Neither\n###\nDwight Yoakam is an American country music singer-songwriter. Since his debut single, \"Honky Tonk Man\" in 1986, he has released 46 singles, including two \"Billboard\" Hot Country Songs number one singles, as well as 4 number ones in Canada. In addition to having two number one singles in the United States, Yoakam also has thirteen Top 10 singles on the country chart.\nQuestion: Dwight Yoakam has released more than 18 singles True, False, or Neither? True\n###\nThe Thebaid ( ; Latin: \"Th\u0113ba\u00efs\") is a Latin epic in 12 books written in dactylic hexameter by Publius Papinius Statius (AD c. 45 \u2013 c. 96). The poem deals with the Theban cycle and treats the assault of the seven champions of Argos against the city of Thebes.\nQuestion: The poem deals with the Theban cycle. True, False, or Neither? True\n###\nPaolo Romano, also known as Paolo Tuccone and as Paolo di Mariano di Tuccio Taccone was an Italian early Renaissance sculptor and goldsmith. Giorgio Vasari in his \"Lives of the Most Excellent Painters, Sculptors, and Architects\" recounts that Paolo Romano was a modest man whose sculpture was far superior to that of his boastful contemporary Mino del Reame.\nQuestion: Paolo Romano and Giorgio Vasari co-authored \"Lives of the Most Excellent Painters, Sculptors, and Architects\" . True, False, or Neither?", "doc_id": 573, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35858, 35892, 14325, 93], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. 
This box set contains rare edits and singles which are remastered along with album versions of their biggest hits.\nQuestion: Many fans will buy this box set True, False, or Neither? Neither\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: The Colorado Avalanche is an organization in the NHL. True, False, or Neither? True\n###\nEuroprop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas.\nQuestion: Europrop is a solo venture. True, False, or Neither? False\n###\nThe Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States.\nQuestion: The Forum Shops is in Nevada. True, False, or Neither? True\n###\nThe Last Boy Scout is a 1991 American action comedy film directed by Tony Scott, starring Bruce Willis, Damon Wayans, Chelsea Field, Noble Willingham, Taylor Negron and Danielle Harris. The film was released in the United States on December 13, 1991.\nQuestion: The Last Boy Scout is a 1991 American action comedy film directed by Tony Scott, starring ONLY Bruce Willis, Damon Wayans, Chelsea Field, Noble Willingham, Taylor Negron and Danielle Harris. True, False, or Neither?", "doc_id": 913, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32156, 17948, 35608, 43823], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County.\nQuestion: It is considered a city True, False, or Neither? True\n###\n\"Break the World\" is the lead single by alternative rock band Nine Lashes from their third album, \"From Water to War\". It was released on October 29, 2013 by Tooth & Nail Records. The song was the No. 1 \"Billboard\" Christian Rock song on January 25, 2014 chart.\nQuestion: \"Break the World\" was never a popular song, and didn't gather much attention. True, False, or Neither? False\n###\nShadowgun Legends is an upcoming first-person shooter video game developed and published by Madfinger Games for Android and iOS devices. It is the 3rd primary installment of the Shadowgun series, a sequel to the original Shadowgun and Shadowgun Deadzone, both being multiple award-winning games from 2011 and 2012 respectively.\nQuestion: Shadowgun Legends will win many awards True, False, or Neither? 
Neither\n###\nFarrell Temata ( 1944 \u2013 26 April 2013) was a New Zealand rugby union player and coach. He was a prop who played 44 times for the Waikato provincial rugby union team and later was the side's assistant coach from 1992 to 1994. He was assistant coach of the Chiefs Super Rugby team from 2004 to 2006.\nQuestion: Farrell Temata gave commands. True, False, or Neither? True\n###\nPhilips Classics Records was started in the 1980s as the new classics record label for Philips Records. It was successful with artists including Alfred Brendel, Sir John Eliot Gardiner, Sir Neville Marriner and the Academy of St. Martin in the Fields, Mitsuko Uchida, Julian Lloyd Webber, Sir Colin Davis and Andr\u00e9 Rieu.\nQuestion: It wasn't popular. True, False, or Neither?", "doc_id": 951, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31023, 38589, 14697, 1192], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Boxcar Bertha is a 1972 American romantic crime drama film directed by Martin Scorsese. It is a loose adaptation of \"Sister of the Road\", a pseudo-autobiographical account of the fictional character Bertha Thompson, written by Ben L. Reitman. It was Scorsese's second feature film.\nQuestion: 1972 was the year the Dolphins won the Super Bowl True, False, or Neither? Neither\n###\nFraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator.\nQuestion: Wishart is an expert at analyzing football. True, False, or Neither? Neither\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band.\nQuestion: What Happens Next had bad members. True, False, or Neither? Neither\n###\nThe Attorney () is a 2013 South Korean courtroom drama film directed and co-written by Yang Woo-suk, in his directorial debut (Yang was previously a film producer and webtoon author). With 11,375,954 tickets sold and a revenue of \u20a982.9 billion , \"The Attorney\" became the 8th best-selling Korean film of all time, and the second highest-grossing Korean film of 2013.\nQuestion: The Attorney is the highest-grossing Korean film of 2013. True, False, or Neither? False\n###\nThe Exterminating Angel (Spanish: El \u00e1ngel exterminador ), is a 1962 surrealist film, written and directed by Luis Bu\u00f1uel, starring Silvia Pinal, and produced by her then-husband Gustavo Alatriste. Sharply satirical and allegorical, the film contains a view of human nature suggesting \"mankind harbors savage instincts and unspeakable secrets\".\nQuestion: Silvia Pinal starred in several films in her acting career. 
True, False, or Neither?", "doc_id": 919, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18398, 16362, 15870, 14844], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A Song to Remember is a 1945 Columbia Pictures Technicolor biographical film which tells a fictionalised life story of Polish pianist and composer Fr\u00e9d\u00e9ric Chopin. Directed by Charles Vidor, the film starred Cornel Wilde (as Chopin), Merle Oberon (as George Sand), Paul Muni (as J\u00f3zef Elsner), Stephen Bekassy (as Franz Liszt), and Nina Foch.\nQuestion: Cornel Wilde has acted as a musician before. True, False, or Neither? True\n###\n2009, Year of Us is the third extended play (EP) by South Korean boy group Shinee. It consists of six tracks and it incorporates alternative rock and hip-hop music genres. The digital version of the album was released on October 19, 2009, with a physical release on October 22. The title track, \"Ring Ding Dong\" was released on October 14, 2009 through various music sites.\nQuestion: Year of Us was released on one day only. True, False, or Neither? False\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Chris McKendry has more than one sister True, False, or Neither? Neither\n###\nEdwin John Ellis (1848 \u2013 1916) was a British poet and illustrator. He is now remembered mostly for the three-volume collection of the works of William Blake he edited with W. B. Yeats. It is now criticised, however, for weak scholarship, and preconceptions.\nQuestion: He is now remembered mostly for the three-volume collection of the works of W. B. Yeats he edited with William Blake. True, False, or Neither? False\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee.\nQuestion: Destiny was selected for the Academy Awards. True, False, or Neither?", "doc_id": 633, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36773, 8544, 5287, 11421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Westbury Senior High School is a ninth-through-12th-grade school of the Westbury Union Free School District (USFD No. 1), the district covering the village of Westbury, New York, USA. Its current building, located on Post Road at Jericho Turnpike in Old Westbury, New York (just north of Westbury Village), reached its 50th anniversary in 2008.\nQuestion: Westbury Senior High School is a very bad school True, False, or Neither? Neither\n###\nMars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added.\nQuestion: Sean O'Hagan switched to keyboard after he left True, False, or Neither? Neither\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central.\nQuestion: The Daily Show doesn't air on Fridays True, False, or Neither? True\n###\nArthur Tyde is an American software entrepreneur and private investigator based in San Francisco and SE Asia. He has been an advocate for Open Source software since founding the first Linux Users Group in the San Francisco / Silicon Valley Area. (BALUG).\nQuestion: Arthur Tyde has never spent time in SE Asia. True, False, or Neither? False\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system.\nQuestion: Elizabeth City State University has post graduate programs. True, False, or Neither?", "doc_id": 256, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8737, 15460, 21490, 41961], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis.\nQuestion: Bj\u00f8rn Lomborg has founded other non-profits besides The Copenhagen Consensus Center. True, False, or Neither? 
Neither\n###\nCecilia Makiwane Hospital (CMH) is a large, provincial, government funded hospital situated in the Mdantsane township of East London, Eastern Cape in South Africa. It is a tertiary teaching hospital and forms part of the East London Hospital Complex with Frere Hospital. It is named after Cecilia Makiwane, the first African woman to become a professional nurse in South Africa.\nQuestion: Frere hospital is a teaching hospital. True, False, or Neither? Neither\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp.\nQuestion: sydney population was 4208 in january 1792 True, False, or Neither? Neither\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys.\nQuestion: Scott Morriss is 45 years old. True, False, or Neither? False\n###\nBrown University is a private Ivy League research university in Providence, Rhode Island, United States. Founded in 1764 as the College in the English Colony of Rhode Island and Providence Plantations, Brown is the seventh-oldest institution of higher education in the United States and one of the nine colonial colleges chartered before the American Revolution.\nQuestion: Brown was founded before the All star Yankees game. True, False, or Neither?", "doc_id": 700, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43055, 42562, 44258, 464], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia.\nQuestion: There are no members from New York True, False, or Neither? True\n###\nCape Verde is a volcanic archipelago situated above an oceanic rise that puts the base of the islands 2 km above the rest of the seafloor. Cape Verde has been identified as a hotspot and it has been argued that a mantle plume might be underneath it causing the volcanic activity and associated geothermal anomalies.\nQuestion: capo verde activity is due to a mantle plume True, False, or Neither? Neither\n###\nDerailed is a 2005 American crime thriller film based on the novel of the same name by James Siegel. The film is directed by Mikael H\u00e5fstr\u00f6m and stars Clive Owen, Jennifer Aniston, Vincent Cassel, Giancarlo Esposito, David Morrissey, RZA and Xzibit. 
This was also the first film to be released by The Weinstein Company in the United States. The film is set in Chicago.\nQuestion: The novel is set in Chicago. True, False, or Neither? Neither\n###\nThe Blackwater Lightship is a 2004 Hallmark Hall of Fame TV movie adaptation of the novel \"The Blackwater Lightship\" by Colm T\u00f3ib\u00edn. It aired on CBS on February 4, 2004. The movie stars Angela Lansbury, Gina McKee, Sam Robards, Dianne Wiest, and Keith McErlean. Lansbury received an Emmy nomination for it in 2004.\nQuestion: \"The Blackwater Lightship\" movie was an adaption of the novel of the same name. True, False, or Neither? True\n###\nPublic Domain Day is an observance of when copyrights expire and works enter into the public domain. This legal transition of copyright works into the public domain usually happens every year on 1 January based on the individual copyright laws of each country.\nQuestion: Public Domain Day always happens on January 1st. True, False, or Neither?", "doc_id": 5, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23291, 36550, 6076, 13272], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marwin Javier Gonz\u00e1lez (born March 14, 1989) is a Venezuelan professional baseball infielder with the Houston Astros of Major League Baseball (MLB). Primarily a shortstop, Gonz\u00e1lez has appeared at every position except for pitcher and catcher for the Astros.\nQuestion: Gonz\u00e1lez played baseball in high school. True, False, or Neither? Neither\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse.\nQuestion: Once Upon a Time premiered over 6 years ago True, False, or Neither? True\n###\nThe Good Night is a 2007 romantic comedy film written and directed by Jake Paltrow. The film stars his sister Gwyneth Paltrow, Pen\u00e9lope Cruz, Martin Freeman, Danny DeVito, Simon Pegg and others. The movie takes place in London and New York City, where a former pop star (Freeman) who now writes commercial jingles for a living experiences a mid-life crisis.\nQuestion: Actor Martin Freeman plays the lead role in a movie that features him alongside Danny DeVito. True, False, or Neither? True\n###\nWooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton).\nQuestion: Wooden Leather received no airplay True, False, or Neither? Neither\n###\nThe 2017 Congolese police decapitation attacks occurred on 23 March 2017 in the DR Congo. About 40 police officers were ambushed then decapitated. Six police officers were released. All of the surviving police spoke the Tshiluba language. 
The Kamwina Nsapu terrorist group attacked the police convoy.\nQuestion: The Kamwina Nsapu group spoke the Tshiluba language. True, False, or Neither?", "doc_id": 500, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5411, 35938, 1567, 28625], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Living on the Edge (full title Living on the Edge, the Poetic Works of G\u00e9rald Leblanc also known by its French language title L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc) is a 2005 documentary film by Canadian director of Acadian origin Rodrigue Jean. In this documentary, Rodrigue Jean pays tribute to his Acadian roots, focussing on the poetry of G\u00e9rald Leblanc.\nQuestion: G\u00e9rald Leblanc wrote poetry. True, False, or Neither? True\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Adriano Correia Claro has been divorced True, False, or Neither? Neither\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe.\nQuestion: The show was not an animation True, False, or Neither? False\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality.\nQuestion: Marcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first and only regular voice of Minnie Mouse. True, False, or Neither? Neither\n###\nThe 2013 Canadian Olympic Curling Trials were held from December 1 to 8 at the MTS Centre in Winnipeg, Manitoba. The event is also known and advertised as the Tim Hortons Roar of the Rings. The winners of the men's and women's events were chosen to represent Canada at the 2014 Winter Olympics.\nQuestion: Tim Horton did not donate any amount of money to curling trials True, False, or Neither?", "doc_id": 690, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5688, 9428, 14819, 44864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\".\nQuestion: Daraar is an Indian family entertainment movie. True, False, or Neither? False\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh.\nQuestion: Telephone Shilpa Sangstha failed to launch any devices following 2011 True, False, or Neither? Neither\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area.\nQuestion: The Pikes Peak Center for the Performing Arts is a concert auditorium in Colorado Springs, Colorado. True, False, or Neither? True\n###\nBride of Chucky is a 1998 American supernatural comedy slasher film, the fourth installment of the \"Child's Play\" franchise and sequel to 1991's \"Child's Play 3\". The film is written by Don Mancini and directed by Ronny Yu, and stars Jennifer Tilly (who plays and voices the title character Tiffany) and Brad Dourif (who voices Chucky), as well as John Ritter, Katherine Heigl and Nick Stabile.\nQuestion: There were 4 installments of the \"Child's Play\" franchise before Bride of Chucky in 1998 True, False, or Neither? False\n###\nThe J.J. Deal and Son Carriage Factory was the largest factory built in Jonesville, Michigan. It is the only 19th century factory remaining in the City. It is located at 117 West Street. On August 1, 2012, the building was added to the National Register of Historic Places.\nQuestion: The J.J. Deal and Son Carriage Factory is the only building from the 1700s that stands to this day in Jonesville, Michigan. True, False, or Neither?", "doc_id": 771, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34739, 4449, 1659, 6565], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tight is the debut album by the American rock band Mindless Self Indulgence. The album was originally released on April 20, 1999 through Uppity Cracker Recording Group. After having been out of print for many years, the album was reissued as Tighter on April 26, 2011 through The End Records. 
The reissue features updated artwork and packaging, 12 previously unreleased tracks, and a bonus DVD.\nQuestion: Tight was the first album for Mindless Self Indulgence. True, False, or Neither? True\n###\nAllium campanulatum is a species of wild onion known by the common name dusky onion or Sierra onion. This is a flowering plant native to the western United States from southeastern Washington and northern Oregon to southern California, and western Nevada. The dusky onion grows in foothills and mountains, especially in dry areas, such as chaparral habitats.\nQuestion: Allium campanulatum is found more frequently in southeastern Washington than in northern Oregon. True, False, or Neither? Neither\n###\nA Qualified Person Responsible for Pharmacovigilance, or QPPV, is an individual named by a pharmaceutical company as the main person responsible for ensuring that the company (the product's Marketing Authorisation Holder or MAH) meets its legal obligations for the monitoring of the safety of a medicinal product on the market.\nQuestion: A QPPV monitors safety True, False, or Neither? True\n###\nBoy Meets Girl is an ITV comedy-drama television miniseries starring Rachael Stirling and Martin Freeman. In the show, Danny Reed (Freeman) is struck by lightning. When he wakes up from the attack, he is inside the body of a woman, fashion journalist Veronica Burton (Stirling). Written by David Allison, the series began on 1 May 2009.\nQuestion: Boy Meets Girl won an emmy True, False, or Neither? Neither\n###\n\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982).\nQuestion: King of the Jungle was released in Korea. True, False, or Neither?", "doc_id": 412, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19318, 43966, 18265, 8312], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Roderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins.\nQuestion: Higgins was born the last day of the month. True, False, or Neither? True\n###\n\"September\" is the third and final single from Daughtry's second album \"Leave This Town\" (2009). This song was co-written by Chris Daughtry and Josh Steely. It was first released June 1, 2010 through RCA Records. The mid-tempo ballad is inspired by Chris's childhood memories growing up with his brother in a small town in North Carolina.\nQuestion: September is also a month. True, False, or Neither? True\n###\nJersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. 
The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide.\nQuestion: The musical never won a Tony Award. True, False, or Neither? False\n###\nAvani Modi is an Indian model and film actress, a well-known face in Indian movies and theatre plays in Gujarati theatre She made her Bollywood debut in Madhur Bhandarkar's drama film \"Calendar Girls\", which is scheduled to release on 25 September 2015. The movie is based upon the story of five girls and their journey as an annual calendar girl.\nQuestion: Avani Modi is an Indian model and film actress that is mostly known for her acting. True, False, or Neither? Neither\n###\nManos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Greek modeling agency since 2004. He participated in several international film projects as a lead main actor.\nQuestion: Manos Krystalis was a lead main actor before 2004. True, False, or Neither?", "doc_id": 980, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6018, 9509, 20012, 11315], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robin Weaver is an English actress well known for playing the role of recurring character Pamela Cooper in the E4 sitcom \"The Inbetweeners\" and its feature-length films, \"The Inbetweeners Movie\" and \"The Inbetweeners 2\". She has also appeared in several TV commercials.\nQuestion: The TV commercials starring Robin Weaver had the character Pamela Cooper. True, False, or Neither? Neither\n###\nO'Sullivan Army Heliport (ICAO: KCSL,\u00a0FAA LID: CSL) is a U.S. Army heliport at Camp San Luis Obispo in San Luis Obispo County, California, United States. It is located just off California State Route 1, northwest of the city of San Luis Obispo, about halfway between it and Morro Bay. O'Sullivan AHP has one helipad designated H1 with a 2,430 by 75\u00a0ft (741 by 23\u00a0m) asphalt surface.\nQuestion: The Heliport can hold 20 helicopters. True, False, or Neither? Neither\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Weltenbrand is a darkwave band from Liechtenstein formed in 1990 True, False, or Neither? False\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy.\nQuestion: Jacob Murphy was born 24 February 1992. 
True, False, or Neither? Neither\n###\nLarry Ruvo (1946) is the VP/GM of Southern Wine and Spirits of Nevada. He is a wine expert and philanthropist. He founded the Keep Memory Alive foundation and The Lou Ruvo Brain Institute. He serves on the American Gaming Association Board of Directors and is a member of the Gaming Hall of Fame (2005). He is married and has three daughters.\nQuestion: Larry Ruvo only has female children True, False, or Neither?", "doc_id": 48, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43191, 17462, 9235, 7554], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Suntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles.\nQuestion: Sadhu Kokila is not part of the main cast of Suntaragaali. True, False, or Neither? True\n###\nMetal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed.\nQuestion: Metal Gear Solid was released for Nintendo. True, False, or Neither? False\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: The film was released in the summer True, False, or Neither? True\n###\nThe Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise).\nQuestion: Ivor and Josiane Wood were married in France True, False, or Neither? Neither\n###\nMarwin Javier Gonz\u00e1lez (born March 14, 1989) is a Venezuelan professional baseball infielder with the Houston Astros of Major League Baseball (MLB). Primarily a shortstop, Gonz\u00e1lez has appeared at every position except for pitcher and catcher for the Astros.\nQuestion: He is in his twenties. True, False, or Neither?", "doc_id": 869, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11990, 25164, 31462, 78], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi.\nQuestion: Aatank Hi Aatank is a tv show. True, False, or Neither? False\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km .\nQuestion: Jiaozhou Bay Bridge holds a world record. True, False, or Neither? True\n###\nThe 1941 Cabo San Lucas hurricane is considered one of the worst tropical cyclones on record to affect Cabo San Lucas. The hurricane was first reported on September\u00a08 off the coast of Mexico. It slowly moved northwestward while intensifying. After peaking in intensity, it entered the Gulf of California, and weakened rapidly. It dissipated on September\u00a013.\nQuestion: The hurricane weakened rapidly after entering the Gulf of California True, False, or Neither? True\n###\n\"Trap Queen\" is the debut single by American rapper Fetty Wap from his self-titled debut album (2015). Following its online premiere in March 2014, it was released independently on April 22, 2014 before being re-released in conjunction with 300 Entertainment on December 15, 2014. The song was well received by critics who praised the vocals and production for being cheery and infectious.\nQuestion: The song was enjoyed by critics. True, False, or Neither? True\n###\nHoang Anh Gia Lai \u2013 Arsenal JMG Academy is a football academy in Pleiku, Gia Lai Province, Tay Nguyen of Vietnam. The academy is a built as cooperation between Arsenal Football Club, JMG Academy and the Vietnamese privately owned Hoang Anh Gia Lai Corporation. This football academy is the first one in Vietnam so far. It is also a feeder club to Hoang Anh Gia Lai in the V-League.\nQuestion: Hoang Anh Gia Lai is in the northern hemisphere True, False, or Neither?", "doc_id": 431, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30085, 28488, 9289, 19802], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "McColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers.\nQuestion: McColo was a small firm. True, False, or Neither? Neither\n###\nJeffrey B. 
Miller, AA, BS, MPA, was the former commissioner of the Pennsylvania State Police. Miller, a native of Harrisburg, Pennsylvania, served in that position from March 24, 2003, after being confirmed by the Pennsylvania State Senate, until August 8, 2008.\nQuestion: Jeffrey B. Miller was a Harrisburg, Pennsylvania, native. True, False, or Neither? True\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: Keystone has never been called by another name. True, False, or Neither? Neither\n###\nRobert Louis Boozer (April 26, 1937 \u2013 May 19, 2012) was an American professional basketball player in the National Basketball Association (NBA). He won a gold medal in the 1960 Summer Olympics and won an NBA Championship as a member of the Milwaukee Bucks in 1971.\nQuestion: Robert Louis Boozer won a gold medal in the 1960 Summer Olympics and won and NBA Championship in 1971 and regretted them both. True, False, or Neither? Neither\n###\nPunjab Control of Organised Crime Act, (PCOCA) is law to be enacted by Punjab state in India to combat organised crime. It is in process of approval as the Punjab Cabinet has yet not given its approval on account of few reservations about various clauses of the Act.The Act is designed on the pattern of Maharashtra Control of Organised Crime Act enacted in 1999.\nQuestion: Punjab Control of Organised Crime Act has no impact as of yet, as it has not yet passed all of the legislative hurdles required for act to be implemented True, False, or Neither?", "doc_id": 23, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21503, 6814, 7796, 43631], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records.\nQuestion: Tafoya will move on to create more hip hop collectives across the pacific north west. True, False, or Neither? Neither\n###\n\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child.\nQuestion: The overall mood of the story was joyous True, False, or Neither? 
Neither\n###\nEditing In the Mi(d)st is a ballet made by Miriam Mahdaviani to Oliver Knussen's \"The Way to Castle Yonder\" and excerpts from his \"Music for a Puppet Court\" and Aaron Jay Kernis' \"Overture in Feet and Meters\". The premiere took place June 21, 2002, as part of New York City Ballet's Diamond Project V at the New York State Theater, Lincoln Center.\nQuestion: The ballet was first viewed in the summer of 2002. True, False, or Neither? True\n###\nLori-Jane Powell (born November 8, 1971) is a retired Canadian racquetball player from Prince Albert, Saskatchewan. Powell was Canadian Champion five times: thrice in singles and twice in doubles. She was forced to retire from competition in 2006 due to a right knee injury.\nQuestion: Lori-Jane Powell was born in the eleventh month of the year. True, False, or Neither? True\n###\nDickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority.\nQuestion: You cannot land a private jet at Dickinson Theodore Roosevelt Regional Airport. True, False, or Neither?", "doc_id": 217, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3194, 10990, 42284, 35308], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ryan Potter (born September 12, 1995) is an American actor and martial artist. Beginning his career as a professional actor at the age of 15, Potter is perhaps best known for his starring role as Mike Fukanaga in \"Supah Ninjas\" and for voicing Hiro Hamada in \"Big Hero 6\" (2014).\nQuestion: Ryan Potter (born September 12, 1998) is an American actor and martial artist. True, False, or Neither? False\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area.\nQuestion: Colorado is not in Canada. True, False, or Neither? True\n###\nThe American Textile History Museum (ATHM), located in Lowell, Massachusetts, was founded as the Merrimack Valley Textile Museum (MVTM) in North Andover, Massachusetts in 1960 by Caroline Stevens Rogers. ATHM told America\u2019s story through the art, science, and history of textiles. In June 2016, the museum closed.\nQuestion: The American Textile History Museum is in the New England region. True, False, or Neither? True\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality.\nQuestion: Wall is not remembered. True, False, or Neither? 
False\n###\n\"Treme\" is an American television drama series created by David Simon and Eric Overmyer. It premiered on HBO on April 11, 2010. The series follows the interconnected lives of a group of New Orleanians in the wake of Hurricane Katrina. Episode titles are primarily taken from a blues or jazz song. The series concluded on December 29, 2013, after four seasons and 36 episodes.\nQuestion: The show had 8 episodes per season, typically. True, False, or Neither?", "doc_id": 121, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6492, 36322, 36452, 44375], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: The 2002 Indian vice-presidential election was held in August. True, False, or Neither? True\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event.\nQuestion: It was the 220th edition of the tournament and was held from 5 October through 13 October 1997. True, False, or Neither? False\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage.\nQuestion: KDMD covers local news events. True, False, or Neither? Neither\n###\nCapital Place Office Tower is a skyscraper at Jalan Jenderal Gatot Subroto in South Jakarta, Indonesia. The tower is part of Capital Place complex, which also occupies by Four Seasons Hotel . The office tower is a 215.1 meter tall, has 48 floors above & 6 floors below the ground.\nQuestion: The Four Seasons Hotel is taller than the Capital Place Office Tower. True, False, or Neither? Neither\n###\nFor Those Who Think Young is a 1964 beach party film shot in Techniscope, directed by Leslie H. Martinson and featuring James Darren, Pamela Tiffin, Paul Lynde, Tina Louise, Bob Denver, Nancy Sinatra, Robert Middleton, Ellen Burstyn (billed as Ellen McRae), Claudia Martin and Woody Woodbury.\nQuestion: For Those Who Think Young was shot on Techniscope beach as a 1964 party film directed by Leslie Martinson. 
True, False, or Neither?", "doc_id": 150, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38934, 26689, 21162, 28255], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Youth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class.\nQuestion: Youth in Guatemala mostly come from single parent families True, False, or Neither? Neither\n###\nThe National List (Italian: \"Lista Nazionale\" ) also known as \"Listone\" (literally \"Big List\") was a Fascist and nationalist coalition of political parties in Italy put together for the 1924 general election, and led by Benito Mussolini, Prime Minister of Italy and leader of the National Fascist Party.\nQuestion: There was a general election in 1924 True, False, or Neither? True\n###\nStephen R. \"Steve\" Bissette (born March 14, 1955) is an American comics artist, editor, and publisher with a focus on the horror genre. He is known for working with writer Alan Moore and inker John Totleben on the DC comic book \"Swamp Thing\" in the 1980s.\nQuestion: Steve Bissette is also known for other comic book projects. True, False, or Neither? Neither\n###\nThe Oakland Athletics' 1985 season involved the A's finishing 4th in the American League West with a record of 77 wins and 85 losses. While the Athletics' on-field performance continued to disappoint, the debut of slugger Jose Canseco gave fans a measure of hope.\nQuestion: In 1985 the Oakland A's were the 4th best team in the American league True, False, or Neither? True\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis.\nQuestion: Escape from Suburbia: Beyond the American Dream was written right after The End of Suburbia True, False, or Neither?", "doc_id": 520, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24262, 2716, 27975, 10355], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. 
On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups.\nQuestion: Mohamed Naufer once scored more than 5 goals in one match True, False, or Neither? Neither\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician.\nQuestion: Amor a la Mexicana is the fourth studio album by Thalia True, False, or Neither? False\n###\nDr. Donald N. Sills was a Baptist minister and one of the founders of George Wythe College, and previous chairman of the George Wythe Foundation Board of Trustees. He served as the first president of George Wythe College (now known as George Wythe University), and was succeeded by Oliver DeMille.\nQuestion: Dr. Donald N. Sills was 58 when he found the george wythe college True, False, or Neither? Neither\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: Zuikaku was sunk during the Battle of Leyte Gulf, but not before helping to bring the United States into the Pacific War. True, False, or Neither? True\n###\nThe Metal Men are a group of superheroes that appear in DC Comics. The characters first appeared in \"Showcase\" #37 and were created by writer Robert Kanigher and penciller Ross Andru. Debuting in the Silver Age of Comic Books, the characters have appeared in comic books and other \"DC Comics\"-related products such as animated television series, clothing, figurines and trading cards.\nQuestion: Ross Andru created the look of the Metal Men superheroes that appear in DC Comics. True, False, or Neither?", "doc_id": 90, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42862, 28650, 6141, 16881], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Hyundai Xcent is an A-segment sedan by Hyundai Motor Company. Based on the Hyundai Grand i10, the Xcent is manufactured by Hyundai Motor India Limited in Chennai. It made its debut on February 4, 2014, three days ahead of its world premiere at the Auto Expo 2014.\nQuestion: The Hyundai Xcent has been produced for 6 years. True, False, or Neither? False\n###\nStannis Baratheon is a fictional character in the \"A Song of Ice and Fire\" series of epic fantasy novels by American author George R. R. Martin, and its television adaptation \"Game of Thrones\". He is the second son of Steffon Baratheon, the lord of Storm's End, and his wife Lady Cassana Estermont, and brother to Robert and Renly.\nQuestion: Stannis Baratheon is the older brother to Robert and Renly. 
True, False, or Neither? Neither\n###\nDarrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include:\nQuestion: Darrell Abbott was only known by one nick True, False, or Neither? False\n###\nClay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan.\nQuestion: Clay County has a soccer team. True, False, or Neither? Neither\n###\nCan't Touch Us Now is the eleventh studio album by the British band Madness, released on their Lucky 7 Records label through Universal Music Catalogue (UMC) on 28 October 2016. The album marked the return of founder member Mark Bedford but the departure of Cathal Smyth (Chas Smash).\nQuestion: Can't Touch Us Now was released within the past 10,000 days. True, False, or Neither?", "doc_id": 265, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32635, 19175, 31299, 33454], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation.\nQuestion: The 89th Medium Tank Battalion was first used in winter True, False, or Neither? False\n###\nThe Flag of the City of Scarborough, Ontario was officially dedicated on August 19, 1969, by then-Mayor Albert Campbell at a special ceremony in Thomson Memorial Park. It was designed by local painter Doris McCarthy (1910\u20132010) in the spring of 1968, who was presented with the idea by her friend Albert Campbell.\nQuestion: The Flag of the City of Scarborough was officially dedicated in 1969. True, False, or Neither? True\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons.\nQuestion: Jules Buisseret won The Grand Prix des Fronti\u00e8res. True, False, or Neither? Neither\n###\nThe Takahashi River (\u9ad8\u6881\u5ddd , Takahashi-gawa ) is a Class A major river in the western part of Okayama Prefecture. It acts as the main drainage for the Takahashi River Drainage System, and is one of the three main drainage rivers in Okayama Prefecture (the others being the Yoshii River and the Asahi River).\nQuestion: The River is a Class A minor river. True, False, or Neither? 
False\n###\nNantwich Town Football Club is a semi-professional football club based in Nantwich, Cheshire, England. The club was founded in 1884 and is nicknamed \"The Dabbers\", a reference to the town's tanning industry. The club is currently a member of the Northern Premier League Premier Division, the seventh tier in the English football league system, with home matches played at the Weaver Stadium.\nQuestion: There are seven tiers in the English football league system. True, False, or Neither?", "doc_id": 695, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40292, 43103, 44075, 37502], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart.\nQuestion: \"Timeless: Begins\" peaked at #8 on the Gaon album chart in 2012. True, False, or Neither? True\n###\nGwinnett County Public Schools is a school district operating in Gwinnett County, Georgia, United States. GCPS is the largest school system in Georgia, with 139 schools and an estimated enrollment of 178,000 students for the 2016-2017 year. GCPS is estimated to be the 14th largest school district in the U.S. The district has its headquarters in an unincorporated area near Suwanee.\nQuestion: Gwinnett County Public Schools has people who can smell. True, False, or Neither? True\n###\nVampires appear throughout Stephen King's fictional multiverse. They appear in the novels \"'Salem's Lot\", \"\", \"\", and \"\"; the short stories \"One for the Road\", \"The Night Flier\", \"Popsy\", and \"The Little Sisters of Eluria\"; and are mentioned in a number of other stories. Marvel Comics' \"The Dark Tower: End-World Almanac\" includes a detailed entry on their categorization.\nQuestion: Stephen King mentions his previous works in the newer things he writes. True, False, or Neither? Neither\n###\nWanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001.\nQuestion: Wanker Records is a sucessful record label True, False, or Neither? Neither\n###\nChristelyn Karazin is an American writer, columnist, and blogger on the subject of interracial dating, particularly black women dating outside their race. She hosts the blog \"Beyond Black & White\" and has written for \"Woman's Day\", \"Ebony\", \"Jet\", and Reuters. Karazin attended Loyola Marymount University, where she wrote for \"The Los Angeles Loyolan\".\nQuestion: Christelyn Karazin has written for ten magazines. 
True, False, or Neither?", "doc_id": 212, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18770, 13285, 38395, 39594], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990).\nQuestion: Zale Dalen is a film and television director. He is not proud of his film the hounds of Notre Dame True, False, or Neither? Neither\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region.\nQuestion: ABC Western Victoria was sometimes hard to hear. True, False, or Neither? Neither\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage.\nQuestion: KDMD was not always owned by Ketchikan Television LLC. True, False, or Neither? Neither\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: Nashville West was covered by aerosmith. True, False, or Neither? Neither\n###\nThe 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title.\nQuestion: The 2011 Sudirman Cup was held more than 6667 days ago. True, False, or Neither?", "doc_id": 486, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35802, 24986, 42734, 26385], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Leventhorpe Academy is a mixed, 11-19 secondary school and sixth form in the historic market town of Sawbridgeworth, Hertfordshire. The school became a business and Enterprise Academy in August 2011. The intake at age 11 is drawn mainly from the pleasant and prosperous towns of Sawbridgeworth and Bishop's Stortford and from the surrounding villages.\nQuestion: Leventhorpe Academy receives children at age 11. True, False, or Neither? True\n###\nThe 1999 IAAF Grand Prix Final was the fifteenth edition of the season-ending competition for the IAAF Grand Prix track and field circuit, organised by the International Association of Athletics Federations. It was held on 11 September at the Olympic Stadium in Munich, Germany.\nQuestion: The IAFF Grand Prix Final in 1999 was held in Munich, Germany at the Olympic Stadium, and the track and field circuits season ending competition was coming to a close that day, a year still away from the sixteenth edition on that 11 September day. True, False, or Neither? True\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums).\nQuestion: The Asteroids Galaxy Tour could only perform live with a five-piece True, False, or Neither? True\n###\nForever the Moment () is a 2008 South Korean film. It is a fictionalized account of the South Korea women's handball team which competed in the 2004 Summer Olympics. The Korean title translates as \"The Best Moment in Our Lives,\" and it is believed to be the first film that revolves around the sport of handball.\nQuestion: Forever the Moment is a very wel lthough tof film True, False, or Neither? Neither\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: Allen Weiner has taught civics at Stanford Law. True, False, or Neither?", "doc_id": 545, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9724, 42258, 43120, 32334], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Last Exorcism Part II is a 2013 American supernatural drama horror film co-written and directed by Ed Gass-Donnelly. It stars Ashley Bell, Julia Garner, Spencer Treat Clark, David Jensen, Tarra Riggs, Louis Herthum, and Muse Watson. It is a sequel to 2010's \"The Last Exorcism\", and released on March 1, 2013.\nQuestion: The Last Exorcism Part II became a cult classic True, False, or Neither? 
Neither\n###\nTadpoles is the third album by the Bonzo Dog Band. It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\".\nQuestion: Tadpoles won an oscar True, False, or Neither? Neither\n###\nSvensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord.\nQuestion: The next celebration in after 1941 was held in 1943. True, False, or Neither? True\n###\nLarry Ruvo (1946) is the VP/GM of Southern Wine and Spirits of Nevada. He is a wine expert and philanthropist. He founded the Keep Memory Alive foundation and The Lou Ruvo Brain Institute. He serves on the American Gaming Association Board of Directors and is a member of the Gaming Hall of Fame (2005). He is married and has three daughters.\nQuestion: Larry Ruvo has never drunk a beer. True, False, or Neither? Neither\n###\nPrincess Masako Takeda (\u6052\u4e45\u738b\u5983\u660c\u5b50\u5185\u89aa\u738b , Tsunehisa \u014chi Masako naishinn\u014d ) , born Masako, Princess Tsune (\u5e38\u5bae\u660c\u5b50\u5185\u89aa\u738b , Tsune-no-miya Masako Naishinn\u014d , 30 September 1888 \u2013 8 March 1940) , was the tenth child and sixth daughter of Emperor Meiji of Japan and one of his consorts, Sono Sachiko.\nQuestion: Princess Takeda had five older sisters. True, False, or Neither?", "doc_id": 599, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25349, 30389, 42869, 42309], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Demoniac were a heavy metal band from New Zealand formed in Wellington in 1993 by singer and bass player Lindsay Dawson, guitarist Sam Totman and Drummer Steve Francis. They later moved to London, UK. Three of the members went on to form DragonForce. Their rather unusual musical style has often been labeled as \"blackened power metal\".\nQuestion: Demoniac were formed over 10 years ago True, False, or Neither? True\n###\nThe Market towns of Telemark and Aust-Agder counties (Norwegian: \"Kj\u00f8pstedene i Telemark og Aust-Agder fylker\" ) was an electoral district for parliamentary elections in Norway. It comprised the market towns (Norwegian: \"kj\u00f8psteder\" ) of Brevik, Krager\u00f8, Notodden, Porsgrunn and Skien in Telemark county and Arendal, Grimstad and Ris\u00f8r in Aust-Agder county.\nQuestion: Telemark has a population over 5000 True, False, or Neither? Neither\n###\nAnastasija Sevastova (born 13 April 1990) is a professional tennis player from Latvia. 
Having retired in 2013 due to recurring injuries, Sevastova returned to competition in 2015 and became known for her campaign at the 2016 US Open, where she defeated third-seeded Garbi\u00f1e Muguruza as well as Johanna Konta en route to her first ever Grand Slam quarterfinal.\nQuestion: Seveastova passed away in 2016. True, False, or Neither? False\n###\nWonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character.\nQuestion: NBC did not show a Wonder Woman show. True, False, or Neither? True\n###\nThe Deputy Assistant to the President for National Security Affairs, also known as the Deputy National Security Advisor, is a member of the Executive Office of the President of the United States and the United States National Security Council, serving as deputy to the President's National Security Advisor.\nQuestion: The Deputy Assistant to the President for National Security Affairs, also known as the Deputy National Security Advisor, is a member of the Oval Office of the President of the United States. True, False, or Neither?", "doc_id": 167, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41873, 37077, 26082, 28325], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "New Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: V. S. Reid is intelligent. True, False, or Neither? Neither\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit.\nQuestion: Sandstone can be found near the top of a mountain in the Spring Mountain range. True, False, or Neither? True\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars.\nQuestion: X X X X is a Japanese beer brand. True, False, or Neither? False\n###\nGlenn Martin Christopher Francis Quinn (May 28, 1970 \u2013 December 3, 2002) was an Irish actor in television and film, known for playing Mark Healy in the American sitcom \"Roseanne\", and Doyle, a half-demon, on \"Angel\", a spin-off series of \"Buffy the Vampire Slayer\".\nQuestion: Glenn Martin Christopher Francis Quinn is an Irish actor living in America. 
True, False, or Neither? Neither\n###\nHoward Culver (June 4, 1918 - August 4, 1984) was an American radio and television actor, best known as hotel clerk Howie Uzzell during the entire run of TV's \"Gunsmoke\". On radio he starred in the title role of the Western adventure series \"Straight Arrow\", which aired on Mutual from May 6, 1948 to June 21, 1951.\nQuestion: His radio show was on the air for over 3 years. True, False, or Neither?", "doc_id": 410, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23968, 41307, 41910, 4960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ralph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: Malone sat on the bench often while playing for the Cleveland Browns. True, False, or Neither? Neither\n###\nThe second series of the British television sketch comedy premiered on BBC Two on 21 July 2005. This series included six episodes with the concluding episode broadcast on 25 August 2005. A Christmas Special followed the second series and was screened on BBC Two on 20 December 2005.\nQuestion: The series ended on 21 July 2005 True, False, or Neither? False\n###\nGreivis Josu\u00e9 V\u00e1squez Rodr\u00edguez (born January 16, 1987) is a Venezuelan professional basketball player who last played for the Brooklyn Nets of the National Basketball Association (NBA). He was drafted in 2010 after a U.S. college career with the University of Maryland men's basketball team. V\u00e1squez finished second on the Terrapins' all-time scoring list, with 2,171 career points.\nQuestion: Greivis Josu\u00e9 V\u00e1squez Rodr\u00edguez was born in the winter of 1987. True, False, or Neither? True\n###\nLex Talionis Fraternitas, Inc. Sodalitas Ducum Futurorum is an exclusive fraternal organization of Filipino jurists, legal practitioners and law students founded on September 29, 1969 at the San Beda College of Law. A chapter in the Ateneo de Davao University School of Law was established in 1974. In 1983, the Securities and Exchange Commission granted the incorporation of the fraternity.\nQuestion: Filipino jurists are members in the Lex Talionis Fraternitas, Inc. True, False, or Neither? True\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street.\nQuestion: A civil parish and village are the same thing. 
True, False, or Neither?", "doc_id": 65, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18700, 5755, 13478, 29216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Lonely Londoners is a 1956 novel by Tamil Trinidadian author Samuel Selvon. Its publication marked the first literary work focusing on poor, working-class blacks in the beat writer tradition following the enactment of the British Nationality Act 1948.\nQuestion: The Lonely Londoners is a book True, False, or Neither? True\n###\nAmandil is a fictional character from J.R.R. Tolkien's Middle-earth legendarium. Amandil was a Lord of And\u00fani\u00eb, succeeding his father N\u00famendil upon his death. Amandil is most noted for being the father of Elendil, founder of the N\u00famen\u00f3rean Realms in Exile.\nQuestion: Amandil is related to Elendil. True, False, or Neither? True\n###\nA symphonic song cycle can either refer to a symphony composed of separate movements played consecutively or to a set of symphonic works linked by theme, common composer, or common conductor. A symphonic cycle should not be confused with the closely related song cycle.\nQuestion: A symphonic cycle is not the same as a song cycle. True, False, or Neither? True\n###\nThe 1989 European Cup Winners' Cup Final was a football match contested between Barcelona of Spain and Sampdoria of Italy. It was the final match of the 1988\u201389 European Cup Winners' Cup and the 29th European Cup Winners' Cup Final. The final was held at Wankdorf Stadium in Bern, Switzerland, on 10 May 1989. Barcelona won the match 2\u20130 thanks to goals by Julio Salinas and Luis L\u00f3pez Rekarte.\nQuestion: This was Sampdoria's first loss in the cup. True, False, or Neither? Neither\n###\nSusan Peters (born Suzanne Carnahan; July 3, 1921 \u2013 October 23, 1952) was an American film, stage, and television actress. After studying acting with Austrian theatre director Max Reinhardt, she appeared in several uncredited bit parts before earning a minor supporting part in \"Santa Fe Trail\" (1940). Her supporting role in \"Tish\" led to Peters signing a contract with Metro-Goldwyn-Mayer in 1942.\nQuestion: Peters was alive in 1953. True, False, or Neither?", "doc_id": 651, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17217, 13438, 25142, 42686], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "State Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. 
The route is 6.5 mi long.\nQuestion: State Route 204 is not fun to drive on True, False, or Neither? Neither\n###\nCastle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel.\nQuestion: The 1981 game Castle Wolfenstein also spawned a later follow-up True, False, or Neither? True\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961.\nQuestion: After the severing of relations in 1961, the Cuban Embassy in Washington, DC was shuttered. True, False, or Neither? Neither\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name.\nQuestion: Benoit is the female form of the name. True, False, or Neither? False\n###\nMineral County Airport (FAA LID: 9S4) is a county-owned public-use airport located two nautical miles (3.7 km) southeast of the central business district of Superior, a town in Mineral County, Montana, United States. According to the FAA's National Plan of Integrated Airport Systems for 2011-2015, it is categorized as a \"general aviation\" facility.\nQuestion: Mineral County Airport (FAA LID: 9S4) will neither nor always be a county-owned public-use airport. True, False, or Neither?", "doc_id": 995, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5389, 42690, 25753, 32566], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Living on the Edge (full title Living on the Edge, the Poetic Works of G\u00e9rald Leblanc also known by its French language title L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc) is a 2005 documentary film by Canadian director of Acadian origin Rodrigue Jean. In this documentary, Rodrigue Jean pays tribute to his Acadian roots, focussing on the poetry of G\u00e9rald Leblanc.\nQuestion: Rodrigue Jean is an Acadian Canadian. True, False, or Neither? True\n###\nHim & Her is a British television sitcom about a lazy twenty-something couple: Steve and Becky, who live in Walthamstow, London. It was first broadcast in the United Kingdom on BBC Three on 6 September 2010. It is written by Stefan Golaszewski and stars Russell Tovey and Sarah Solemani. The theme tune is the song \"Boom Bang-a-Bang\" by Lulu.\nQuestion: Steve and Becky each had 3 jobs True, False, or Neither? Neither\n###\nCooper Manning (born March 6, 1974) is the host for the show \"The Manning Hour\" for Fox Sports. 
He is the oldest son of former professional football quarterback Archie Manning, and the older brother of former professional football quarterback Peyton Manning and current New York Giants quarterback Eli Manning.\nQuestion: Cooper Manning hates his brothers True, False, or Neither? Neither\n###\nResil B. Mojares is a Filipino ambassador, historian, and critic of Philippine literature. He has a Ph.D. in Literature from the University of the Philippines, Diliman. A retired Professor at the University of San Carlos (USC) in Cebu City, he was a founding director (1975\u201396) of USC's Cebuano Studies Center, a pioneering local studies center in the Philippines.\nQuestion: Resil B. Mojares will run for President in 2020 True, False, or Neither? Neither\n###\nBrian Wardle (born October 9, 1979) is an American college basketball coach and the current men's basketball coach at Bradley University. He was an assistant at Marquette from 2003\u20132005 and UW-Green Bay from 2005\u20132010. After the 2009-2010 season, Wardle was named head coach at UW-Green Bay. Upon his hiring, Wardle became the youngest head coach in NCAA Division I basketball.\nQuestion: Brian Wardle has never told a group of people what to do True, False, or Neither?", "doc_id": 93, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43981, 18646, 19301, 41064], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: Lloyd Cole is from a country whose capital is London. True, False, or Neither? True\n###\nJake McGing (born 22 May 1994) is a professional Australian footballer who plays as a central defender for Central Coast Mariners FC in the A-League. On 11 August 2015, he made his professional senior debut for Central Coast Mariners FC in the 2015 FFA Cup against Wellington Phoenix FC.\nQuestion: Jake McGing (born 12 May 1994) is a professional Australian footballer True, False, or Neither? False\n###\nChristopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records.\nQuestion: Sleep has thought about retiring. True, False, or Neither? Neither\n###\nKing Mongkut's University of Technology Thonburi (KMUTT or KMUT Thonburi, ) is an engineering and technology academy in Thailand, focusing on teaching, researching as well as serving industry. 
It is located in Thung Khru District, Bangkok and was founded on April 18, 1960.\nQuestion: The teachers at King Mongkut's University of Technology Thonburi teach teachers. True, False, or Neither? Neither\n###\nMax & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes.\nQuestion: Max & Shred has never aired outside the U.S. or Canada. True, False, or Neither?", "doc_id": 63, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41939, 6339, 39796, 33349], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity.\nQuestion: Leveraging productivity and creativity are two of Kdan Mobile Software's functions. True, False, or Neither? True\n###\nAllium campanulatum is a species of wild onion known by the common name dusky onion or Sierra onion. This is a flowering plant native to the western United States from southeastern Washington and northern Oregon to southern California, and western Nevada. The dusky onion grows in foothills and mountains, especially in dry areas, such as chaparral habitats.\nQuestion: Alliam campanulatum is a variety of potato. True, False, or Neither? False\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: Every member of Nashville West was happy the band broke up. True, False, or Neither? Neither\n###\nThe 2007 Porsche Tennis Grand Prix was a tennis tournament played on indoor hard courts. It was the 30th year of Porsche Tennis Grand Prix, and was part of the Tier II Series of the 2007 WTA Tour. It took place at the Porsche Arena in Stuttgart, Germany, from October 1 through October 7, 2007\nQuestion: The 2008 Porsche Tennis Grand Prix took place at the Porsche Arena in Stuttgart, Germany, from October 2 through October 9, 2008. True, False, or Neither? Neither\n###\nMidnight Tides is the fifth volume of Canadian author Steven Erikson's epic fantasy series, the \"Malazan Book of the Fallen\". Although it is part of the larger series, it has only limited references to the previous books. 
However, it is not a stand-alone volume as the events of the books \"Reaper's Gale\" and \"Dust of Dreams\" follow on from it.\nQuestion: Steven Erikson has written different series. True, False, or Neither?", "doc_id": 926, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36242, 18967, 14770, 28602], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Tinker Field was torn down True, False, or Neither? True\n###\nBaker College Preparatory High School (also known as Baker College Prep) is a public four-year charter high school located in the South Chicago neighborhood on the far south side of Chicago, Illinois. It is operated by the Noble Network of Charter Schools. It shares its campus with Bowen High School. Baker is named for civil and human rights activist Ella Baker.\nQuestion: Bowen High School is also know as Bowen College Prep a two year private that is named after an activist Baker. True, False, or Neither? False\n###\nThe Zora Neale Hurston House was the home of author Zora Neale Hurston in Fort Pierce, Florida. It was originally located at 1734 School Court but was moved north 500 feet in 1995 to 1734 Avenue L to allow for expansion of Lincoln Park Academy, the school at which Hurston taught. On December 4, 1991, it was designated as a U.S. National Historic Landmark.\nQuestion: The Zora Neale Hurston House was moved North by flatbed truck. True, False, or Neither? Neither\n###\nBlood Red Shoes are an alternative rock duo from Brighton, England consisting of Laura-Mary Carter and Steven Ansell. They have released four full-length albums, \"Box of Secrets\" (2008), \"Fire Like This\" (2010), \"In Time to Voices\" (2012), and \"Blood Red Shoes\" (2014) as well as several EPs and a number of singles. In 2014, they founded their own label, Jazz Life.\nQuestion: Blood Red Shoes did better after founding their own labe. True, False, or Neither? Neither\n###\nCupid Car Club, also known as Cupid Car Club M.P., was a short-lived American post-hardcore band consisting of Ian Svenonius on vocals, James Canty on drums, Steve Gamboa on guitar (all of which were previously members of Nation of Ulysses and later went on to form The Make-Up), and Kim Thompson (of The Delta 72) on bass and vocals.\nQuestion: Cupid Car Club was a hardcore band True, False, or Neither?", "doc_id": 124, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4134, 36215, 13865, 8252], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2016 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the second edition of the tournament which was part of the 2016 ATP Challenger Tour. It took place in Happy Valley, Australia between 2\u201310 January 2016.\nQuestion: All professional tennis tournaments are played on grass courts. True, False, or Neither? False\n###\nAlexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia.\nQuestion: Alexander Ivanovich Bastrykin had his finaces santioned True, False, or Neither? Neither\n###\nThe Ligier JS17 was a Formula One car designed by G\u00e9rard Ducarouge and Michel Beaujon for use by the Ligier team during the season. Powered by a Talbot-badged Matra V12, the JS17 was driven to two Grand Prix wins by Jacques Laffite. It was updated to JS17B specification for the season until it was replaced later that year by the JS19.\nQuestion: The Ligier JS17 is a person. True, False, or Neither? False\n###\nCross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart.\nQuestion: The album Timeless: Begins peaked at the eighth position on the Gaon Weekly Album chart True, False, or Neither? True\n###\nHearts of Stone is the fifth studio album by American rock band Stoneground, released in 1978 on Warner Bros. Produced by Bob Gaudio, it marked Stoneground's return to a major label, having released their previous album, \"Flat Out\" (1976), on their own label. \"Prove It\" was released as the first single from \"Hearts of Stone\".\nQuestion: Stoneground released their first studio album in 1970 True, False, or Neither?", "doc_id": 460, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43763, 19358, 35951, 32571], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Third Option is Vince Flynn's third novel, and the 2nd to feature Mitch Rapp, an American agent that works for the CIA as an operative for a covert counterterrorism unit called the \"Orion Team\". 
The first in the Mitch Rapp series American Assassin, was written later, but was a prologue to Kill Shot.\nQuestion: Flynn writes about real institutions among other topics True, False, or Neither? True\n###\nDuncan Ley is an Australian playwright, actor, theatrical producer and director who has also written for Sydney's Motion Picture Company. His play \"In Cold Light\" is currently in the production phase as it is turned into a feature film by Peter Slee Productions.\nQuestion: Duncan Ley is an Australian playwright, actor, theatrical producer and director who has also written for Sydney's Motion Picture Company. whose only play is \"In Cold Light\" True, False, or Neither? Neither\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Adriano Correia Claro will switch teams True, False, or Neither? Neither\n###\nThe 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns.\nQuestion: The German 88mm cannon was found to be very difficult to use. True, False, or Neither? Neither\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) .\nQuestion: Sabanci University was open for enrollment in 1994. True, False, or Neither?", "doc_id": 379, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41192, 38341, 21063, 34810], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Emigrante del Mundo\" is the debut single of Lucenzo. It was released in France initially in 2007 and a second time in 2010 after the success of the kuduro music promoted by Lucenzo's new hits. It also appears in the debut album of Lucenzo of the same title \"Emigrante del Mundo\".\nQuestion: \"Emigrante del Mundo\" was released on two separate occasions. True, False, or Neither? True\n###\nAlexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia.\nQuestion: Pskov is located next to a Russian mountain range. True, False, or Neither? 
Neither\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: Most of the house is brick True, False, or Neither? Neither\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery.\nQuestion: The Anchor Bankside is a comic book store in London True, False, or Neither? False\n###\nRefried beans (Spanish: \"frijoles refritos\") is a dish of cooked and mashed beans and is a traditional staple of Mexican and Tex-Mex cuisine, although each cuisine has a different approach when making the dish. Refried beans are also popular in many other Latin American countries.\nQuestion: Refried beans is also known as frijoles refritis in Spanish True, False, or Neither?", "doc_id": 609, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12815, 17960, 11815, 10680], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Edward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis.\nQuestion: Edward Annis has wrestled in 4 different wrestling associations True, False, or Neither? True\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp.\nQuestion: The Town was named after a person. True, False, or Neither? True\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer.\nQuestion: G\u00f6tz Freiherr von Houwald met Celine Dion. True, False, or Neither? Neither\n###\nCamping 3 is a 2016 French comedy film directed by Fabien Onteniente. It is a sequel to the 2010 film \"Camping 2\". 
The film was a box office success, having grossed over US$24.2 million in France, becoming the second highest-grossing domestic film in 2016, with 3,228,313 tickets sold.\nQuestion: The film Camping 3 sold over 3 million tickets in France in 2016. True, False, or Neither? True\n###\nLee Scott Wolosky (born July 17, 1968) is the former U.S. Special Envoy for Guantanamo Closure. He served under the last three U.S. Presidents in significant national security positions, and was on leave as a Partner at Boies, Schiller & Flexner LLP. On July 14 2016, President Obama accorded Wolosky the personal rank of Ambassador.\nQuestion: Wolosky was 48 years old when he was appointed Ambassador. True, False, or Neither?", "doc_id": 931, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12035, 44815, 17145, 34864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmhurst is a residential neighborhood in the southernmost part of Oakland, California. Originally a separate town, it was annexed by Oakland in 1909, and today is considered part of East Oakland. It lies at an elevation of 39 feet (12 m). It contains the Eastmont Town Center.\nQuestion: Elmhurst lies at an elevation of 13 meters. True, False, or Neither? False\n###\nDr. Edward Vivian Scobie (1918 \u2013 14 November 1996) was a Dominican-born journalist, magazine publisher and historian. He is best known for his research into the black history of Western Europe and his 1972 seminal book \"Black Britannia: A History of Blacks in Britain\".\nQuestion: Dr. Edward Vivian Scobie was a journalist and eventually became a politician True, False, or Neither? False\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Witchhunt Records is a record company that has produced albums in the darkwave genre of music True, False, or Neither? True\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport.\nQuestion: the big planes operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe True, False, or Neither? Neither\n###\nThe 2009 British Speedway Championship was the 2009 edition of the British Speedway Championship. The Final took place on 20 May at Wimborne Road in Poole, England. The Championship was won by Chris Harris, who beat Edward Kennett, Tai Woffinden and Lee Richardson in the final heat. 
It was the second time Harris had won the title.\nQuestion: Chris Harris won his first British Speedway Championship beating Edward Kennett for the second time. True, False, or Neither?", "doc_id": 103, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38087, 3114, 5812, 12632], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis.\nQuestion: Jose Celestino Mutis. gets visits from people around the world. True, False, or Neither? Neither\n###\nGiovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg.\nQuestion: Giovanni Ferrero is an Italian soccer fan. True, False, or Neither? Neither\n###\nJusticia umbrosa (Brazilian plume, yellow jacobinia; syn. \"Adhatoda umbrosa\" Ness, and \"Justicia aurea\" Schltdl.) is an ornamental shrub native of Cerrado vegetation of Brazil. This plant may be propagated by herbaceous stem cutting, and it can usually get to 1,50 - 2,50 m tall. They flourish in the shade, and will not do well if overwatered.\nQuestion: Justicia umbrosa thrives best in direct sunlight conditions True, False, or Neither? False\n###\nThe Raid on Le Havre was a two-day naval bombardment of the French port of Le Havre early in July 1759 by Royal Navy forces under Rear-Admiral George Rodney during the Seven Years' War, which succeeded in its aim of destroying many of the invasion barges being gathered there for the planned French invasion of Great Britain.\nQuestion: France planned to invade Great Britain by sea True, False, or Neither? True\n###\nClay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan.\nQuestion: As of the census after two thousand nine, the population was 7,861. Its county seat and only incorporated city is Celina. True, False, or Neither?", "doc_id": 707, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [724, 13841, 27077, 25673], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rudyard Kipling's The Jungle Book is a 1994 live-action American adventure film co-written and directed by Stephen Sommers, produced by Edward S. Feldman and Raju Patel, from a story by Ronald Yanover and Mark Geldman. It is the second film adaptation by The Walt Disney Company of the Mowgli stories from \"The Jungle Book\" and \"The Second Jungle Book\" by Rudyard Kipling.\nQuestion: Rudyard Kipling's The Jungle Book debuted in the year preceding 1995 True, False, or Neither? True\n###\nNatalya Eduardovna Andrejchenko, Honored Artist of the RSFSR (1984) (Russian: \u041d\u0430\u0442\u0430\u0301\u043b\u044c\u044f \u042d\u0434\u0443\u0430\u0301\u0440\u0434\u043e\u0432\u043d\u0430 \u0410\u043d\u0434\u0440\u0435\u0301\u0439\u0447\u0435\u043d\u043a\u043e ; born May 3, 1956) is an actress. Her most famous roles include the title character in \"Mary Poppins, Goodbye\" and Lyuba in \"Wartime Romance\".\nQuestion: Natalya Andrejchenko was Mary Poppins. True, False, or Neither? True\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Nathan MacKinnon was born in the year 1995. True, False, or Neither? True\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\nQuestion: Emperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. After splitting up in 2001, they reunited from 2005 to 2007 for a few festival dates and brief US tours, and again in 2013 to 2014. The group was founded by Ihsahn and Samoth. True, False, or Neither? True\n###\nBlack Wind, White Land is a 1993 documentary film, researched and produced by the founders of the Chernobyl Children's Project International and explores the Chernobyl nuclear disaster of 1986 and its consequences for the handicapped development of the people in Belarus, Russia and Ukraine. The film was directed by Gene Kerrigan and produced by Ali Hewson, the wife of U2's singer Bono.\nQuestion: Bono sang in Black Wind, White Land. True, False, or Neither?", "doc_id": 858, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40745, 7470, 13094, 24671], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Cover on My Heart\" is a pop ballad performed by Guy Sebastian and is the third single from his third album \"Closer to the Sun\". Sebastian announced that this song was the album's third single in April 2007. The single was released on 28 July 2007 in Australia, set by his record label Sony BMG Australia. Sebastian performed the song on various programmes such as \"Sunrise\" and \"Rove Live\".\nQuestion: Guy Sebastian is a nice guy. True, False, or Neither? Neither\n###\nTarget Field is a baseball park in the historic warehouse (or North Loop) district of downtown Minneapolis. It is the home ballpark of the Minnesota Twins, the state's Major League Baseball (MLB) franchise. It also has served as the occasional home of Minnesota Golden Gophers baseball, and other local and regional baseball events.\nQuestion: The Minnesota Twins have never lost a game at Target Field True, False, or Neither? Neither\n###\nClear Hearts Grey Flowers is the second full-length and final album by Jack Off Jill. Produced by Chris Vrenna of Nine Inch Nails/Tweaker, it was released in July 2000 on the now-defunct label Risk Records. After \"Clear Hearts, Grey Flowers\" the band formally split up and moved on to establish other projects.\nQuestion: Risk Records released Clear Hearts Grey Flowers before they went defunct. True, False, or Neither? True\n###\nKalavu (Kannada: \u0c95\u0cb3\u0cb5\u0cc1) is a 2013 Kannada movie based on Dr KY Narayanaswamy's novel of the same title. The movie is the directorial debut of Ravi M who has worked with the production of the Hollywood film Inferno . Two French films, \"Blue Mountains\" and \"Child in Pondicherry\", launched his career in art direction. The film stars Umashree, Kari Subbu, Hulagappa Kattimani and others.\nQuestion: \"Blue Mountains\" was filmed before \"Child in Pondicherry\". True, False, or Neither? Neither\n###\nBudapest Gypsy Symphony Orchestra is a Hungarian symphony orchestra of Romani (Gypsy) musicians. It emphasizes works by composers inspired by Hungarian folk music including Johannes Brahms, Vittorio Monti, Piotr Tcha\u00efkovski, Johann Strauss and Johann Strauss II. The orchestra has been performing for\nQuestion: Budapest Gypsy Symphony Orchestra consists of ethnically diverse musicians True, False, or Neither?", "doc_id": 753, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30321, 31805, 15437, 15999], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "City Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. 
Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle.\nQuestion: City Mall is the largest mall in Jordan True, False, or Neither? Neither\n###\nIntervilles was a French comedy game show first broadcast in 1962. The show was aired since July 17, 1962 on RTF, then on ORTF. After stopping for 20 years, it reappeared on July 10, 1985 on FR3, then from July 4, 1986 to September 6, 1999 on TF1. France 3 aired the show since July 5, 2004, then France 3 from June 23, 2006 to August 26, 2009.\nQuestion: Intervilles is a popular French comedy game show True, False, or Neither? Neither\n###\nThe Last Exorcism Part II is a 2013 American supernatural drama horror film co-written and directed by Ed Gass-Donnelly. It stars Ashley Bell, Julia Garner, Spencer Treat Clark, David Jensen, Tarra Riggs, Louis Herthum, and Muse Watson. It is a sequel to 2010's \"The Last Exorcism\", and released on March 1, 2013.\nQuestion: There was only a writer for this movie True, False, or Neither? False\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey.\nQuestion: Ballymena United Football Club consists of ethnically diverse players True, False, or Neither? Neither\n###\nNico is the third studio album by American rock band Blind Melon, released in 1996 by Capitol Records. The album was released after lead singer Shannon Hoon's cocaine overdose that resulted in his death in 1995. The album was named for his daughter, Nico Blue, and the proceeds arising from album sales were placed in a college trust for her. It features\nQuestion: Lead singer Shannon Hoon was dead in 2000. True, False, or Neither?", "doc_id": 398, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17028, 28374, 29148, 15396], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hundreds of ancient stone religious monuments lie on the island of Java. Known as \"candi\" in Indonesian, they date from the early classical period of Javanese civilisation, beginning in the first part of the 8th century CE and ending after 900 CE. The majority were built between 780 CE and 860 CE, even though the civilisation that created them existed for many centuries.\nQuestion: All of the monuments were made after 900 CE. True, False, or Neither? False\n###\nGirilal Jain (1924 \u2013 19 July 1993), was an Indian journalist. He served as the editor of The Times of India from 1978 till 1988. He was sympathetic to Hindu nationalism and authored books on the subject, the best known of which, \"The Hindu Phenomenon\", was published posthumously. The Government of India awarded him the civilian honour of the Padma Bhushan in 1989.\nQuestion: The Times of India covered the award ceremony for the Padma Bhushan. True, False, or Neither? Neither\n###\nCranborne Priory was a priory in Cranborne in Dorset, England. 
The priory church survives as Cranborne's parish church, the Church of St\u00a0Mary and St\u00a0Bartholomew, and is a Grade I listed building, with parts of the building dating back to the 12th century.\nQuestion: Cranborne Priory is the church that I belong to of the 12th century with the building, which leads into the parish True, False, or Neither? Neither\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley.\nQuestion: Colorz of Rage featured hip-hop star Redman and the singer Cheryl as well as other stars. True, False, or Neither? Neither\n###\nSamuel Bronston (Samuel Bronshtein, March 26, 1908, Bessarabia \u2013 January 12, 1994, Sacramento, California) was a Bessarabian-born American film producer, film director, and a nephew of socialist revolutionary figure, Leon Trotsky. He was also the petitioner in a U.S. Supreme Court case that set a major precedent for perjury prosecutions when it overturned his conviction.\nQuestion: Samuel Bronston was not born in the United States. True, False, or Neither?", "doc_id": 395, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10837, 32737, 33818, 43981], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Anlo Youth Organisation (also known as the Anlo Youth Association) was a political party that existed in the Gold Coast and later Ghana. It campaigned for the Ewe people under British rule to stay within Ghana after independence. It ended by merging with other parties to form a united opposition to the Convention People's Party.\nQuestion: The Anlo Youth Association's views were different than those of the Convention People's Party. True, False, or Neither? True\n###\nSonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m).\nQuestion: Sonnette is home to many people. True, False, or Neither? Neither\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles.\nQuestion: Sadhu Kokila is one of the main cast in Suntaragaali True, False, or Neither? False\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. 
Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: Lloyd Cole is from a country whose capital is London. True, False, or Neither? True\n###\nThe Australia national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the South Africa national cricket team. The tournament was won by England. Australia were captained by Syd Gregory.\nQuestion: The England national cricket team lost every game of the 1912 Triangular Tournament True, False, or Neither?", "doc_id": 115, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34070, 789, 21925, 12824], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Boonie Bears III is a 2016 Chinese animated adventure comedy film directed by Ding Liang and Lin Yongchang. The film is the third installment in the \"Boonie Bears\" film series based on the animated series of the same name, following the 2015 film \"\". It was released in China on January 16, 2016. It will be followed by \"\", scheduled for release in 2017.\nQuestion: Boonie Bears III debut at number 1 True, False, or Neither? Neither\n###\nA Moment to Remember (; lit. \"Eraser in My Head\") is a 2004 South Korean film based on the 2001 Japanese television drama \"Pure Soul\". It stars Son Ye-jin and Jung Woo-sung and follows the theme of discovery in a relationship and the burdens of loss caused by Alzheimer's disease.\nQuestion: In the film, the characters played by Son Ye-jin and Jung Woo-sung break up True, False, or Neither? Neither\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc.\nQuestion: Ranila has houses. True, False, or Neither? True\n###\nThe Bowling Green Falcons men's basketball team is the basketball team that represent Bowling Green State University in Bowling Green, Ohio. The school's team currently competes in the Mid-American Conference. The team last played in the NCAA Division I Men's Basketball Tournament in 1968. The Falcons are now coached by Michael Huger, their 17th head coach.\nQuestion: Bowling Green was founded in 1950 True, False, or Neither? Neither\n###\nBabes in Arms is a 1937 musical comedy with music by Richard Rodgers, lyrics by Lorenz Hart and book by Rodgers and Hart. 
It concerns a group of small-town Long Island teenagers who put on a show to avoid being sent to a work farm by the town sheriff when their actor parents go on the road for five months in an effort to earn some money by reviving vaudeville.\nQuestion: Rodgers and Hart wrote a funny musical True, False, or Neither?", "doc_id": 470, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31000, 27639, 10809, 5706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement.\nQuestion: Princess Caroline of Gloucester died from measles True, False, or Neither? Neither\n###\nKimberly Ane Peirce (born September 8, 1967) is an American feature film director, best known for her debut feature film, \"Boys Don't Cry\" (1999). Her second feature, \"Stop-Loss\", was released by Paramount Pictures in 2008. Her most recent feature film, \"Carrie\", was released on October 18, 2013.\nQuestion: Kimberly Ane Peirce has directed more than one film True, False, or Neither? True\n###\nRuth Pryor (1906-2001) was a Chicago ballet dancer and instructor, and the first American ballerina to dance the role of the Swan Queen in Swan Lake, in 1930. She was known for \"her feat of whirling thirty-six times a minute on her toes,\" according to the Purple Parrot of Northwestern University.\nQuestion: Pryor began dancing at age 6 True, False, or Neither? Neither\n###\nThe Emami Kolkata Open ATP Challenger Tour (formerly known as State Bank of India ATP Challenger Tour) is a professional tennis tournament played on outdoor hard courts. It is currently part of the Association of Tennis Professionals (ATP) Challenger Tour. It is held annually at the Bengal Tennis Association Stadium in Kolkata, India since 2014.\nQuestion: The Emami Kolkata Open ATP Challenger Tour is mostly known as the State Bank of India ATP Challenger Tour. True, False, or Neither? Neither\n###\nThe 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title.\nQuestion: The tournament was postponed due to a terror threat True, False, or Neither?", "doc_id": 181, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11101, 23085, 40301, 10316], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Orange, Red, Yellow is a 1961 Color Field painting by Mark Rothko. It sold at Christie's for $86.882.500 on May 8, 2012. The seller was the estate of David Pincus and the sale price represents a record nominal price for Post-War / contemporary art at public auction and for Rothko works in general.\nQuestion: If you look at Orange, Red, Yellow long enough you will go blind. True, False, or Neither? Neither\n###\nJustin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL).\nQuestion: He was a popular player True, False, or Neither? Neither\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944.\nQuestion: Tippet was a prolific recording artist before meeting Holst. True, False, or Neither? Neither\n###\nKXST is a radio station licensed to North Las Vegas, Nevada, broadcasting to the Las Vegas, Nevada area on 1140 AM. The station is owned by CBS Radio, and broadcasts a sports talk format as part of the CBS Sports Radio network. The station's studios are located in the unincorporated Clark County area of Spring Valley, while its transmitter is near Nellis Air Force Base.\nQuestion: kxst broadcasts a sports talk format True, False, or Neither? True\n###\nDave Dennis (born 20 January 1986 in Sydney) is a national representative rugby union footballer who plays professionally for the Exeter Chiefs He was educated at Richmond High School in Sydney, when he played in the Australian Schoolboys Rugby team in 2004. His primary position is blindside flanker. He can also play No.8.\nQuestion: Dave Dennis spent most of his life in New Zealand True, False, or Neither?", "doc_id": 962, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27370, 409, 24386, 10469], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "WJMF-LP is a low-power television station in Jackson, Mississippi. The station, which currently operates on Channel 6, is owned by Rainey Radio. The station currently acts as a radio station broadcasting a Oldies & Classic Hits format as \"EZ 87.7\", taking advantage of that station's audio signal on 87.75 MHz FM.\nQuestion: Rainey Radio owns more radio stations in the area. True, False, or Neither? Neither\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. 
In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart.\nQuestion: Early Mac's debut EP was released four years ago by Foolay Ent., LLC. True, False, or Neither? False\n###\nFatsia japonica(syn. \"Aralia japonica\" Thunb., \"A. sieboldii\" Hort. ex K.Koch), also glossy-leaf paper plant, fatsi, paperplant or Japanese aralia, is a species of flowering plant in the family Araliaceae, native to southern Japan, southern Korea, and Taiwan.\nQuestion: Fatsia japonica is only native to Japan. True, False, or Neither? False\n###\nThe 2007 Grand National (known as the John Smith's Grand National for sponsorship reasons) was the 160th official annual running of the world-famous Grand National steeplechase which took place at Aintree Racecourse near Liverpool, England, on 14 April 2007 and attracted the maximum permitted field of forty competitors for a total prize money of \u00a3700,000 including \u00a3399,140 to the winner.\nQuestion: There was one winner of the 160th official annual Grand National steeplechase at Aintree Racecourse who won \u00a3399,140 out of the total prize money of \u00a3700,000. True, False, or Neither? True\n###\nThe National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy.\nQuestion: The Sisters of Mercy are a Christian organization. True, False, or Neither?", "doc_id": 319, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28636, 17519, 28076, 21825], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Laura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture.\nQuestion: Alexander Theroux loved to eat pizza. True, False, or Neither? Neither\n###\nSpittal is a hamlet or small village in East Lothian, Scotland, UK, on the B1377, east of Longniddry, south-south-west of Aberlady and to the west of Garleton and north of Gladsmuir. It is close to both Redhouse Castle, Gosford House and Spittal House.\nQuestion: Spittal is an urban city with millions of people. True, False, or Neither? False\n###\nThe 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. 
According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title.\nQuestion: May 23, 2011 was a sunny day in Qingdao. True, False, or Neither? Neither\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc.\nQuestion: ranila is a village in southern japan True, False, or Neither? False\n###\nAbdessadeq Cheqara (1931 \u2013 October 31, 1998) (in Arabic: \u0639\u0628\u062f \u0627\u0644\u0635\u0627\u062f\u0642 \u0634\u0642\u0627\u0631\u0629) was a Moroccan singer of traditional Andalusian classical music and Moroccan folk music. Known as the \"grand master of al-Ala (Andalusian music)\", he was also a violin and oud virtuoso.\nQuestion: He was well-regarded in his field. True, False, or Neither?", "doc_id": 850, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19635, 1688, 42593, 32602], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shitanshu Hargovindbhai Kotak (born 19 October 1972 in Rajkot) was an Indian first-class cricketer. A left-handed batsman, he has been a prolific run scorer for Saurashtra. Now he is the coach of Saurastra Cricket Team & soon will join Gujarat Lions IPL team as Assistant Coach.\nQuestion: Shitanshu Hargovindbhai Kotak is a thin man True, False, or Neither? Neither\n###\nDenis Villeneuve (] ; born October 3, 1967) is a French Canadian film director and writer. He is a four-time recipient of the Canadian Screen Award (formerly Genie Award) for Best Direction, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. The first three films also won the Academy of Canadian Cinema and Television Award for Best Motion Picture.\nQuestion: Villeneuve has never won an award. True, False, or Neither? False\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars.\nQuestion: X X X X is served cold or warm True, False, or Neither? Neither\n###\nThe Combat Box was a tactical formation used by heavy (strategic) bombers of the U.S. Army Air Forces during World War II. The combat box was also referred to as a \"staggered formation\". Its defensive purpose was in massing the firepower of the bombers' guns, while offensively it concentrated the release of bombs on a target.\nQuestion: The Combat Box was an extremely successful tactical formation. True, False, or Neither? Neither\n###\nTobias Svantesson (born April 1, 1963, in Malmo, Sweden), is a former professional tennis player from Sweden. 
He enjoyed most of his tennis success while playing doubles. During his career he won 2 doubles titles. He achieved a career-high doubles ranking of World No. 65 in 1991. His career high world ranking in singles was no 89.\nQuestion: Tobias Svantesson has played other than doubles, despite them being where he had the most success. True, False, or Neither?", "doc_id": 702, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1567, 33519, 6787, 23697], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "David Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe.\nQuestion: The show was not an animation True, False, or Neither? False\n###\nPiazza Colonna is a piazza at the center of the Rione of Colonna in the historic heart of Rome, Italy. It is named for the marble Column of Marcus Aurelius, which has stood there since AD 193. The bronze statue of Saint Paul that crowns the column was placed in 1589, by order of Pope Sixtus V. The Roman Via Lata (now the Via del Corso) runs through the piazza's eastern end, from south to north.\nQuestion: The Column of Marcus Aurelius was in Rome, Italy in 1589. True, False, or Neither? True\n###\nAn experience point (often abbreviated to exp or XP) is a unit of measurement used in tabletop role-playing games (RPGs) and role-playing video games to quantify a player character's progression through the game. Experience points are generally awarded for the completion of quests, overcoming obstacles and opponents, and for successful role-playing.\nQuestion: An experience point is gained through harassing other players True, False, or Neither? False\n###\nThe final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: The final of the 1983 Prudential Cup was the most exciting game of the century. True, False, or Neither? Neither\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961.\nQuestion: The building was originally constructed over 1999 days ago. True, False, or Neither?", "doc_id": 61, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28058, 20191, 827, 38693], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Joseph Eppele (born August 12, 1987) is a professional Canadian football offensive lineman for the Ottawa Redblacks of the Canadian Football League. He was drafted second overall by the Toronto Argonauts in the 2010 CFL Draft, being the first offensive lineman taken while being ranked fifth overall by the CFL's Amateur Scouting Bureau. He played college football for the Washington State Cougars.\nQuestion: Joseph was born in the summer of 1987. True, False, or Neither? True\n###\nBel Ami (; also known as \"Pretty Boy\", and \"'Pretty Man\", is a South Korean romantic comedy television series starring Jang Keun-suk, IU, Lee Jang-woo and Han Chae-young. Based on the same-titled 17-volume manhwa by Chon Kye-young, it aired on KBS2 from November 20, 2013 to January 9, 2014 on Wednesdays and Thursdays at 21:55 for 16 episodes.\nQuestion: Bel Ami had a very short run. True, False, or Neither? True\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced.\nQuestion: Vincenzo Natali Directed splice. True, False, or Neither? True\n###\nThe November 2004 San Francisco general elections were held on November 2, 2004, in San Francisco, California. The elections included seven seats to the San Francisco Board of Supervisors, four seats to the San Francisco Community College Board, four seats to the San Francisco Board of Education, and fourteen San Francisco ballot measures.\nQuestion: There was a recount in the elections. True, False, or Neither? Neither\n###\nAnne Frank: The Diary of a Young Girl is an original radio play by author Meyer Levin (1905\u20131981). It was adapted from Levin\u2019s original stage dramatization of the same name, adapted from \"The Diary of a Young Girl\", Anne Frank's diary. It aired on CBS on September 18, 1952, the eve of Rosh Hashanah, to critical acclaim, and again in November 1952.\nQuestion: The even of Rash Hashanah was the first time it was aired. True, False, or Neither?", "doc_id": 982, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40680, 10317, 37293, 27512], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Kindred is a 1987 American horror film directed by Jeffrey Obrow and Stephen Carpenter. Obrow also produced the film and co-wrote it along with Carpenter, Earl Ghaffari and John Penney. The film stars David Allen Brooks, Amanda Pays and Rod Steiger. 
It was released on January 9, 1987 and grossed just over $2 million.\nQuestion: Kindred is written by at least 4 people True, False, or Neither? True\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: The Shokaku class aircraft carriers were part of the reason the United States was brought into the Pacific War. True, False, or Neither? True\n###\nVampire Vs Vampire (\u4e00\u7709\u9053\u4eba) is a 1989 Hong Kong comedy horror film directed by and starring Lam Ching-ying. The title references the interaction in the film between a jiangshi child, a creature from Chinese \"hopping\" corpse fiction, and a British vampire based on Western vampire fiction.\nQuestion: Vampire Vs Vampire was released 11 years prior to the next century. True, False, or Neither? True\n###\nMoody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group.\nQuestion: The 2011 Grammy Award for Best Jazz Instrumental Album goes to James Moody for his exceptional instrumental album Moody 4B, released a year earlier. True, False, or Neither? True\n###\nDuncan Ley is an Australian playwright, actor, theatrical producer and director who has also written for Sydney's Motion Picture Company. His play \"In Cold Light\" is currently in the production phase as it is turned into a feature film by Peter Slee Productions.\nQuestion: Duncan Ley is from the Northern Hemisphere. True, False, or Neither?", "doc_id": 105, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12182, 31556, 15861, 33904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\".\nQuestion: Aniket Vishwasrao got a role only because he knew the director True, False, or Neither? Neither\n###\nNewlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005.\nQuestion: Newlyweds was a show about a boyband member and his wife. True, False, or Neither? 
True\n###\nJesco White, also known as the \"Dancing Outlaw\" (born July 30, 1956) is an American folk dancer and entertainer. He is best known as the subject of three American documentary films that detail his desire to follow in his famous father's footsteps while dealing with depression, drug addiction, alcoholism, and the poverty that permeates much of rural Appalachia.\nQuestion: poverty was the main reason of jesco white drug addiction and alcoholism True, False, or Neither? Neither\n###\nMax & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes.\nQuestion: Max & Shred appeared on Nickelodeon and YTV at the same time. True, False, or Neither? True\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C.\nQuestion: The Luton Town Ladies Football Club partnered with the Luton Town F.C. the year after it was founded. True, False, or Neither?", "doc_id": 254, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7022, 26970, 40726, 20848], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Drifters is a British sitcom that stars Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke as three female friends who live in Leeds following their graduation from university. All three actresses had previously appeared together in \"The Inbetweeners Movie\". Four series were broadcast, between 2013 and 2016.\nQuestion: Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke have all worked together before. True, False, or Neither? True\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar).\nQuestion: Beastie Boys were a bad American hip hop group. True, False, or Neither? Neither\n###\nAnnabelle's Affairs is a 1931 American pre-Code romantic comedy film directed by Alfred L. Werker and starring Victor McLaglen, Jeanette MacDonald and Roland Young. The film is based on the play \"Good Gracious Annabelle\" by Clare Kummer. It is the only one of MacDonald's films to be considered lost. It was well received by critics, but did not perform well at the box office.\nQuestion: Jeannette MacDonald made a lot of bad movies. True, False, or Neither? Neither\n###\n\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. 
The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel.\nQuestion: kiss and tell was the best song on the album True, False, or Neither? Neither\n###\nIn tabletop role-playing games, the character race represents the people to which a player character (PC) or a non-player character (NPC) belongs. \"People\" is to be taken in the broader sense, and may encompass ethnic groups, species, nationality or social groups.\nQuestion: \"people\" could mean elves in tabletop rpg games True, False, or Neither?", "doc_id": 876, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14925, 38422, 5802, 45059], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues.\nQuestion: Wireshark is used for communications protocol development. True, False, or Neither? True\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain.\nQuestion: The Santa Cova Funicular is not in England. True, False, or Neither? True\n###\nCherry, Harry & Raquel! is a 1970 softcore exploitation film produced and directed by American film director Russ Meyer. Following the success of \"Vixen!\" (1968), the film is notable for the first appearance of actor (and Meyer regular) Charles Napier playing Harry Thompson, a California border sheriff and marijuana smuggler who makes a reappearance in 1975's \"Supervixens\".\nQuestion: Charles Napier first appeared in the film \"Vixen!\" in the 60's. True, False, or Neither? True\n###\nA madrigal is a secular vocal music composition of the Renaissance and early Baroque eras. Traditionally, polyphonic madrigals are unaccompanied; the number of voices varies from two to eight, and most frequently from three to six. It is quite distinct from the Italian Trecento madrigal of the late 13th and 14th centuries, with which it shares only the name.\nQuestion: A madrigal is a secular vocal music composition distinct from the Italian Trecento madrigal of the 15th century. True, False, or Neither? False\n###\nCooper Manning (born March 6, 1974) is the host for the show \"The Manning Hour\" for Fox Sports. He is the oldest son of former professional football quarterback Archie Manning, and the older brother of former professional football quarterback Peyton Manning and current New York Giants quarterback Eli Manning.\nQuestion: Cooper Manning was born before his father retired from professional football. 
True, False, or Neither?", "doc_id": 361, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7721, 18103, 6011, 7370], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Resorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming.\nQuestion: Resorts Casino Tunica has gone through a couple name changes over the years. True, False, or Neither? True\n###\nThe Lord of the Rings: The Fellowship of the Ring is a 2001 New Zealand-American epic high fantasy adventure film directed by Peter Jackson based on the first volume of J. R. R. Tolkien's \"The Lord of the Rings\" (1954\u20131955). It is the first installment in \"The Lord of the Rings series\", and was followed by \"\" (2002) and \"\" (2003), based on the second and third volumes of \"The Lord of the Rings\".\nQuestion: J. R. R. Tolkien wrote the scripts for the The Lord of the Rings trilogy films as well as The Lord of the Rings books. True, False, or Neither? Neither\n###\nThe 1983 Summer Universiade, also known as the 1983 World University Games or XII Summer Universiade, took place in Edmonton, Alberta, Canada between July 1 and 12, 1983. Over 2400 athletes from 73 countries participated. It was the first time Canada hosted these Games. Edmonton also hosted the 1978 Commonwealth Games.\nQuestion: The 1983 World University Games took place in Edmonton, Alberta, Canada during winter True, False, or Neither? False\n###\nB&Q plc is a British multinational DIY and home improvement retailing company, headquartered in Eastleigh, England, United Kingdom and is a wholly owned subsidiary of Kingfisher plc. Founded by Richard Block and David Quayle in 1969 originally as Block & Quayle, the retail chain offers over 40,000 products across 300 stores and online.\nQuestion: B&Q plc is founded by Richard Block and Donald Trump True, False, or Neither? False\n###\nThe Amboy Dukes were an American rock band formed in 1964 in Detroit, Michigan, best known for their one hit single \"Journey to the Center of the Mind\". The band's name comes from the title of a novel by Irving Shulman. In the UK the group's records were released under the name of The American Amboy Dukes because of the existence of a British group with the same name.\nQuestion: Shulman died in 1964 True, False, or Neither?", "doc_id": 424, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4914, 25048, 32512, 34051], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kathryn Jane Calder (born June 17, 1982) is a Canadian indie rock musician, who performs as a solo artist, and is a member of the band The New Pornographers. She is a former member of Immaculate Machine. Calder started with The New Pornographers by filling in for Neko Case for live performances and was made a permanent member in 2006.\nQuestion: Neko's absence left a void in the band that fans feel Kathryn has not adequately filled. True, False, or Neither? Neither\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University.\nQuestion: The Old Time Gospel Hour Quartet performed twice a week. True, False, or Neither? False\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina.\nQuestion: Osella was born in 1985. True, False, or Neither? False\n###\nThe Last Boy Scout is a 1991 American action comedy film directed by Tony Scott, starring Bruce Willis, Damon Wayans, Chelsea Field, Noble Willingham, Taylor Negron and Danielle Harris. The film was released in the United States on December 13, 1991.\nQuestion: It was a canadian movie True, False, or Neither? False\n###\nA semi-automatic pistol is a type of pistol that is semiautomatic, meaning it uses the energy of the fired cartridge to cycle the action of the firearm and advance the next available cartridge into position for firing. One cartridge is fired each time the trigger of a semi-automatic pistol is pulled; the pistol's \"disconnector\" ensures this behavior.\nQuestion: A semi-automatic pistol can fire cartridges in rapid succession by holding down the trigger. True, False, or Neither?", "doc_id": 682, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25861, 16292, 23838, 38132], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The Most Valuable Player was given to less than three players. True, False, or Neither? 
True\n###\nCarol Ann Crawford (February 22, 1934 \u2013 August 10, 1982), also known as Carol Stolkin and Carol Ross, was an American backgammon and bridge player from Buffalo, New York who spent many years in Detroit, Michigan.. In 1973, she became the second woman to win the world backgammon championships.\nQuestion: Carol Ann Crawford never won. True, False, or Neither? False\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups.\nQuestion: Mohamed Izzadeen Mohamed Naufer has won an AFC challenge cup. True, False, or Neither? Neither\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996.\nQuestion: Trainspotting shows explicit drug use. True, False, or Neither? Neither\n###\nCorn crab soup is a dish found in Chinese cuisine, American Chinese cuisine, and Canadian Chinese cuisine. The soup is actually cream of corn soup with egg white and crab meat or imitation crab meat added. It is most likely of southern Chinese origin.\nQuestion: corn crab is a goumet dish True, False, or Neither?", "doc_id": 656, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28775, 5818, 25614, 18959], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode.\nQuestion: Kristen Bell was in the episode \"Fight or Flight\" on the show \"Heroes\". True, False, or Neither? True\n###\nJusticia umbrosa (Brazilian plume, yellow jacobinia; syn. \"Adhatoda umbrosa\" Ness, and \"Justicia aurea\" Schltdl.) is an ornamental shrub native of Cerrado vegetation of Brazil. This plant may be propagated by herbaceous stem cutting, and it can usually get to 1,50 - 2,50 m tall. They flourish in the shade, and will not do well if overwatered.\nQuestion: Justicia umbrosa will wilt in the shade True, False, or Neither? False\n###\nSusan Lynch (born 5 June 1971) is a Northern Irish actress. A three-time IFTA Award winner, she also won the British Independent Film Award for Best Supporting Actress for the 2003 film, \"16 Years of Alcohol\". Her other film appearances include \"Waking Ned\" (1998), \"Nora\" (2000), \"Beautiful Creatures\" (2000), and \"From Hell\" (2001).\nQuestion: Susan Lynch filmed multiple films in 2000 True, False, or Neither? 
Neither\n###\nRonald Francis Arias (born November 30, 1941) is a former senior writer and correspondent for \"People magazine\" and \"People en Espa\u00f1ol\". He is also a highly regarded author whose novel \"The Road to Tamazunchale\" has been recognized as a milestone in Chicano literature.\nQuestion: He has only written non-fiction. True, False, or Neither? False\n###\nEMP Merchandising also known as EMP Merchandising Handelsgesellschaft mbH, Large Popmerchandising, and Sweden Rock Shop is a German-based music mail order and merchandising store. The company distributes a quarterly catalog to customers. In a 2003 report the Osnabr\u00fcck Chamber of Commerce considered the company to be the largest mail order business for Heavy Metal and Hard Rock music in Germany.\nQuestion: EMP Merchandising was founded in 2003. True, False, or Neither?", "doc_id": 192, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20268, 34067, 10923, 36809], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christmas Eve is the day before Christmas Day, the festival commemorating the birth of Jesus of Nazareth. Christmas Day is observed around the world, and Christmas Eve is widely observed as a full or partial holiday in anticipation of Christmas Day. Together, both days are considered one of the most culturally significant celebrations in Christendom and Western society.\nQuestion: Christmas Eve and day are the most important holidays in Western Society. True, False, or Neither? Neither\n###\nBoonie Bears III is a 2016 Chinese animated adventure comedy film directed by Ding Liang and Lin Yongchang. The film is the third installment in the \"Boonie Bears\" film series based on the animated series of the same name, following the 2015 film \"\". It was released in China on January 16, 2016. It will be followed by \"\", scheduled for release in 2017.\nQuestion: Boonie Bears III was dubbed for an english version True, False, or Neither? Neither\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium.\nQuestion: moulton grew up in a four story mansion True, False, or Neither? Neither\n###\nUniversity Church of England Academy is a secondary school located in Ellesmere Port, Cheshire. It was formed in 2009 by the merger of Ellesmere Port Specialist School of Performing Arts (located at Woodchurch Lane) and Cheshire Oaks High School (located at Stanney Lane).\nQuestion: University Church of England Academy is a very clean school True, False, or Neither? Neither\n###\nUdinese Channel is a subscription-based channel, entirely dedicated to the Italian football team Udinese Calcio. 
The channel offers Udinese Calcio fans exclusive interviews with players and staff, full matches, including replays of all Serie A, Coppa Italia, and UEFA Cup games, in addition to vintage matches, footballing news, and other themed programming.\nQuestion: The channel features baseball once a month. True, False, or Neither?", "doc_id": 790, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14301, 32350, 10527, 36827], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Globacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide.\nQuestion: GLO operates in less than 20 countries. True, False, or Neither? True\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: Slender mint is native to Tasmania, Queensland, New South Wales, Victoria, and South Australia. True, False, or Neither? True\n###\n\"Emigrante del Mundo\" is the debut single of Lucenzo. It was released in France initially in 2007 and a second time in 2010 after the success of the kuduro music promoted by Lucenzo's new hits. It also appears in the debut album of Lucenzo of the same title \"Emigrante del Mundo\".\nQuestion: Lucenzo wrote more than one song. True, False, or Neither? True\n###\n\"Inbetweener\" is a song by English Britpop band Sleeper, written by the band's vocalist and guitarist Louise Wener. It was the third single to be released from their debut album \"Smart\" in 1995 (see 1995 in British music). It was their breakthrough single,\nQuestion: The song Inbetweener by English Britpop band Sleeper is over 3 years old True, False, or Neither? True\n###\nMinoo Mumtaz (born Malikunnisa Ali on 26 April 1942) is a former Indian film actress. She is the sister of India's ace comedian Mehmood Ali and part of the Mehmood Ali film family. Minoo Mumtaz appeared in many Hindi films of the 1950s and 1960s, mostly as a dancer and character actress.\nQuestion: Malikunnisa Ali was born more than 1942 years ago. True, False, or Neither?", "doc_id": 222, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3185, 22207, 39962, 40773], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points.\nQuestion: At least four games have been played in Memorial Stadium. True, False, or Neither? True\n###\nStand-In is a 1937 American comedy film directed by Tay Garnett and starring Leslie Howard, Joan Blondell, and Humphrey Bogart. The picture was produced by the independent Walter Wanger, and released by United Artists. It is set in Hollywood and parodies many aspects of the film industry during the Classical Era.\nQuestion: The film was very popular True, False, or Neither? Neither\n###\nThe Key is a 1958 British war film set in 1941 during the Battle of the Atlantic. It was based on the 1951 novel \"Stella\" by Jan de Hartog (later republished as \"The Distant Shore\" and \"The Key\") and was directed by Sir Carol Reed. William Holden, Sophia Loren and Trevor Howard starred in the production.\nQuestion: The Key was Williams Holden first appearance in a film. True, False, or Neither? Neither\n###\nLois Cleveland Chiles (born April 15, 1947) is an American actress and former fashion model known for her roles as Dr. Holly Goodhead in the 1979 James Bond film \"Moonraker\", and as a hit and run driver in 1987's \"Creepshow 2\", as well as such films as \"The Great Gatsby\", \"The Way We Were\", \"Death on the Nile\" and \"Broadcast News\".\nQuestion: Lois was a fashion model in America. True, False, or Neither? True\n###\nMichael Tunn (born 18 January 1974) is an Australian radio announcer and television presenter. He was hired by Australia's national youth station Triple J in 1990 at the age of 17, making him Australia's youngest professional radio presenter at the time.\nQuestion: Michael Tunn was hired by Australia's national youth station Triple J 13 years after he was born. True, False, or Neither?", "doc_id": 785, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17869, 27679, 6160, 14783], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents.\nQuestion: Kapp Heights had a population of less than 1000 in 2010. True, False, or Neither? 
True\n###\nThe 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament.\nQuestion: Belmont plays in the NCAA Division II. True, False, or Neither? False\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006.\nQuestion: Oliver was a very good priest. True, False, or Neither? False\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards.\nQuestion: Jake Deckard has won more than two awards. True, False, or Neither? Neither\n###\nSilver Bow County is a county in the State of Montana. As of the 2010 census, the population was 34,200. Its county seat is Butte. In 1977, the city and county governments consolidated to form the single entity of Butte-Silver Bow. Additionally, the town of Walkerville is a separate municipality from Butte and is within the county.\nQuestion: Silver Bow County borders South Dakota. True, False, or Neither?", "doc_id": 92, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4657, 39455, 9619, 19280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sophie Charlene Akland Monk (born 14 December 1979) is an English-born Australian singer, songwriter, actress, model and radio personality. Monk was a member of the girl group Bardot and released a solo album called \"Calendar Girl\" (2003). She has appeared in films such as \"Date Movie\" (2006), \"Click\" (2006), and \"Spring Breakdown\" (2009).\nQuestion: Sophie never appeared in movies. True, False, or Neither? False\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport.\nQuestion: Gulf Air has been used by dan. True, False, or Neither? Neither\n###\nEMP Merchandising also known as EMP Merchandising Handelsgesellschaft mbH, Large Popmerchandising, and Sweden Rock Shop is a German-based music mail order and merchandising store. The company distributes a quarterly catalog to customers. 
In a 2003 report the Osnabr\u00fcck Chamber of Commerce considered the company to be the largest mail order business for Heavy Metal and Hard Rock music in Germany.\nQuestion: There are no larger heavy metal mail order businesses in Germany. True, False, or Neither? True\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: Weiner Still works at stanford True, False, or Neither? True\n###\nHakea microcarpa , commonly known as small-fruit hakea is a flowering plant in the family Proteaceae and is endemic to eastern Australia. It is a spreading shrub, often growing in woodlands, heathlands and near swamps in montane areas of eastern Australia.\nQuestion: the plant can grow in mountains True, False, or Neither?", "doc_id": 484, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44865, 14245, 3362, 11591], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Airline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya.\nQuestion: No person on the continent of Africa has become a certified Captain of the Boeing 787. True, False, or Neither? False\n###\nJulia Goldani Telles (born March 18, 1995) is an American actress and ballet dancer. She is best known for her supporting role as Whitney Solloway on the Showtime original series \"The Affair\" and as Sasha Torres on the short-lived ABC Family series \"Bunheads\".\nQuestion: Julia Goldani Telles have been in an ABC Family Seris. True, False, or Neither? True\n###\nThe Col de la Croix Fry (1467 m ) is a mountain pass located in the Cha\u00eene des Aravis, between Manigod and La Clusaz in the Haute-Savoie department of France. The road over the col is used occasionally by the Tour de France cycle race with the tour crossing the pass on Stage 19 of the 2013 Tour. At the summit is the village of La Croix Fry.\nQuestion: The Tour de France is a cycle race. True, False, or Neither? True\n###\nDickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority.\nQuestion: Dickinson owns an airport in North Dakota True, False, or Neither? True\n###\n\"Look at Me (When I Rock Wichoo)\" is a song by American indie rock band Black Kids, taken from their debut album \"Partie Traumatic\". 
It was released in the UK by Almost Gold Recordings on September 8, 2008 and debuted on the Top 200 UK Singles Chart at number 175.\nQuestion: Look at Me (When I Rock Wichoo) ends with an O. True, False, or Neither?", "doc_id": 948, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6177, 16902, 8805, 906], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The European Democrat Union (EDU) is one of the three European wings of the International Democrat Union, along with the European People's Party (EPP) and the Alliance of European Conservatives and Reformists (AECR). Its members include Christian democratic, liberal conservative, and conservative political parties. It is only a nominal sub-entity of the IDU, since it ceased its activities in 2002.\nQuestion: The conservative Political Party is a member of the EDU True, False, or Neither? True\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. The animation is based on the award-winning children's picture book by Peter Collington.\nQuestion: The Angel and the Soldier Boy was the 13th song by Clannad. True, False, or Neither? False\n###\nThe Canyons is a 2013 American erotic thriller-drama film directed by Paul Schrader and written by Bret Easton Ellis. The film is set in Los Angeles and stars Lindsay Lohan, James Deen, Nolan Funk, Amanda Brooks, and Gus Van Sant. It received a limited release on August 2, 2013 at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms.\nQuestion: Nolan funk stared in a movie that had a limited release on the month of august 2013. True, False, or Neither? True\n###\nHenry Gabriel Murphy (1903\u20132001) was an American businessman, sportsman and Major League Baseball club owner. From June 1950 through April 1984, he was a minority stockholder in the Washington Senators/Minnesota Twins franchise of the American League.\nQuestion: Murphy was a Major League Baseball club player from 1950-1984. True, False, or Neither? False\n###\nGeorge White's Scandals is a 1934 American musical film directed by George White and written by Jack Yellen. The film stars Rudy Vall\u00e9e, Jimmy Durante, Alice Faye, Adrienne Ames, Gregory Ratoff, Cliff Edwards and Dixie Dunbar. The film was released on March 16, 1934, by Fox Film Corporation.\nQuestion: George White's Scandals was released more than 1934 seconds ago. True, False, or Neither?", "doc_id": 986, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14283, 7248, 31053, 36117], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Flatline\" is the ninth episode of the eighth series of the British science fiction television programme \"Doctor Who\", written by Jamie Mathieson, and directed by Douglas Mackinnon. The episode stars Peter Capaldi and Jenna Coleman, with Joivan Wade and Christopher Fairbank guest starring. The episode received critical acclaim, with particular praise directed at Coleman's performance.\nQuestion: Flatline was loved by many people. True, False, or Neither? True\n###\nMartin John Christopher Freeman (born 8 September 1971) is an English actor, who became known for portraying Tim Canterbury in the original UK version of sitcom mockumentary \"The Office\", Dr. John Watson in the British crime drama \"Sherlock\", Bilbo Baggins in Peter Jackson's \"The Hobbit\" film trilogy, and Lester Nygaard in the dark comedy-crime drama TV series \"Fargo\".\nQuestion: Martin John Christopher Freeman did not play Bilbo Baggins in \"The Hobbit\". True, False, or Neither? False\n###\nAmelio Robles Avila was a colonel during the Mexican Revolution. He was born a woman with the name of Amelia Robles \u00c1vila on November 3, 1889 in Xochipala, Guerrero. His father was named Casimiro Robles and his mother Josefa \u00c1vila. His father was a wealthy farmer who owned 42 acres of land and owned a small Mezcal factory.\nQuestion: The colonel was born Amelio and changed his name to Amelia. True, False, or Neither? False\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: Mentha diemenica is a popular species of mint from Tasmania and very good in cocktails. True, False, or Neither? Neither\n###\nEngine is the second album by American Music Club. It was jointly released by Frontier and Grifter in the US and by Zippo in the UK and Europe in 1987. The 1998 Warner Bros. Records reissue added three additional tracks from the same period. The artwork for the Zippo UK release features an incorrect track listing, putting the songs in the wrong order.\nQuestion: American Music Club released an album in 1986. True, False, or Neither?", "doc_id": 970, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33563, 14965, 2731, 20138], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Nutty Professor is a 1963 American science fiction-romantic comedy film produced, directed, co-written (with Bill Richmond) and starring Jerry Lewis. The score was composed by Walter Scharf. The film is a parody of Robert Louis Stevenson's \"Dr. Jekyll and Mr. Hyde\".\nQuestion: It was a made up story True, False, or Neither? 
Neither\n###\nCamping 3 is a 2016 French comedy film directed by Fabien Onteniente. It is a sequel to the 2010 film \"Camping 2\". The film was a box office success, having grossed over US$24.2 million in France, becoming the second highest-grossing domestic film in 2016, with 3,228,313 tickets sold.\nQuestion: One of the highest earning films and with a high number of tickets sold Camping 3 made lots of money. True, False, or Neither? True\n###\nDenis Villeneuve (] ; born October 3, 1967) is a French Canadian film director and writer. He is a four-time recipient of the Canadian Screen Award (formerly Genie Award) for Best Direction, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. The first three films also won the Academy of Canadian Cinema and Television Award for Best Motion Picture.\nQuestion: Denis Villeneuve is a French Canadian film director and writer who won the Canadian Screen Award for writing four times, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. True, False, or Neither? False\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808\nQuestion: The house is standing on a place that is now called Warren Hall Farm. True, False, or Neither? True\n###\nNBA 2K9 is a basketball simulation video game developed by Visual Concepts and published by 2K Sports. It is the tenth installment in the \"NBA 2K\" franchise and the successor to \"NBA 2K8\". It was released in 2008 for PlayStation 2, PlayStation 3, Xbox 360, and Microsoft Windows. Kevin Garnett is the cover athlete of the game. \"NBA 2K9\" is the predecessor to \"NBA 2K10\" in the \"NBA 2K\" series.\nQuestion: NBA 2K10 is the last in the \"NBA 2K\" series. True, False, or Neither?", "doc_id": 492, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8660, 18999, 21025, 19250], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart.\nQuestion: The band Cross Gene does not contain any female band members. True, False, or Neither? True\n###\nQuetta: A City of Forgotten Dreams is an upcoming Pakistani drama film directed by Murtaza Chaudary, written by Faysal Chaudary and co-produced by Faysal Chaudary, Sana Bucha under the Production banner \"Filmsaaz\", \"Sana Bucha Productions\". 
The film star Asal Din Khan, Abdullah Ghaznavi, Ali Karimi, Fayaz Hussain and Danyal Ali in lead roles.\nQuestion: The film star Asal Din Khan had roles in over 20 films. True, False, or Neither? Neither\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. The animation is based on the award-winning children's picture book by Peter Collington.\nQuestion: The Angel and the Soldier Boy is not an animated movie. True, False, or Neither? False\n###\nThe Albany Great Danes men's lacrosse team represents the University at Albany in NCAA Division I men's college lacrosse. Albany currently competes in the America East Conference and plays its home games on John Fallon Field. The team has reached the NCAA Men's Lacrosse Championship tournament nine times. The Great Danes are currently coached by Scott Marr.\nQuestion: The lacrosse team represents the University at Albany in NCAA Division 2 True, False, or Neither? False\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty.\nQuestion: Kilpatrick was a police officer True, False, or Neither?", "doc_id": 11, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37792, 36852, 1268, 27036], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Humans Need Not Apply is a 2014 short Internet documentary film, directed, produced, written, and edited by CGP Grey. The film focuses on the future of the integration of automation into economics, as well as the impact of this integration to the worldwide workforce. It was released online as a YouTube video.\nQuestion: Humans Need Not Apply is a 2014 short Internet documentary film about economics in the last century. True, False, or Neither? False\n###\nShannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University.\nQuestion: Shannon Kelley wants to coach SMU. True, False, or Neither? Neither\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898.\nQuestion: the mariner will rise up True, False, or Neither? 
Neither\n###\nBarry Redden (born July 21, 1960) is a former American football running back who played for the Los Angeles Rams, the San Diego Chargers, and the Cleveland Browns of the National Football League (NFL). He spent much of his career playing in the shadow of Pro Football Hall of Fame running back Eric Dickerson.\nQuestion: Barry Redden is a very funny man True, False, or Neither? Neither\n###\nThe Icelandic national under-18 basketball team is the representative for Iceland in international Under-18 age basketball competitions, and it is organized and run by the Icelandic Basketball Federation. The team represents Iceland at the FIBA Europe Under-18 Championship. It is coached by Fri\u00f0rik Ingi R\u00fanarsson.\nQuestion: Fri\u00f0rik Ingi R\u00fanarsson is a player on the basketball team. True, False, or Neither?", "doc_id": 521, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16622, 40520, 13273, 20091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mary Eliza Mahoney (May 7, 1845 \u2013 January 4, 1926) was the first African American to study and work as a professionally trained nurse in the United States, graduating in 1879. Mahoney was one of the first African Americans to graduate from a nursing school, and she prospered in a predominantly white society. She also challenged discrimination against African Americans in nursing.\nQuestion: Mary Eliza Mahoney passed her classes. True, False, or Neither? True\n###\nEugene Gearty is an American sound engineer. He was nominated for an Academy Award in the category Best Sound for the film \"Gangs of New York\". He has worked on over 80 films since 1983. At the 84th Academy Awards, Gearty won an Oscar for Best Sound Editing for his work on Martin Scorsese's \"Hugo\". He also won Emmy Award for Boardwalk Empire.\nQuestion: Eugene Gearty has worked on over 100 films since 1983 True, False, or Neither? False\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region.\nQuestion: ABC Western Victoria also owned a television studio. True, False, or Neither? Neither\n###\nErnest Asi Afiesimama is a Nigerian environmental and climate scientist who has worked for the Nigerian Meteorological Agency and was a consultant in environmental and climate affairs at Stern Integrated Projects. He currently works with the World Meteorological Organisation.\nQuestion: Hes a consultant in environmental and climate affairs True, False, or Neither? True\n###\nHumphrey Mieno Ochieng (born 25 December 1989 in Nairobi) is a Kenyan footballer who currently plays for Kenyan Premier League side Tusker and the Kenya national team as a midfielder. He previously played for A.F.C. 
Leopards Sofapaka and Kenya Commercial Bank in the Kenyan Premier League, as well as Tunisian side Club Africain and Tanzanian club Azam.\nQuestion: Humphrey Mieno Ochieng was born on Christmas Day True, False, or Neither?", "doc_id": 758, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5874, 33853, 1395, 20924], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise).\nQuestion: Josiane starred in as well as created The Magic Roundabout. True, False, or Neither? Neither\n###\nALGOL 68 (short for ALGOrithmic Language 1968) is an imperative computer programming language that was conceived as a successor to the ALGOL 60 programming language, designed with the goal of a much wider scope of application and more rigorously defined syntax and semantics.\nQuestion: ALGOL 68 is a language for computer programming. True, False, or Neither? True\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\".\nQuestion: b.j. thomas song hooked on a feeling was a success in 1968 True, False, or Neither? Neither\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy.\nQuestion: toronto fc is predicted to beat the LA galaxy True, False, or Neither? Neither\n###\nMarvin Karlton Rainwater (July 2, 1925 \u2013 September 17, 2013) was an American country and rockabilly singer and songwriter who had several hits during the late 1950s, including \"Gonna Find Me a Bluebird\" and \"Whole Lotta Woman\", a UK no.1 record. He was known for wearing Native American-themed outfits on stage and was 25 percent Cherokee.\nQuestion: Marvin Karlton Rainwater was a very badsinger True, False, or Neither?", "doc_id": 728, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11826, 44887, 3326, 32547], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Speedway Field was the original name for the airfield that was to evolve into Minneapolis-St. Paul International Airport, the twelfth busiest airport in the United States; it was also the largest hub for Northwest Airlines and the third largest hub for Delta Air Lines, Northwest's successor.\nQuestion: The former Speedway Field is the twelfth busiest airport in the U.S. True, False, or Neither? True\n###\nAnti-D\u00fchring (German: \"Herrn Eugen D\u00fchrings Umw\u00e4lzung der Wissenschaft\" , \"Herr Eugen D\u00fchring's Revolution in Science\") is a book by Friedrich Engels, first published in German in 1878. It had previously been serialised in a periodical. There were two further German editions in Engels' lifetime. \"Anti-D\u00fchring\" was first published in English translation in 1907.\nQuestion: Anti-D\u00fchring starts with C. True, False, or Neither? False\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song.\nQuestion: \"I'm Not the One's\" lead vocalist has a first name that starts with the letter R. True, False, or Neither? True\n###\nBouck's Island is a farm near Breakabeen, New York within the town of Fulton, Schoharie County, New York near Fultonham, New York. Bouck's Island was the home of former New York governor William C. Bouck. Congressman Joseph Bouck was born on Bouck's Island and Wisconsin Congressman Gabriel Bouck once lived there.\nQuestion: Joseph Bouck was born in Schoharie County True, False, or Neither? True\n###\n\"Chasing Colors\" is a song recorded by electronic DJs Marshmello and Ookay featuring the vocals of American singer Noah Cyrus. It was written by Marshmello, Ookay, Skyler Stonestreet and Chase Duddy and released on 24 February 2017 via Marshmello's label Joytime Collective.\nQuestion: DJ Ookay released a track on a Friday in February 2017 True, False, or Neither?", "doc_id": 327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23868, 34861, 3431, 7530], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grindhouse Releasing is a Hollywood-based independent cult film distribution company led by film editor Bob Murawski and co-founded by Sage Stallone. Grindhouse digitally remasters, restores, and produces bonus materials and video documentaries for cult film DVDs and Blu-rays which it distributes on the CAV label.\nQuestion: Grindhouse Releasing plans to release bonus scenes to star wars True, False, or Neither? 
Neither\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport.\nQuestion: the airline operates scheduled services to 41 destinations in 23 countries across all of Africa, Asia and Europe True, False, or Neither? Neither\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird.\nQuestion: The North African ostrich or red-necked ostrich can also be seen in European countries. True, False, or Neither? Neither\n###\nRobots is a 2005 American computer-animated adventure comedy film produced by Blue Sky Studios for 20th Century Fox. It was directed by Chris Wedge and produced by Jerry Davis, William Joyce, and John C. Donkin. It features the voices of Ewan McGregor, Halle Berry, Greg Kinnear, Mel Brooks, Amanda Bynes, Drew Carey, and Robin Williams.\nQuestion: Robots was the last film produced by Blue Sky Studios. True, False, or Neither? Neither\n###\nCarl Frederik Tietgen (19 March 1829 \u2013 19 October 1901) was a Danish financier and industrialist. The founder of numerous prominent Danish companies, many of which are still in operation today, he played an important role in the industrialisation of Denmark. Notably also forming conglomerates, several of Tietgen's companies attained a monopoly-like status, which cemented their durability.\nQuestion: Carl Frederik Tietgen was a popular figure in the industrialization of denmark True, False, or Neither?", "doc_id": 783, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30771, 20839, 17612, 24464], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Selma Diamond (August 6, 1920 \u2013 May 13, 1985) was an American comedic actress and radio and television writer, known for her high-range, raspy voice, and her portrayal of Selma Hacker on the first two seasons of the NBC television comedy series \"Night Court\".\nQuestion: Selma Diamond was married several times True, False, or Neither? Neither\n###\nSpring Breakdown is a comedy film starring Amy Poehler, Parker Posey, and Rachel Dratch. Three years after principal photography, and after the film's owner, Warner Independent Pictures, was shut down by its parent company, it was released direct-to-video in 2009.\nQuestion: Spring Breakdown made a lot of money True, False, or Neither? Neither\n###\nThe Ravenswood City School District is a public school district headquartered in East Palo Alto, California, US. The district, in the San Francisco Bay Area, serves the communities of East Palo Alto and eastern Menlo Park. 
Students from this school district who continue on with public schooling matriculate to the Sequoia Union High School District. In 2008-09 it served over 4,500 students.\nQuestion: The Ravenswood City School District served less than 4500 students in the 2008 school year True, False, or Neither? False\n###\nNew American Writing is a once-a-year American literary magazine emphasizing contemporary American poetry, including a range of innovative contemporary writing. The magazine is published in association with San Francisco State University. \"New American Writing\" is published by OINK! Press, a nonprofit organization. The magazine appears in early June each year. First published in 1986.\nQuestion: Students of San Francisco State University have contemporary poetry published in New American Writing. True, False, or Neither? Neither\n###\nTasmanian Devils is a 2013 television film directed by Zach Lipovsky and starring Danica McKellar and Apolo Ohno. The movie was first released onto the Syfy channel on January 19, 2013 and centers around a group of friends that get attacked by extremely large tasmanian devils. \"Radio Times\" rated the film poorly, giving it two out of 5 stars.\nQuestion: The Tasmanian devils is a film about a group of Tasmanian devils that get attacked by a large group of friends. True, False, or Neither?", "doc_id": 918, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34289, 20757, 2527, 30842], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: A lot of people liked her True, False, or Neither? Neither\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old.\nQuestion: Urschel played football professionally into his second decade. True, False, or Neither? False\n###\nKim Hyang-gi (born August 9, 2000) is a South Korean actress. Kim began her career as a child actress, and has starred in films and television series such as \"Wedding Dress\" (2010), \"The Queen's Classroom\" (2013), \"Thread of Lies\" (2014) and \"Snowy Road\" (2017).\nQuestion: The film Snowy Road was well received True, False, or Neither? Neither\n###\nDjibouti, officially the Republic of Djibouti, is a country located in the Horn of Africa. It is bordered by Eritrea in the north, Ethiopia in the west and south, and Somalia in the southeast. 
The remainder of the border is formed by the Red Sea and the Gulf of Aden at the east.\nQuestion: It is bordered by a sea named after a color True, False, or Neither? True\n###\nThe 1976 European Cup Winners' Cup Final was a football match between West Ham United of England and Anderlecht of Belgium. The final was held at Heysel Stadium in Brussels on 5 May 1976. It was the final match of the 1975\u201376 European Cup Winners' Cup tournament and the 16th European Cup Winners' Cup Final.\nQuestion: West Ham United played against another British team in the 1976 European Cup Winners' Cup Final in Brussels. True, False, or Neither?", "doc_id": 559, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9344, 16003, 16571, 12178], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A Day with Wilbur Robinson is a 1990 children's picture book (slightly expanded for a 2006 reissue) written and illustrated by William Joyce. A film adaptation called \"Meet the Robinsons\" was released by Walt Disney Pictures in 2007 in the United States.\nQuestion: Meet the Robinsons was 200 minutes long. True, False, or Neither? Neither\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey.\nQuestion: Ballymena United Football Club will compete in the next World Cup True, False, or Neither? Neither\n###\nLeft Hand Spring was a well-known watering stop on the old Chisholm Trail in present-day Blaine County, Oklahoma. The spring was named for \"Left Hand\", an Arapaho chief. Jesse Chisholm died there in 1868 and is buried nearby. His grave is marked with a granite historical marker.\nQuestion: Left Hand Spring isn't located in Canada. True, False, or Neither? True\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\".\nQuestion: Aniket Vishwasrao was paid more than his female counterparts True, False, or Neither? Neither\n###\nCatherine Breillat (] ; born 13 July 1948) is a French filmmaker, novelist and Professor of Auteur Cinema at the European Graduate School. She has often courted controversy with her films' frank treatment of sexual themes. For example, her 1976 debut film, \"A Real Young Girl\", was not released in theaters until 2000.\nQuestion: Catherine Breillat was born in 19488 True, False, or Neither?", "doc_id": 621, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35857, 9031, 23218, 3633], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Nathan MacKinnon plays in the position of forward. True, False, or Neither? True\n###\nResorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming.\nQuestion: The hotel will expand to a 250-room hotel. True, False, or Neither? Neither\n###\nJuan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974.\nQuestion: Juan Domingo Per\u00f3n met Clinton. True, False, or Neither? Neither\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire.\nQuestion: Peter John Reynolds worked for free at Butser Ancient Farm True, False, or Neither? Neither\n###\nState Highway\u00a0128 (SH-128) is a 2.198 mi state highway in the U.S. state of Idaho, serving the city of Lewiston in Nez Perce County. The highway travels east along the Clearwater River within Lewiston from Washington State Route\u00a0128 (SR\u00a0128) to U.S. Route\u00a012 (US-12).\nQuestion: Highway 128 traverses several states True, False, or Neither?", "doc_id": 420, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1962, 17697, 18722, 32281], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "SuperpinkyMandy is the debut studio album of British singer Beth Orton. Largely in the style of electronica, and produced closely with then boyfriend William Orbit, it was a limited Japan-only release, with about 5000 copies pressed. As such, it is very much sought after. Orton largely passes over the release when interviewed, citing 1996's \"Trailer Park\" as her first release.\nQuestion: SuperpinkyMandy is a rare album True, False, or Neither? 
True\n###\nCavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas.\nQuestion: The parade involves a commemoration. True, False, or Neither? True\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Ritchie Wenaweser and Simone Steiner do not love working together True, False, or Neither? Neither\n###\nReal Madrid Club de F\u00fatbol C, commonly known as Real Madrid C, was a Spanish association football team that played in the Tercera Divisi\u00f3n \u2013 Group 7. It was Real Madrid's second reserve team. They played their home games at La Ciudad del Real Madrid in Valdebebas outside the city of Madrid. At the end of the 2014\u201315 Tercera Division, Real Madrid C was disbanded.\nQuestion: Real Madrid C was disbanded on May 2015 True, False, or Neither? Neither\n###\n\"Aven Romale\" (Come in Gypsies), is a song by the Czech group Gipsy.cz that was the Czech entry at the 2009 Eurovision Song Contest held in Moscow, Russia. It scored zero points at the Eurovision Song Contest semi-final, thereby failing to qualify for the final.\nQuestion: \"Aven Romale\" was actually born in Poland True, False, or Neither?", "doc_id": 291, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24382, 42536, 8761, 32111], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani.\nQuestion: Dariush Mehrjui is also known for his work in the field of Physics True, False, or Neither? Neither\n###\nWest Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records.\nQuestion: Suge Knight and Dr. Dre were rivals True, False, or Neither? Neither\n###\nJunun is a 2015 album by the Israeli composer Shye Ben Tzur, the English composer and Radiohead guitarist Jonny Greenwood, and the Indian ensemble the Rajasthan Express. 
It was produced by Greenwood and recorded, mixed, and engineered by Radiohead producer Nigel Godrich.\nQuestion: The album was nominated for a Grammy. True, False, or Neither? Neither\n###\nPrincess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement.\nQuestion: Princess Caroline was 1 day old when she died True, False, or Neither? False\n###\nThe Volkswagen Citi Golf was a car produced by Volkswagen in South Africa from 1984 until 21 August 2009. It was a face-lifted version of the original Volkswagen Golf Mk1 hatchback, which ceased production in Germany in 1983. The car was produced only with right-hand drive.\nQuestion: The Citi Golf was occasionally produced with left-hand drive. True, False, or Neither?", "doc_id": 859, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39278, 1244, 3656, 24100], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Chingford branch line is a railway line between Clapton Junction (just west of Clapton station) and Chingford station. Services currently operate between Liverpool Street station and Chingford. The branch is currently part of the Lea Valley Lines network.\nQuestion: Liverpool Street station has issues with the homeless. True, False, or Neither? Neither\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire.\nQuestion: Peter John Reynolds was born in Scotland. True, False, or Neither? False\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration.\nQuestion: The High Bridge Branch never was near a body of water. True, False, or Neither? False\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808\nQuestion: The manor house was struck by lightning several times True, False, or Neither? Neither\n###\n\"Break the World\" is the lead single by alternative rock band Nine Lashes from their third album, \"From Water to War\". It was released on October 29, 2013 by Tooth & Nail Records. The song was the No. 
1 \"Billboard\" Christian Rock song on January 25, 2014 chart.\nQuestion: Nine Lashes did not debut with \"Break the World\" True, False, or Neither?", "doc_id": 583, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9748, 33563, 18412, 25293], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award.\nQuestion: Simon & Schuster approached David McCullough to ask him to write the book The Path Between the Seas. True, False, or Neither? Neither\n###\nThe Nutty Professor is a 1963 American science fiction-romantic comedy film produced, directed, co-written (with Bill Richmond) and starring Jerry Lewis. The score was composed by Walter Scharf. The film is a parody of Robert Louis Stevenson's \"Dr. Jekyll and Mr. Hyde\".\nQuestion: It was a made up story True, False, or Neither? Neither\n###\n[id] is the third studio album by deathcore band Veil of Maya. It was released through Sumerian Records on April 6, 2010. They worked with producer Michael Keene of death metal band The Faceless on this album. Keene previously worked with the band, producing their previous album \"The Common Man's Collapse\". It is the band's only album to feature bassist Matthew C. Pantelis.\nQuestion: Michael Keene had not previously worked with the band True, False, or Neither? False\n###\nKulte is a clothing label from Marseille. It was created in 1998 and in 2013 it owns more than 10 shops mainly in France (its first foreign shop opened in Athens in 2011). The brand collaborated with several artists (MGMT, Na\u00efve New Beaters) and music related organizations (including the music festivals, Marsatac and Transmusicales, and record labels, Because Music and Kitsun\u00e9).\nQuestion: Kulte's first shop outside France was in Greece. True, False, or Neither? True\n###\nBrennan Hesser (born 1980) is an American television actress, best known for co-starring in Tori Spelling's VH1 sitcom, \"So NoTORIous\". She also starred in Fox's drama, \"Jonny Zero\". She also guest starred in an episode of the CBS television show, \"The Guardian\". As a youngster, she attended the prestigious Interlochen Arts Camp in Northern Michigan.\nQuestion: Tori Spelling had a sitcom on VH1 called \"So.\" True, False, or Neither?", "doc_id": 897, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43626, 28930, 29202, 12981], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Albert Ball, VC, DSO & Two Bars, MC (14 August 1896 \u2013 7 May 1917) was an English fighter pilot during the First World War. At the time of his death he was the United Kingdom's leading flying ace, with 44 victories, and remained its fourth-highest scorer behind Edward Mannock, James McCudden, and George McElroy.\nQuestion: Albert Ball did not partake in the second World War. True, False, or Neither? True\n###\nDie Antwoord (] , Afrikaans for \"The Answer\") is a South African hip hop group formed in Cape Town in 2008. It comprises rappers Ninja and Yolandi Visser and producer God (formerly DJ Hi-Tek). Their image revolves around the South African counterculture movement known as zef and has incorporated work by other artists associated with the movement, such as photographer Roger Ballen.\nQuestion: Die Antwoord translates to the answer True, False, or Neither? True\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant.\nQuestion: Voyager did not want Neelix to join the crew. True, False, or Neither? Neither\n###\nThe 2015 J&T Banka Prague Open was a professional tennis tournaments played on outdoor clay courts. It was the 6th edition of the tournament which was an International tournament on the 2015 WTA Tour. It took place at the Sparta Prague Tennis Club in Prague, Czech Republic, from 27 April to 2 May 2015. This was the event's first edition as a WTA International tournament.\nQuestion: The first WTA International Tournament started in May of 2015. True, False, or Neither? False\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title.\nQuestion: The 2010 ASB Classic was for men and women True, False, or Neither?", "doc_id": 383, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22146, 15255, 5801, 18371], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. 
The album is made up of unreleased tracks recorded during band's heyday in the late 1970s.\nQuestion: The album was released in the century before the current century True, False, or Neither? True\n###\nUna questione privata is a 1993 Italian film directed by Alberto Negrin with a screenplay based on the WWII partisan novel of the same name by Beppe Fenoglio (1963) adapted by Raffaele La Capria. The film stars the young British actor Rupert Graves as Milton, C\u00e9line Beauvallet, and Claudio Mazzenga.\nQuestion: Una questione privata was based on a WWI novel that had the same name. True, False, or Neither? False\n###\nH\u00e9rcules de Alicante Club de F\u00fatbol, S.A.D. (] ) is a Spanish football team based in Alicante, in the autonomous community of Valencian Community. Founded in 1922, it currently plays in Segunda Divisi\u00f3n B \u2013 Group 3 and plays its home games at the 30,000-capacity Estadio Jos\u00e9 Rico P\u00e9rez.\nQuestion: Estadio Jos\u00e9 Rico P\u00e9rez has always been a 30,000 capacity stadium. True, False, or Neither? Neither\n###\nBlack Panther (\"H\u0113i B\u00e0o\" \u9ed1\u8c79 ) was a seminal Chinese rock band founded in 1987. It was originally fronted by one of China's alternative music pioneers Dou Wei. The band reunited and released a new album in 2013. The band's best known songs include \"Don't break my heart\", \"Shameful\" \u300a\u65e0\u5730\u81ea\u5bb9\u300b, \"Spirit of Light\" \u300a\u5149\u8292\u4e4b\u795e\u300b\uff0c \"No Right, No Wrong\" \u300a\u65e0\u662f\u65e0\u975e\u300b\uff0c and \"Our generation\" \u300a\u6211\u4eec\u8fd9\u4e00\u4ee3\u300b\nQuestion: Black Panther was founded in the 20th century True, False, or Neither? True\n###\nNew Hampshire Route 78 (abbreviated NH 78) is a 3.456 mi secondary state highway in Cheshire County in the southern part of the U.S. state of New Hampshire. A northward extension of Massachusetts Route 78, NH 78 runs entirely within the town of Winchester from the state border to downtown, where it ends at New Hampshire Route 10 and New Hampshire Route 119.\nQuestion: NH 78 is less than five miles long True, False, or Neither?", "doc_id": 329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34686, 26159, 7457, 19399], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "New Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: Campbell aged over the course of the book. True, False, or Neither? True\n###\n\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. 
Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\".\nQuestion: The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1980 True, False, or Neither? False\n###\nMission: Impossible III \u2013 Music from the Original Motion Picture Soundtrack is a soundtrack album for the 2006 film \"\", composed by Michael Giacchino. Unlike the previous two films in the series, there was no album released containing the film's contemporary music.\nQuestion: Mission: Impossible III did not have enough money to have an album release True, False, or Neither? Neither\n###\nFrancis Gary Powers (August 17, 1929 \u2013 August 1, 1977) \u2013 often referred to as simply Gary Powers \u2013 was an American pilot whose Central Intelligence Agency (CIA) U-2 spy plane was shot down while flying a reconnaissance mission in Soviet Union airspace, causing the 1960 U-2 incident.\nQuestion: Francis Gary Powers went by G. Powers. True, False, or Neither? False\n###\nThe first season of \"Charmed\", an American supernatural drama television series created by Constance M. Burge, premiered on October 7, 1998 on The WB. Airing on Wednesdays at 9:00 pm, the season consisted of 22 episodes and concluded its airing on May 26, 1999. Paramount Home Entertainment released the complete first season in a six-disc box set on February 1, 2005.\nQuestion: The airing of the first season of \"Charmed\" lasted 8 months True, False, or Neither?", "doc_id": 334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36219, 20776, 30211, 38195], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson.\nQuestion: Mckay has never ridden a skateboard. True, False, or Neither? False\n###\nAlex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\".\nQuestion: Alex Rider is the main character of the \"Alex Rider\" series. True, False, or Neither? True\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life.\nQuestion: Health For All has been used by dan. True, False, or Neither? Neither\n###\nA Daughter of the Wolf is a 1919 American silent drama film directed by Irvin Willat and written by Marion Fairfax and Hugh Pendexter. The film stars Lila Lee, Elliott Dexter, Clarence Geldart, Raymond Hatton, Richard Wayne, and Minnie Devereaux. 
The film was released on June 22, 1919, by Paramount Pictures.\nQuestion: There were speaking lines in A Daughter of the Wolf. True, False, or Neither? False\n###\nCorrina, Corrina is a 1994 American feature film set in 1959 about a widower (Ray Liotta) who hires a housekeeper/nanny (Whoopi Goldberg) to care for his daughter (Tina Majorino). It was written and directed by Jessie Nelson, in her feature film directing debut. It was the final film in which Don Ameche starred; he died shortly after filming was completed.\nQuestion: Corrina, Corrina was released in an even-numbered year. True, False, or Neither?", "doc_id": 782, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25131, 358, 36074, 540], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961.\nQuestion: The United States suffered minimally after relations with Cuba were severed. True, False, or Neither? Neither\n###\nSpy Corps is a spy film for Christian families that was written and directed by J David Baker. It stars Sarah Beth Hill as a fearful high school teenager, and Adam Hale as a secret member of the Reserve Spy Training Corps, a training program for high school students who want to pursue a career as a spy.\nQuestion: Reserve Spy Training Corps is a training program for Christian high school students. True, False, or Neither? Neither\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015.\nQuestion: Sebo lives in Venice, Los Angeles. True, False, or Neither? True\n###\nThe Mini Hatch, stylized as MINI hatch or MINI Hardtop in the US, also known as Mini Cooper or Mini One or simply the Mini, is a three-door hatchback first introduced in late 2000, with a second generation launched in 2006 and a third generation model launched in 2014. A convertible version was introduced in 2004, with the second generation following in 2008.\nQuestion: The second generation Mini was release twice. True, False, or Neither? True\n###\nWriting Degree Zero (French: \"Le degr\u00e9 z\u00e9ro de l'\u00e9criture\" ) is a book of literary criticism by Roland Barthes. First published in 1953, it was Barthes' first full-length book and was intended, as Barthes writes in the introduction, as \"no more than an Introduction to what a History of Writing might be.\"\nQuestion: Le degr\u00e9 z\u00e9ro de l'\u00e9criture is french for writing degree zero. 
True, False, or Neither?", "doc_id": 538, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24189, 45060, 18488, 9249], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Petasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Petasites have another nickname of Kinderbars. True, False, or Neither? Neither\n###\nSpring Breakdown is a comedy film starring Amy Poehler, Parker Posey, and Rachel Dratch. Three years after principal photography, and after the film's owner, Warner Independent Pictures, was shut down by its parent company, it was released direct-to-video in 2009.\nQuestion: Spring Breakdown is a comedy film starring Amy Poehler, Parker Posey, and Rachel Dratch released direct-to-video before two thousand nine. True, False, or Neither? False\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: LYU offers degrees in over 5 major disciplines True, False, or Neither? True\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: Keystone is an unincorporated community and with 59 people on their census in central Keith County, Nebraska, United States. True, False, or Neither? Neither\n###\nBrandon Hughes (born September 25, 1980), better known by his stage name 6 Tre G is an American hip hop recording artist, record producer, and CEO from Fayette, Alabama. He is also the founder and CEO of Mazerati Records. 6 Tre G has released many studio albums Don Mazerati, Boss Muzik, El Trapo and many more.\nQuestion: All of 6 Tre G's albums were released after 1980. True, False, or Neither?", "doc_id": 49, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27353, 8780, 16041, 22634], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. 
The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park.\nQuestion: Newnes railway line is in London. True, False, or Neither? False\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: Keystone is incorporated. True, False, or Neither? False\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: The cultural evolution of Pakistan is portrayed in the book End of the Past. True, False, or Neither? True\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: Allen S. Weiner was a co-director True, False, or Neither? True\n###\nSulejman Vokshi (1815 - 1890) was an Albanian military commander and leader of the League of Prizren. A member of the central committee of the league as head of the finances commission, Vokshi also was an important leader of the organization's military branch and an officer of its military staff.\nQuestion: Sulejman lived three quarters of a century before dying True, False, or Neither?", "doc_id": 901, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14787, 33021, 7575, 45132], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Versailles is a television series, set during the construction of the Palace of Versailles during the reign of Louis XIV, that premiered on 16 November 2015 on Canal+ in France and on Super Channel in Canada, in May 2016 on BBC2 in Britain, and on 1 October 2016 on Ovation in the U.S.\nQuestion: Versailles was filmed in Canada. True, False, or Neither? Neither\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. He is currently the quarterback coach at Garinger High School in Charlotte, NC.\nQuestion: Tory Woodbury will become the next coach of the New England Patriots True, False, or Neither? Neither\n###\nThe Portezuelo Formation is a geologic formation outcropping in the Mendoza, R\u00edo Negro and Neuqu\u00e9n provinces of Argentina. 
It is the fourth-oldest formation in the Neuqu\u00e9n Group and the older of the two formations in the R\u00edo Neuqu\u00e9n Subgroup. Formerly, that subgroup was treated as a formation, and the Portezuelo Formation was known as the Portezuelo Member.\nQuestion: Portezuelo Formation was not discovered by Argentinians. True, False, or Neither? Neither\n###\nThe Rhodesian ridgeback is a dog breed developed in South Africa. Its European forebears can be traced to the early pioneers of the Cape Colony of southern Africa, who crossed their dogs with the semi-domesticated, ridged hunting dogs of the Khoikhoi.\nQuestion: European Ridgebacks were developed in South Africa True, False, or Neither? False\n###\nMiriam Auhea Kalani Kui Kawakiu o Kek\u0101uluohi Keali\u02bbiuhiwaihanau o Kalani Makahonua Ahilapalapa Kai Wikapu o Kaleilei a Kalakua also known as Ka\u02bb ahumanu III (July 27, 1794 \u2013 June 7, 1845), was Kuhina Nui of the Kingdom of Hawaii, a queen consort of both King Kamehameha I and Kamehameha II, and mother of another king.\nQuestion: Miriam Auhea Kalani Kui Kawakiu o Kek\u0101uluohi Keali\u02bbiuhiwaihanau o Kalani Makahonua Ahilapalapa Kai Wikapu o Kaleilei a Kalakua's name was likely more than a dozen syllables True, False, or Neither?", "doc_id": 703, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37596, 32474, 12695, 33886], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Circus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee.\nQuestion: Circus Palestine wan and Israeli Film Award in 2005 True, False, or Neither? Neither\n###\nThe American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada.\nQuestion: The ACT has events in New York City. True, False, or Neither? Neither\n###\nVladislav Adolfovitch Rusanov (Russian: \u0412\u043b\u0430\u0434\u0438\u0441\u043b\u0430\u0432 \u0410\u0434\u043e\u043b\u044c\u0444\u043e\u0432\u0438\u0447 \u0420\u0443\u0441\u0430\u043d\u043e\u0432 ) is a fantasy writer, candidate of technical sciences (1999). Writes in Russian language. Also is known for translations of fantasy and romantic poetry into Russian. Formerly a Ukrainian citizen he now identifies with the Donetsk People's Republic.\nQuestion: Vladislav Adolfovitch Rusanov began writing fantasy stories while in high school True, False, or Neither? 
Neither\n###\nThe Coca-Cola Bottling Company of Cape Cod is a former bottler of Coca-Cola, Dr Pepper and Canada Dry soft drinks located in Sandwich, Massachusetts, United States. The company was bought out in 2000 by the Coca-Cola Bottling Company of Northern New England.\nQuestion: The Coca-Cola Bottling Company of Cape Cod was bought out before 2000 by the Coca-Cola Bottling Company of Northern New England. True, False, or Neither? False\n###\nJaeden Wesley Lieberher (born January 4, 2003) is an American actor. He is known for starring as Bill Denbrough in the horror film \"It\" (2017), and for his leading roles in the films \"St. Vincent\", as Oliver Bronstein, \"Midnight Special\", as Alton Meyer, \"The Confirmation\", as Anthony, \"The Book of Henry\", as Henry Carpenter.\nQuestion: Henry Carpenter was a character in Midnight Special True, False, or Neither?", "doc_id": 416, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2697, 16080, 19448, 33972], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harry Spencer Davis (born 24 September 1991) is an English professional footballer, who plays as a defender for Scottish Championship side St Mirren. Davis previously played with Crewe Alexandra. Early in his career, he was loaned by Crewe to Nantwich Town, Stafford Rangers and Curzon Ashton.\nQuestion: Harry Spencer Davis has been loaned to at least three other teams by Crewe Alexandra. True, False, or Neither? True\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. Gene Mayer won the singles title.\nQuestion: The 1982 Bavarian Tennis Championships was held in Berlin, Germany True, False, or Neither? False\n###\nHearts of Stone is the fifth studio album by American rock band Stoneground, released in 1978 on Warner Bros. Produced by Bob Gaudio, it marked Stoneground's return to a major label, having released their previous album, \"Flat Out\" (1976), on their own label. \"Prove It\" was released as the first single from \"Hearts of Stone\".\nQuestion: Prove It was released as the first single from Hearts of Stone. True, False, or Neither? True\n###\nThe Program in Creative Writing, more commonly known as the Iowa Writers' Workshop, at the University of Iowa in Iowa City, Iowa, is a much-celebrated graduate-level creative writing program in the United States. Writer Lan Samantha Chang is its director. Graduates earn a Master of Fine Arts (MFA) degree in Creative Writing.\nQuestion: The Program in Creative Writing is not located at Iowa State University. True, False, or Neither? True\n###\nTanya McQueen is an American reality television personality and interior designer on TV's . She made her debut on \"Extreme Makeover\" in an October 2005 episode titled, \"The Teas Family\". 
On August 2, 2011, McQueen and fellow Extreme Makeover personality Tracy Hutson debuted the show \"Picker Sisters\" on Lifetime.\nQuestion: Tanya McQueen had no friends True, False, or Neither?", "doc_id": 969, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19378, 35138, 44061, 40031], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Louis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels.\nQuestion: Louis Glenn Marson played for the Cleveland Indians. True, False, or Neither? True\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017.\nQuestion: ATO records is the best label for their type of music True, False, or Neither? Neither\n###\nIn tabletop role-playing games, the character race represents the people to which a player character (PC) or a non-player character (NPC) belongs. \"People\" is to be taken in the broader sense, and may encompass ethnic groups, species, nationality or social groups.\nQuestion: People is discontinued due to its criticism True, False, or Neither? Neither\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall.\nQuestion: The fortress also function as a dining hall. True, False, or Neither? Neither\n###\nOlga Bay larch or Olgan larch (\"Larix olgensis\"), a species of larch, is named after Olga Bay in the Sea of Japan. The common name in Chinese is \u9ec4\u82b1\u843d\u53f6\u677e (pinyin: huang hua luo ye song). This species occurs in Central Sikhote-Alin, and rarely occurs in North Korea, and Jilin and eastern Heilongjiang provinces of China, between 500 and 1100 metres in elevation.\nQuestion: Olga Bay larch is the same thing as huang hua luo ye song True, False, or Neither?", "doc_id": 252, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4710, 23847, 43369, 10039], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lemoyne is an unincorporated community and census-designated place in northern Keith County, Nebraska, United States. It lies along Nebraska Highway 92 on the northern shore of Lake C.W. McConaughy, north of the city of Ogallala, the county seat of Keith County. Its elevation is 3,333\u00a0feet (1,016\u00a0m). Although Lemoyne is unincorporated, it has a post office, with the ZIP code of 69146.\nQuestion: Not all communities in Keith County are unincorporated. True, False, or Neither? Neither\n###\nFS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani.\nQuestion: FS Kozani has lost most of it's games. True, False, or Neither? Neither\n###\nPillars of Eternity: The White March is a two-part expansion pack for the 2015 role-playing video game \"Pillars of Eternity\", developed by Obsidian Entertainment and published by Paradox Interactive. The first part was released on August 25, 2015, while the second was released on February 16, 2016.\nQuestion: In Pillars of Eternity players pretend to be someone they're not. True, False, or Neither? True\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer.\nQuestion: Gotz Freiherr von Houwald died on 7/16/2001 True, False, or Neither? False\n###\nNick Davis is a visual effects supervisor who has worked in visual effects since the early 1990s. He was nominated at the 81st Academy Awards for \"The Dark Knight\". He was nominated in the category of Best Visual Effects, he shared his nomination with Chris Corbould, Paul Franklin and Tim Webber.\nQuestion: Nick Davis began his career in visual effects during the final decade of the 20th century True, False, or Neither?", "doc_id": 202, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2909, 13371, 20586, 7124], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amara Karan (born 1984) is a Sri Lankan-English actress who made her film d\u00e9but as the love interest in Wes Anderson's \"The Darjeeling Limited\". The film premi\u00e8red at the 2007 Venice Film Festival. Karan's second film role was as schoolgirl Peaches in the 2007 film \"St Trinian's\".\nQuestion: Karan played many schoolgirls in films True, False, or Neither? 
Neither\n###\nFan and Mortar Geysers are two geysers in the Upper Geyser Basin in Yellowstone National Park. For the past several decades, they have erupted in concert with one another and are generally talked about together. The records detailing these geysers' known eruptive history shows that they have been infrequent and irregular performers.\nQuestion: Yellowstone National Park is the largest national park True, False, or Neither? Neither\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry.\nQuestion: Chess Records was working with Chuck Berry 63 years ago. True, False, or Neither? True\n###\nThe Real Howard Spitz is a 1998 family comedy film directed by Vadim Jean, produced by Paul Brooks and written by Jurgen Wolff. Starring Kelsey Grammer, Amanda Donohoe and Genevieve Tessier, it is a Canadian and U.K co-production. A failed detective writer, Howard Spitz has hit rock bottom until an 8-year-old girl helps him write children's books.\nQuestion: Paul Brooks produced a film starring Kelsey Grammer about a detective who recruits the help of a young girl to write children's books True, False, or Neither? True\n###\nThe Kur\u0161ininkai (Curonians; German: \"Kuren\" ; Lithuanian: \"kur\u0161ininkai, kur\u0161iai\" ; Latvian: \"kursenieki, kur\u0161i\" ; Polish: \"kuronowie pruscy\" ) are a nearly extinct Baltic ethnic group living along the Curonian Spit. \"Kur\u0161ininkai\" refers only to inhabitants of Lithuania and former East Prussia that speak a dialect of Latvian.\nQuestion: The Curonians were linguistically influenced by the Latvians. True, False, or Neither?", "doc_id": 680, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20051, 30929, 15405, 38418], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th.\nQuestion: Irina Cherniaeva was born in Moscow. True, False, or Neither? False\n###\nStoked (stylized as \"St\u014dked\") is a Canadian animated series produced by Fresh TV that premiered on Teletoon on June 25, 2009 and ended on January 26, 2013. It formerly aired on Teletoon in Canada and ABC3 in Australia, and on Cartoon Network in the United States. The series is from the same creators as \"6teen\" and the \"Total Drama\" series.\nQuestion: Stoked aired for less than exactly 4 years. True, False, or Neither? 
True\n###\nRyman Auditorium (formerly Grand Ole Opry House and Union Gospel Tabernacle) is a 2,362-seat live performance venue, located at 116 5th Avenue North, in Nashville, Tennessee and is best known as the home of the \"Grand Ole Opry\" from 1943 to 1974. It is owned and operated by Ryman Hospitality Properties, Inc.\nQuestion: Ryman Auditorium is the largest in Nashville, Tennessee. True, False, or Neither? Neither\n###\nLausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic.\nQuestion: Lausche is not shaped like a cone. True, False, or Neither? False\n###\n\"Drivin' Around Song\" is a song recorded by American country rap singer Colt Ford and country music singer Jason Aldean. It is the third single from his fourth studio album, \"Declaration of Independence\". The song was written by Chris Tompkins and Craig Wiseman.\nQuestion: Colt Ford was born in nineteen hundred fifty seven. True, False, or Neither?", "doc_id": 902, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9018, 33751, 33736, 32233], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Stillwater Cove Regional Park is situated by water. True, False, or Neither? True\n###\nThe Cit\u00e9 du Cin\u00e9ma is a film studio complex supported by the film director and producer Luc Besson, located in Saint-Denis, north of Paris. The studio complex is intended to be a competitor of Cinecitt\u00e0 in Rome, Pinewood in London and Babelsberg in Berlin. It was inaugurated on 21 September 2012.\nQuestion: The Cite du Cinema is in France. True, False, or Neither? True\n###\n\"You'll Be Back\" is the seventh song from Act 1 of the musical \"Hamilton\", based on the life of Alexander Hamilton, which premiered on Broadway in 2015. Lin-Manuel Miranda wrote both the music and lyrics to the song. It is sung by Jonathan Groff in the show's original cast recording.\nQuestion: Jonathan Groff wrote the music and lyrics to the song. True, False, or Neither? False\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts.\nQuestion: Dave Lee Murphy released a song that made it to the top 10 on the U.S. 
\"Billboard\" Hot Country Singles & Tracks charts in 1996. True, False, or Neither? True\n###\nCoptosapelteae is a tribe incertae sedis of flowering plants in the Rubiaceae family and contains about 55 species in 2 genera. Its representatives are found in tropical and subtropical Asia. This tribe has not been placed within as subfamily of Rubiaceae, but is sister to the rest of the family.\nQuestion: I am the tribe of Coptosapelteae True, False, or Neither?", "doc_id": 813, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17689, 24953, 13048, 1706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas.\nQuestion: The marchers usually wear bowler hats. True, False, or Neither? True\n###\nCarol Hernandez is an American journalist from Miami Florida. She won a 1996 Goldsmith Prize for Investigative Reporting. She won the 1996 Pulitzer Prize for National Reporting. She currently resides in Long Island with her husband, and three children, (the oldest being the best and most funny and creative).\nQuestion: Carol Hernandez has the same last name as her husband. True, False, or Neither? Neither\n###\nWireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues.\nQuestion: Ethereal ran into legal issues with its trademark so it was renamed. True, False, or Neither? True\n###\nThe Death and Life of John F. Donovan is an upcoming Canadian drama film, co-written, co-produced and directed by Xavier Dolan in his English-language debut. It stars Kit Harington, Natalie Portman, Jessica Chastain, Susan Sarandon, Kathy Bates, Jacob Tremblay, Ben Schnetzer, Thandie Newton, Amara Karan, Chris Zylka, Jared Keeso, Emily Hampshire and Michael Gambon.\nQuestion: The Death and Life of John F. Donovan is Kit Harington's English-language debut True, False, or Neither? Neither\n###\nCorrina, Corrina is a 1994 American feature film set in 1959 about a widower (Ray Liotta) who hires a housekeeper/nanny (Whoopi Goldberg) to care for his daughter (Tina Majorino). It was written and directed by Jessie Nelson, in her feature film directing debut. It was the final film in which Don Ameche starred; he died shortly after filming was completed.\nQuestion: Corrina, Corrina was based on a happy event. 
True, False, or Neither?", "doc_id": 591, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34093, 17050, 27903, 37500], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Lyra\" is a song written, produced, and performed by British recording artist Kate Bush, from the 2007 soundtrack album \"The Golden Compass\" from the film of the same name. It is used in the closing credits of the film. Bush was commissioned to write the song, with the request that it make reference to the lead character, Lyra Belacqua.\nQuestion: Kate bush writes songs in the early 2000's True, False, or Neither? True\n###\nThe Alfa Romeo Brera and the Alfa Romeo Spider (Type 939) are two sports cars manufactured by Alfa Romeo respectively between 2005-2010 and 2006-2010. The Brera is a 2+2 coup\u00e9, while the Spider is its roadster version. Both models were built by Pininfarina.\nQuestion: Pininfarina has only ever built one sports car model for Alfa Romeo True, False, or Neither? False\n###\nShabbona Township is one of nineteen townships in DeKalb County, Illinois, USA. As of the 2010 census, its population was 1,453 and it contained 603 housing units. The township contains the Chief Shabbona Forest Preserve and Shabbona Lake State Park.\nQuestion: In 2010, there was at least 600 housing units. True, False, or Neither? True\n###\nWanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001.\nQuestion: Wanker Records is a fair record label True, False, or Neither? Neither\n###\nNational Highway 26 (NH 26), (previously National Highway 43), is a National Highway in India, that connects Raipur in Chhattisgarh and passes through Odisha to connect with Natavalasa in Vizianagaram district of Andhra Pradesh. It connects National Highway 5 and National Highway 6 and transverses the Eastern Ghats.\nQuestion: National Highway 5 and 6 traverse the Eastern Ghats. True, False, or Neither?", "doc_id": 508, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24736, 19573, 12506, 5442], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Two Men And A Truck is a franchised moving company, headquartered in Lansing, Michigan, with franchises in 41 U.S. states, as well as the United Kingdom, Canada, and Ireland. The company is the largest franchised moving company in the United States with more than 410 locations.\nQuestion: British customers use Two Men And A Truck. True, False, or Neither? 
True\n###\nThe National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy.\nQuestion: The National Rehabilitation Hospital is a very bad hospital True, False, or Neither? Neither\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing.\nQuestion: Ingham County is Michigan's most sparsely populated county True, False, or Neither? Neither\n###\nBronwen (] ) is a Welsh feminine given name. It is closely associated with the similar name \"Branwen\", which appears in medieval Welsh literature. Used in Wales since the 19th century, it was introduced to the English-speaking public at large by a character in the Richard Llewellyn novel \"How Green Was My Valley\" (1939).\nQuestion: The name has seen a decline in use since the 70's. True, False, or Neither? Neither\n###\nMark Donovan (born 12 October 1968) is a Welsh character actor best known for his roles in productions such as \"Shaun of the Dead\", \"Black Books\", \"In Bruges\", and \"Murder Investigation Team\". He also played a brief scene of Hamlet in an episode of the David Renwick comedy-drama, \"Love Soup\". His stage roles include Gozark in \"Singin' in the Rain\" and Inspector Clay in \"Plan 9 from Outer Space\".\nQuestion: Mark Donovan was born on an even day True, False, or Neither?", "doc_id": 473, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10463, 25801, 26795, 27039], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Live from the Gaiety is a live album by The Dubliners. It was recorded during the Irish leg of their tour celebrating forty years on the road. The double album was recorded at the Gaiety Theatre in Dublin in June 2002. All surviving members took part. A companion double DVD of the concert in its entirety was also released.\nQuestion: It was recorded during the tour celebrating four decades on the road. True, False, or Neither? True\n###\nMurder Rock (Italian: Murderock - uccide a passo di danza; also known as Murder-Rock: Dancing Death, Slashdance and The Demon Is Loose!) is a 1984 Italian giallo film starring Olga Karlatos and Ray Lovelock, and written and directed by Lucio Fulci. Fulci recalled the producer forced him to turn the film into a musical with the music of Keith Emerson due to the success of \"Flashdance\".\nQuestion: Murder Rock was directed by Donald Trump True, False, or Neither? False\n###\nThe 1980 British Grand Prix (formally the XXXIII Marlboro British Grand Prix) was a Formula One motor race held at Brands Hatch on 13 July 1980. 
It was the eighth round of the 1980 Formula One season. The race was held over 76 laps of the 4.207-km (2.614-mile) circuit for a total race distance of 319.73 km (198.67 miles).\nQuestion: The first British Grand Prix was seen all over the world. True, False, or Neither? Neither\n###\nThe Hamas-Jund Ansar Allah clash was a battle, fought between the police forces of the Islamist group Hamas controlling Gaza, and the radical Islamist group Jund Ansar Allah. The fighting began on 14 August 2009 and concluded the next day. In total, 24 people were killed in the fighting, including six Hamas police officers and an 11-year-old girl, and a further 150 were wounded.\nQuestion: The battle was fought with incendiaries. True, False, or Neither? Neither\n###\nThe 1941 Cabo San Lucas hurricane is considered one of the worst tropical cyclones on record to affect Cabo San Lucas. The hurricane was first reported on September\u00a08 off the coast of Mexico. It slowly moved northwestward while intensifying. After peaking in intensity, it entered the Gulf of California, and weakened rapidly. It dissipated on September\u00a013.\nQuestion: The 1941 Cabo San Lucas hurricane was not a weather formation that one would consider taking precautions with True, False, or Neither?", "doc_id": 846, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23509, 25288, 23634, 29875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\".\nQuestion: Maps members are based in two bordering states. True, False, or Neither? True\n###\nKulte is a clothing label from Marseille. It was created in 1998 and in 2013 it owns more than 10 shops mainly in France (its first foreign shop opened in Athens in 2011). The brand collaborated with several artists (MGMT, Na\u00efve New Beaters) and music related organizations (including the music festivals, Marsatac and Transmusicales, and record labels, Because Music and Kitsun\u00e9).\nQuestion: MGMT and Marsatac are commercial partners of Kulte. True, False, or Neither? True\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold.\nQuestion: Tillya tepe contains a lot of ivory. True, False, or Neither? Neither\n###\nWilliam V. Bidwill Sr. (born July 31, 1931) is the principal owner and chairman of the board of the Arizona Cardinals of the National Football League (NFL). 
He was co-owner from 1962 for ten seasons with his brother Charles Jr. and has been sole owner since 1972.\nQuestion: The Arizona Cardinals existed in 1962. True, False, or Neither? True\n###\nWinnie the Pooh and a Day for Eeyore is a 1983 Disney Winnie the Pooh animated featurette, based on two chapters from the books \"Winnie-the-Pooh\" and \"The House at Pooh Corner\", originally released theatrically on March 25, 1983, with the 1983 re-issue of \"The Sword in the Stone\". It is the fourth and final of Disney's original theatrical featurettes adapted from the Pooh books by A. A. Milne.\nQuestion: A. A. Milne was successful True, False, or Neither?", "doc_id": 582, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2658, 19665, 31282, 22719], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cyrinda Foxe (born Kathleen Victoria Hetzekian; February 22, 1952 \u2013 September 7, 2002) was an American actress, model and publicist, best known for her role in \"Andy Warhol's Bad\" (1977). She was married to both David Johansen of the proto-punk band New York Dolls and Steven Tyler of the hard rock band Aerosmith. She is the mother of Mia Tyler.\nQuestion: Mia Tyler was in Andy Warhol's Bad True, False, or Neither? False\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign.\nQuestion: The brigade did not fight in Asia. True, False, or Neither? False\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons.\nQuestion: The Grand Prix des Fronti\u00e8res was a motor race held caused much stress to Jules Buisseret True, False, or Neither? Neither\n###\nGwinnett County Public Schools is a school district operating in Gwinnett County, Georgia, United States. GCPS is the largest school system in Georgia, with 139 schools and an estimated enrollment of 178,000 students for the 2016-2017 year. GCPS is estimated to be the 14th largest school district in the U.S. The district has its headquarters in an unincorporated area near Suwanee.\nQuestion: Gwinnett County Public Schools has teachers. True, False, or Neither? True\n###\nCarl Filip Anton Forsberg (] ; born 13 August 1994) is a Swedish professional ice hockey player. He is an alternate captain for the Nashville Predators of the National Hockey League (NHL). Forsberg was selected by the Washington Capitals in the first round (11th overall) of the 2012 NHL Entry Draft.\nQuestion: Carl Filip Anton Forsberg was born more than 3 hours ago. 
True, False, or Neither?", "doc_id": 205, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43807, 21591, 6428, 36886], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Player\" is a song recorded by American singer Tinashe. It features guest vocals by American singer Chris Brown. The song was released by RCA Records as the intended first single off her second album on October 2, 2015, but was later scrapped. \"Player\" was written by Tinashe, Myron Birdsong, Brown, its producers Lulou and Alexander Kronlund, and Chloe Angelides.\nQuestion: The song had six writers. True, False, or Neither? True\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: People pick black-eyed susans True, False, or Neither? Neither\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg.\nQuestion: Trevor wanted to call it the Currano Diagram True, False, or Neither? Neither\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain.\nQuestion: The Santa Cova Funicular is a expensive railway True, False, or Neither? Neither\n###\nEuroprop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas.\nQuestion: Europrop International is known by his first aircraft. True, False, or Neither?", "doc_id": 851, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36899, 774, 10264, 4488], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lee Scott Wolosky (born July 17, 1968) is the former U.S. Special Envoy for Guantanamo Closure. He served under the last three U.S. Presidents in significant national security positions, and was on leave as a Partner at Boies, Schiller & Flexner LLP. 
On July 14 2016, President Obama accorded Wolosky the personal rank of Ambassador.\nQuestion: Wolosky was in the Air Force. True, False, or Neither? Neither\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced.\nQuestion: Splice is a 2008 Canadian- True, False, or Neither? False\n###\n\"Se Telefonando\" is a song performed by the Italian singer Mina, released in May 1966. The music was composed, orchestrated and conducted by Ennio Morricone to Italian lyrics by Di Chiara and Costanzo. (Reportedly Costanzo only contributed one word, in editing a previous version of a verse, to avoid censorship). The song was written for a radio broadcast, called \u201cAria condizionata\u201d.\nQuestion: Mina passed away a few years ago. True, False, or Neither? Neither\n###\nWalkin' is the debut mini-album by South Korean singer Suran. It was released on June 2, 2017, by Million Market and distribuited by LOEN Entertainment. It consists of five songs, including \"Wine\" featuring rapper Changmo, previously released as a digital single, and the title track \"1+1=0\" featuring singer Dean.\nQuestion: Wine was one of five featured songs on the album from Suran. True, False, or Neither? True\n###\nShehzad Sheikh or Shahzad Sheikh is a Pakistani film and television actor and model, known for playing the lead role in the 2015 film \"Karachi Se Lahore\". He also starred in the series \"Annie Ki Ayegi Baraat\", \"Mi Raqsam\", and \"Mere Hamrahi\", and a TV film \"Main Kukkoo Aur woh\". He is the son of well-known actor Javed Sheikh.\nQuestion: Shehzad Sheikh and Javed Sheikh have both acted in the same film. True, False, or Neither?", "doc_id": 320, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36374, 34403, 15581, 2210], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Child Whispers (published in 1922) is the first published work of the English children's author Enid Blyton, illustrated by her childhood friend and collaborator Phyllis Chase. It is a collection of 28 poems, and one of Blyton's most popular and best-known poetry books.\nQuestion: Child Whispers was published in 1927 True, False, or Neither? False\n###\nThe 1919 PGA Championship was the second PGA Championship, which is now considered one of golf's major championships. It was held September 16\u201320 at the Engineers Country Club in Roslyn Harbor, New York, east of New York City on Long Island in Nassau County.\nQuestion: Golf's major championship was held in Nassau county True, False, or Neither? 
True\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation.\nQuestion: Paul Hausser played a huge role on the mass killing of jews. True, False, or Neither? Neither\n###\nDenis Villeneuve (] ; born October 3, 1967) is a French Canadian film director and writer. He is a four-time recipient of the Canadian Screen Award (formerly Genie Award) for Best Direction, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. The first three films also won the Academy of Canadian Cinema and Television Award for Best Motion Picture.\nQuestion: Denis Villeneuve was born in Montreal. True, False, or Neither? Neither\n###\nA meat analogue, also called a meat alternative, meat substitute, mock meat, faux meat, imitation meat, or (where applicable) vegetarian meat or vegan meat, approximates certain aesthetic qualities (primarily texture, flavor and appearance) and/or chemical characteristics of specific types of meat. Many analogues are soy-based (see: tofu, tempeh) or gluten-based.\nQuestion: Meat analogues do not contain meat. True, False, or Neither?", "doc_id": 528, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22069, 31219, 20396, 2562], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Penthouse is a 1933 American Pre-Code crime film starring Warner Baxter as a lawyer and Myrna Loy, as a call girl who helps him with a murder case. It was directed by W. S. Van Dyke and written by Frances Goodrich and Albert Hackett, based on a novel by Arthur Somers Roche. The film was later remade as the more sanitized \"Society Lawyer\" (1939), without the risqu\u00e9 pre-Code dialogue.\nQuestion: Penthouse is a complex film. True, False, or Neither? Neither\n###\nThe Cameroon Airlines Corporation, trading as Camair-Co, is an airline from Cameroon, serving as flag carrier of the country, a role which was previously filled by the now-defunct Cameroon Airlines. Camair-Co has its headquarters in the Immeuble La Rotonde in Douala, and operates out of Douala International Airport.\nQuestion: Camair-Co has a big headquarters. True, False, or Neither? Neither\n###\nLittle Fluffy Gigolo Pelu (Japanese: \u30d5\u30a1\u30f3\u30b7\u30fc\u30b8\u30b4\u30ed \u30da\u30eb , Hepburn: Fansh\u012b Jigoro Peru , a.k.a. \"Fancy Gigolo Pelu\") is a three \"tank\u014dbon\" manga series written and illustrated by Junko Mizuno and published by Enterbrain. The series has been licensed in North America and France where the first volume received mostly positive reviews.\nQuestion: The Series is available in at least 2 countries True, False, or Neither? True\n###\nSugar & Spice is a 2001 American teen crime comedy film directed by Francine McDougall, and starring Marley Shelton, Marla Sokoloff, Mena Suvari, James Marsden, and Melissa George. 
The plot follows a group of high school cheerleaders who conspire and commit armed robbery when one of them becomes pregnant and desperate for income.\nQuestion: The desperation of a group of cheerleaders to support their pregnant friend, starring Marla Sokoloff, is what lead to the movie plot in Sugar & Spice True, False, or Neither? Neither\n###\nAsbury First United Methodist Church is located on East Avenue in Rochester, New York, United States. It traces its heritage to several Rochester congregations dating back to the 1820s. In its current form, it is the result of a 1934 merger of First Church and Asbury Methodist Episcopal Church. With a congregation of 2,300 people, it is the largest United Methodist church in the Rochester area.\nQuestion: Asbury Methodist Episcopal Church will always be the largest church in the Rochester area. True, False, or Neither?", "doc_id": 437, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25983, 25129, 16092, 10317], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Ellens dritter Gesang \" (\"Ellens Gesang III \", D. 839, Op. 52, No. 6, 1825), in English: \"Ellen's Third Song\", was composed by Franz Schubert in 1825 as part of his Opus 52, a setting of seven songs from Walter Scott's popular epic poem \"The Lady of the Lake\", loosely translated into German.\nQuestion: Franz Schuber really liked The Lady of the Lake True, False, or Neither? Neither\n###\nAna B\u00e1rbara is the debut album by Mexican singer Ana B\u00e1rbara, released in 1994. She was nominated for a Premio Lo Nuestro Award in two Regional Mexican categories, including Best New Artist. She won her first Premio Furia Musical Award for Best New Artist.\nQuestion: Ana B\u00e1rbara sings in Spanish exclusively True, False, or Neither? Neither\n###\nCloverdale Depot is a bus station and future intermodal station in Cloverdale, California. It is served by Amtrak Thruway and Sonoma County Transit buses. Additional service to Sonoma County Airport station is provided by Sonoma County Transit under contract by Sonoma\u2013Marin Area Rail Transit.\nQuestion: Cloverdale is the least populated town in Sonoma County True, False, or Neither? Neither\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: The Shokaku class aircraft carriers were part of the reason the United States was brought into the Pacific War. True, False, or Neither? True\n###\nJatin\u2013Lalit are a Bollywood film composer duo consisting of Jatin Pandit and his younger brother Lalit. 
They have written the scores for films such as \"Khiladi\", \"Jo Jeeta Wohi Sikandar\", \"\", \"Dilwale Dulhania Le Jayenge\", \"Yes Boss\", \"Jab Pyaar Kisise Hota Hai\", \"Kuch Kuch Hota Hai\", \"Mohabbatein\", \"Kabhi Khushi Kabhi Gham\" and \"Fanaa\" .\nQuestion: Jatin-Lalit write primarily for an Indian market True, False, or Neither?", "doc_id": 318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42798, 1493, 33329, 44028], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Phoenix Police Department is the law enforcement agency responsible for the city of Phoenix, Arizona. Today , the Phoenix Police Department comprises more than 2,900 officers and more than 1,000 support personnel. The department serves a population of more than 1.6 million and patrol almost 516 sqmi of the sixth largest city in the United States.\nQuestion: The police department in Phoenix is named after the city its responsible for. True, False, or Neither? True\n###\nTabitha Anastasia \"Tibby\" Tomko-Rollins is a fictional character in the 2001 novel \"The Sisterhood of the Traveling Pants\" and the 2005 film based upon it. She is a member of the titular club, along with her friends Lena Kaligaris, Bridget Vreeland and Carmen Lowell. She was portrayed by Amber Tamblyn in the film.\nQuestion: Lena Kaligaris, Bridget Vreeland, Carmen Lowell and Amber Tamblyn each portrayed the same character in the 2005 film based on the 2001 novel \"The Sisterhood of the Traveling Pants\", except Lena Kaligaris. True, False, or Neither? False\n###\nSyracuse IMG Sports Network is the radio and television name for Syracuse University sports. The radio affiliates broadcast football, as well as men's and women's basketball and men's lacrosse games. Time Warner Cable Sports broadcasts the coaches' show and a weekly program titled \"Syracuse Sidelines\".\nQuestion: Syracuse IMG Sports Network can be listened to on the radio as well as watched on television True, False, or Neither? True\n###\nVirginia's Eleventh Congressional District is a U.S. congressional district in the Commonwealth of Virginia. The district stretches from Herndon to Quantico, comprising most of Fairfax County, all of the city of Fairfax, and part of eastern Prince William County. The residents of the 11th district are represented by Democrat Gerry Connolly.\nQuestion: Virginia's Eleventh Congressional District is led by Gerry Connolly True, False, or Neither? True\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\"\nQuestion: The north end extension was going to be called \"Flatbush Avenue Extension,\" Pt. 2, but wasn't. 
True, False, or Neither?", "doc_id": 159, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7991, 3064, 26785, 23722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added.\nQuestion: Stereolab created many albumns True, False, or Neither? Neither\n###\n\"Show Me Love\" is a song by German DJ and record producer Robin Schulz and British singer J.U.D.G.E. The song was released as a digital download in Germany on 13 November 2015 as the third single from his second studio album \"Sugar\" (2015). The song was written by Dennis Bierbrodt, J\u00fcrgen Dohr, Guido Kramer, Robin Schulz and Richard Judge.\nQuestion: The only song Dennis wrote was Show Me Love True, False, or Neither? Neither\n###\nThe 1980 British Grand Prix (formally the XXXIII Marlboro British Grand Prix) was a Formula One motor race held at Brands Hatch on 13 July 1980. It was the eighth round of the 1980 Formula One season. The race was held over 76 laps of the 4.207-km (2.614-mile) circuit for a total race distance of 319.73 km (198.67 miles).\nQuestion: The total race distance of 319.73 km was the longest race of the season. True, False, or Neither? Neither\n###\nThis is a list of Japanese idols; a type of celebrity in Japan. The word \"idol\" is almost always used to refer to a young woman, although there a significant number of male idols. The following list includes both female and male idols as well as both solo idols and idol groups.\nQuestion: Female idols earn more money than male idols in Japan. True, False, or Neither? Neither\n###\nD\u00fcrnstein is a small town on the Danube river in the Krems-Land district, in the Austrian state of Lower Austria. It is one of the most-visited tourist destinations in the Wachau region and also a well-known wine growing area. The municipality consists of the Katastralgemeinden \"D\u00fcrnstein, Oberloiben\" and \"Unterloiben\".\nQuestion: D\u00fcrnstein is well known for its wineries True, False, or Neither?", "doc_id": 567, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31251, 37881, 43097, 1027], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "TOTO is a legalized form of lottery sold in Singapore, known by different names elsewhere. Singapore Pools is the only legal lottery operator in Singapore. 
It was established on 23 May 1968 to control widespread illegal gambling in Singapore during the 1960s.\nQuestion: TOTO is known by different names in other countries True, False, or Neither? True\n###\nWilliston Municipal Airport (FAA LID: X60) is a city-owned, public-use airport located two\u00a0nautical miles (4\u00a0km) southwest of the central business district of Williston, a city in Levy County, Florida, United States. Commonly referred to as Williston Airport, it is located 23 mi southwest of Gainesville Regional Airport (GNV). Opened in 1974 for public use, it does not have a control tower.\nQuestion: Levy County, Florida doesn't have enough airports. True, False, or Neither? Neither\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region.\nQuestion: The frequency 594 kHz AM covers all of Victoria, Australia. True, False, or Neither? True\n###\nBaya M. Harrison, Jr. (1912 in Tampa, Florida \u2013 1975) was a politician and an attorney in Florida. He served as Chairman of the Florida Board of Control from 1960\u20131964. Harrison greatly impacted the State University System of Florida and helped desegregate Florida colleges and universities. He served as President of the Florida Bar in 1957.\nQuestion: Baya M. Harrison, Jr. held a position during his lifetime on a Board. True, False, or Neither? True\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport.\nQuestion: Gulf Air features only international flights. True, False, or Neither?", "doc_id": 462, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10304, 11234, 42734, 2962], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arthur John Duckworth (born 19 January 1949) is a former Australian rules footballer who played for Fitzroy in the Victorian Football League (VFL), West Perth in the West Australian National Football League (WANFL), and Central District in the South Australian National Football League (SANFL). He is the older brother of former Essendon footballer Billy Duckworth.\nQuestion: Arthur John Duckworth is more than 60 years old True, False, or Neither? True\n###\nYissachar Dov Rokeach (born 19 January 1948) is the fifth and present Rebbe of the Hasidic dynasty of Belz. He is the son of Rabbi Mordechai of Bilgoray (1902 \u2013 1949), the grandson of the third Belzer Rebbe, Rabbi Yissachar Dov Rokeach, and the nephew of the fourth Belzer Rebbe, Rabbi Aharon Rokeach, who raised him. 
He has led Belz since 1966.\nQuestion: Yissachar Dov Rokeach is 71 years old. True, False, or Neither? True\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums).\nQuestion: The Asteroids Galaxy Tour could only perform live with a five-piece True, False, or Neither? True\n###\nIn the middle of 1984 a Brazilian company called Prol\u00f3gica, which made its own versions of 8 bits US computers, brought to the Brazilian market a new equipment for its personal computer series called \"CP\" (shorten of Personal Computer in Portuguese).\nQuestion: Prologica must also make its own cell phones if it makes its own computers True, False, or Neither? Neither\n###\nThe 2007 Hertsmere Borough Council election took place on 3 May 2007 to elect members of Hertsmere Borough Council in Hertfordshire, England. One third of the council was up for election and the Conservative party stayed in overall control of the council.\nQuestion: The Hertsmere Borough Council held an election in 2007 in which the liberal members were in the minority. True, False, or Neither?", "doc_id": 165, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32357, 23912, 20952, 14686], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Southpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company.\nQuestion: When it was released in 2015, the movie Southpaw was expected to be the top grossing movie in the box office. True, False, or Neither? Neither\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television.\nQuestion: Sebastian Philip Bierk was born before April 4, 1968. True, False, or Neither? True\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison.\nQuestion: Alix Bancourt is a blogger who writes in french but it is translated to English on her online blog for people who can't speak french True, False, or Neither? 
True\n###\nVersailles is a television series, set during the construction of the Palace of Versailles during the reign of Louis XIV, that premiered on 16 November 2015 on Canal+ in France and on Super Channel in Canada, in May 2016 on BBC2 in Britain, and on 1 October 2016 on Ovation in the U.S.\nQuestion: Versailles has multiple episodes. True, False, or Neither? True\n###\nThe Alfa Romeo Brera and the Alfa Romeo Spider (Type 939) are two sports cars manufactured by Alfa Romeo respectively between 2005-2010 and 2006-2010. The Brera is a 2+2 coup\u00e9, while the Spider is its roadster version. Both models were built by Pininfarina.\nQuestion: The Alfa Romeo Brera and the Alfa Romeo Spider were manufactured for the same duration of time True, False, or Neither?", "doc_id": 475, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7859, 13927, 7261, 38510], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amethyst: Princess of Gemworld is a comic book series published by DC Comics in the 1980s. The series tells the story of a teenage girl named Amy Winston who discovers that she is the orphaned princess of the magical Gemworld. Amy learns that an evil ruler called Dark Opal is out to destroy her and travels to Gemworld to overthrow him.\nQuestion: In the Amethyst: Princess of Gemworld comic book series, Amy Winston succeeds in defeating Dark Opal True, False, or Neither? Neither\n###\nThe Mannlicher\u2013Sch\u00f6nauer (sometimes Anglicized as \"Mannlicher Schoenauer,\" Hellenized as \u03a4\u03c5\u03c6\u03ad\u03ba\u03b9\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1 or \u038c\u03c0\u03bb\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1-\u03a3\u03b5\u03bd\u03ac\u03bf\u03c5\u03b5\u03c1) is a type of rotary-magazine bolt-action rifle produced by Steyr Mannlicher for the Greek Army in 1903 and later was also used in small numbers by the Austro-Hungarian armies. Post war use was for civilian use such as hunting and target practice.\nQuestion: The Mannlicher\u2013Sch\u00f6nauer killed the least amount of people. True, False, or Neither? Neither\n###\nCross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart.\nQuestion: The group Cross gene has more than one member, but only one of the members is Japanese. True, False, or Neither? True\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015.\nQuestion: The state was created in 2001. True, False, or Neither? 
False\n###\nThe Lei \u00c1urea (] ; English: Golden Law ), adopted on May 13, 1888, was the law that abolished slavery in Brazil. It was signed by Isabel, Princess Imperial of Brazil (1846\u20131921), an opponent of slavery, who acted as regent to Emperor Dom Pedro II, who was in Europe.\nQuestion: The Lei Aurea law which was adopted on May 13, 1888 abolished slavery. It was opposed by Princess Imperial. True, False, or Neither?", "doc_id": 191, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18337, 16473, 39983, 20426], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th).\nQuestion: Alice Sue Claeys enjoys skiing True, False, or Neither? Neither\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex.\nQuestion: Sangre de Grado is hard to find. True, False, or Neither? Neither\n###\nTotal Film is a UK-based film magazine published 13 times a year (published monthly and a summer issue is added every year since issue 91, 2004 which is published between July and August issue) by Future Publishing. The magazine was launched in 1997 and offers cinema, DVD and Blu-ray news, reviews and features. \"Total Film\" is available both in print and interactive iPad editions.\nQuestion: Total Film launched before the millennium happened. True, False, or Neither? True\n###\nMaps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\".\nQuestion: All the songs of the album \"Maps & Companions\" were recorded on one weekend True, False, or Neither? Neither\n###\nAn Evening With Groucho is the title of a 1972 recording at New York City's Carnegie Hall of the last one-man show by American comedian Groucho Marx. Introduced by Dick Cavett, the show was released as a double album by A&M Records. Marx shared family stories and performed songs from Marx Brothers movies. 
Marvin Hamlisch accompanied Groucho on the piano.\nQuestion: An Evening with Groucho was recorded less than half a century ago True, False, or Neither?", "doc_id": 51, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23112, 44834, 25118, 20123], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League.\nQuestion: He won a cricket championship True, False, or Neither? Neither\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: Lanshan district is in Linyi province. True, False, or Neither? False\n###\nAna B\u00e1rbara is the debut album by Mexican singer Ana B\u00e1rbara, released in 1994. She was nominated for a Premio Lo Nuestro Award in two Regional Mexican categories, including Best New Artist. She won her first Premio Furia Musical Award for Best New Artist.\nQuestion: Ana B\u00e1rbara was recorded in Mexico True, False, or Neither? Neither\n###\nVia Dante is an important and elegant pedestrian street in central Milan, Italy, connecting Piazzale Cordusio (Cordusio (Milan Metro)) and Largo Cairoli (Cairoli (Milan Metro)). It is very near to the city's Castello Sforzesco and is named after the Florentine poet Dante Alighieri. It is known for containing several theatres, shops, restaurants, caf\u00e9s, palaces and bars.\nQuestion: Via Dante is a theater in Milan. True, False, or Neither? False\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: People named Susan typically have blue eyes. True, False, or Neither?", "doc_id": 39, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36505, 17516, 18772, 38858], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "102 Squadron \"\"Panchos\"\" (\"Esquadra 102\") was an elementary flight training squadron of the Portuguese Air Force disbanded in 1992. 
Formed in 1962, the squadron administered air force training and performed at air shows throughout Portugal. Between 1963 and its disbandment in 1992, the squadron lost nine pilots.\nQuestion: 102 Squadron was formed in 1962 True, False, or Neither? True\n###\nThe 1955 NCAA Skiing Championships were contested in Northfield, Vermont at the second annual NCAA-sanctioned ski tournament to determine the individual and team national champions of men's collegiate alpine, cross country skiing, and ski jumping in the United States.\nQuestion: The results of the 1955 NCAA Skiing Championships were contested. True, False, or Neither? Neither\n###\nZale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990).\nQuestion: Zale Dalen is a film director. He is not proud of his film the hounds of Notre Dame True, False, or Neither? Neither\n###\nStannis Baratheon is a fictional character in the \"A Song of Ice and Fire\" series of epic fantasy novels by American author George R. R. Martin, and its television adaptation \"Game of Thrones\". He is the second son of Steffon Baratheon, the lord of Storm's End, and his wife Lady Cassana Estermont, and brother to Robert and Renly.\nQuestion: Stannis Baratheon is in \"Game of Thrones\" and \"A Song of Ice and Fire\". True, False, or Neither? True\n###\nAatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi.\nQuestion: Dilip Shankar was directly influenced by American culture. True, False, or Neither?", "doc_id": 221, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7296, 18185, 3601, 29339], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds.\nQuestion: The Phu Quoc Ridgeback only exists in Vietnam True, False, or Neither? Neither\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. 
In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign.\nQuestion: The 23rd Infantry Brigade was the only infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II True, False, or Neither? Neither\n###\nDon Sinclair Davis, PhD (August 4, 1942 \u2013 June 29, 2008) was an American character actor best-known for playing General Hammond in the television series \"Stargate SG-1\" (1997\u20132007), and earlier for playing Major Garland Briggs on the television series \"Twin Peaks\" (1990\u20131991). He was also a theater professor, painter, and United States Army captain.\nQuestion: Don Sinclair Davis, Phd was an United States Army captain. True, False, or Neither? True\n###\nThe Cable Guy is a 1996 American comedy film directed by Ben Stiller, starring Jim Carrey and Matthew Broderick. It was released in the United States on June 14, 1996. The film co-stars Leslie Mann, Jack Black, George Segal, Diane Baker, Eric Roberts, Owen Wilson, Janeane Garofalo, David Cross, Andy Dick, Amy Stiller, and Bob Odenkirk.\nQuestion: The Cable Guy is a 1000 + 997 American comedy film directed by Ben Stiller True, False, or Neither? False\n###\nFlorence MacLeod Harper was a Canadian journalist sent by U.S. newspaper \"Frank Leslie's Illustrated Newspaper\" as a staff reporter with an assignment to cover World War I on the Eastern front. She was an early female war correspondent and one of a handful of western journalists to leave a first-hand journalistic account of the early stages of the Russian revolution.\nQuestion: Harper's account of the early stages of the Russian revolution is the only account of that revolution we have. True, False, or Neither?", "doc_id": 788, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17748, 16310, 14854, 24479], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "[id] is the third studio album by deathcore band Veil of Maya. It was released through Sumerian Records on April 6, 2010. They worked with producer Michael Keene of death metal band The Faceless on this album. Keene previously worked with the band, producing their previous album \"The Common Man's Collapse\". It is the band's only album to feature bassist Matthew C. Pantelis.\nQuestion: Pantelis is a musician. True, False, or Neither? True\n###\nJonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films.\nQuestion: Jonathan Michael Lovitz is seen by millions. True, False, or Neither? Neither\n###\nLes Soir\u00e9es de Nazelles, FP 84, is a set of variations for piano written by the French composer Francis Poulenc. During the evenings, the composer used to sit at the piano and improvise \"portraits\" of his friends, all based on a given theme. The work was begun in 1930, and completed at Noizay on October 1, 1936. 
At the beginning of the score, it reads:\nQuestion: Poulenc began the work in January 1930. True, False, or Neither? Neither\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold.\nQuestion: Viktor was the first person to discover Tillya tepe. True, False, or Neither? Neither\n###\nIrfan Khoosat (Urdu: \u0639\u0631\u0641\u0627\u0646 \u06a9\u06be\u0648\u0633\u0679\u200e ) is a Pakistani actor, producer and a well-known comedian. He is famous for his comic role as \"Hawaldar Karamdad\" in the TV series Andhera Ujala in which he portrayed simpleton and blabbermouth character of a low-ranked policeman. He is also known as stage comedian. He also won Nigar Award for his comic role in 1985 film \"Hum se hai zamana\".\nQuestion: The film Hum se hai zamana was released more than 3000 days ago. True, False, or Neither?", "doc_id": 937, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41110, 25817, 9235, 4606], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Khan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht.\nQuestion: Khan Kluay was a three-dimensional animated movie from Thailand. True, False, or Neither? True\n###\nThomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330.\nQuestion: Thomas Cooper was the best England international footballer. True, False, or Neither? Neither\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: The film was released in the summer True, False, or Neither? True\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944.\nQuestion: Morley College Choir was founded at Yale True, False, or Neither? 
False\n###\n\"Break the World\" is the lead single by alternative rock band Nine Lashes from their third album, \"From Water to War\". It was released on October 29, 2013 by Tooth & Nail Records. The song was the No. 1 \"Billboard\" Christian Rock song on January 25, 2014 chart.\nQuestion: Nine Lashes only released two albums as a group. True, False, or Neither?", "doc_id": 9, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41344, 44603, 29123, 15218], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maria Ho (born March 6, 1983 in Taipei, Taiwan) is a Taiwanese American poker player, television personality and host. She is known as one of the top ranked female poker players in the world; a 3-time Bluff Reader's Choice Awards nominee for Favorite Female Poker Player and a World Series of Poker record-breaker, and for competing on the 15th season of \"The Amazing Race\".\nQuestion: Maria Ho can read faces. True, False, or Neither? Neither\n###\nJade is a 1995 American erotic thriller film written by Joe Eszterhas, produced by Robert Evans, directed by William Friedkin and starring David Caruso, Linda Fiorentino, Chazz Palminteri, Richard Crenna and Michael Biehn. The original music score was composed by James Horner based on a song composed by Loreena McKennitt. The film was marketed with the tagline \"Some fantasies go too far.\"\nQuestion: The fantasy genre saw the production of Jade, a 1995 American film written by Joe Eszterhas, with a score marketed with the tagline \"too far\". True, False, or Neither? False\n###\n\"It's the Little Things\" is a 1967 single by Sonny James. \"It's the Little Things\" was Sonny James' twenty-fifth release on the country chart, the song went to number one on the country chart for five weeks and spent a total of fourteen weeks on the charts.\nQuestion: Sonny James is a rap artist True, False, or Neither? False\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent.\nQuestion: Doomsday Device is a good term. True, False, or Neither? Neither\n###\nThe Icelandic national under-18 basketball team is the representative for Iceland in international Under-18 age basketball competitions, and it is organized and run by the Icelandic Basketball Federation. The team represents Iceland at the FIBA Europe Under-18 Championship. It is coached by Fri\u00f0rik Ingi R\u00fanarsson.\nQuestion: The Icelandic national under-18 basketball team was unsuccessful True, False, or Neither?", "doc_id": 215, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23563, 14890, 33218, 28495], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California.\nQuestion: Michelle Do has talked to Bush. True, False, or Neither? Neither\n###\nSaid bin Salim Al Shaksy (Arabic: \u0633\u0639\u064a\u062f \u0628\u0646 \u0633\u0627\u0644\u0645 \u0627\u0644\u0634\u0642\u0635\u064a) (born Zanzibar in 1934 - 2015) was the founder and chairman of The Shaksy Group. Al Shaksy has been a member and Managing Director of several Joint-Stock Companies, including Al Bank Al Ahli Al Omani SAOG, Oman Fisheries Co. SAOG and Oman Hotels Co. SAOG.\nQuestion: Said bin Salim Al Shaksy spoke arabic True, False, or Neither? True\n###\nALGOL 68 (short for ALGOrithmic Language 1968) is an imperative computer programming language that was conceived as a successor to the ALGOL 60 programming language, designed with the goal of a much wider scope of application and more rigorously defined syntax and semantics.\nQuestion: The language was very hard to use True, False, or Neither? Neither\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers.\nQuestion: Global Crossing and Hurricane Electric merged. True, False, or Neither? Neither\n###\nFlorence MacLeod Harper was a Canadian journalist sent by U.S. newspaper \"Frank Leslie's Illustrated Newspaper\" as a staff reporter with an assignment to cover World War I on the Eastern front. She was an early female war correspondent and one of a handful of western journalists to leave a first-hand journalistic account of the early stages of the Russian revolution.\nQuestion: Harper left Russia before the Bolshevik Revolution. True, False, or Neither?", "doc_id": 45, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4482, 20139, 2409, 37820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "West Town Mall is an upscale shopping mall located in Knoxville, Tennessee, United States. Opened in August 1972, this one-level mall is located in the western portion of Knoxville in the West Hills community. West Town Mall is located along Interstates 40/75 and Kingston Pike. The mall has over 1300000 sqft of Gross leasable area, making it the largest of any enclosed shopping mall in Tennessee.\nQuestion: West Town mall is located in Tennessee True, False, or Neither? True\n###\nNo. 
27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville.\nQuestion: No. 27 Squadron RAAF is a Royal Australian Air Force reserve formed over 9 years ago True, False, or Neither? True\n###\nThe Doberman Gang is a 1972 film about a talented animal trainer who uses a pack of Dobermans to commit a bank robbery. The six dogs were all named after famous bank robbers. Their names were Dillinger (John Dillinger), Bonnie (Bonnie Parker), Clyde (Clyde Barrow), Pretty Boy Floyd, Baby Face Nelson, and Ma Barker.\nQuestion: None of the dogs in The Doberman Gang had a name that included the last name of the bank robber they were named after. True, False, or Neither? False\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries.\nQuestion: People really like nice things True, False, or Neither? Neither\n###\nCeres\u2013Negros Football Club, commonly referred to as Ceres\u2013Negros or just Ceres, is a Filipino football club based in the city of Bacolod, Negros Occidental that plays in the Philippines Football League. The club is a member of the Negros Occidental Football Association. It was previously known as the Ceres\u2013La Salle Football Club.\nQuestion: Ceres\u2013Negros Football Club had Trump. True, False, or Neither?", "doc_id": 617, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3567, 14397, 9784, 40107], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Flamingo is the debut solo studio album by American singer-songwriter and The Killers lead singer Brandon Flowers, released on September 3, 2010 by Island Records. It was recorded at Battle Born Studios in Winchester, Nevada, and Henson Recording Studios in Hollywood, California. The album debuted at number one on the UK Albums Chart.\nQuestion: Flamingo was worked on in the states of Nevada and California before it dropped in September of 2010. True, False, or Neither? True\n###\nLament is the seventh studio album by British new wave band Ultravox, released in the UK on 6 April 1984. It was the last album featuring original drummer Warren Cann until the band's reunion album \"Brilliant\" in 2012. The album peaked at #8 on the UK album chart and was certified Gold by the BPI in June 1984 for 100,000 copies sold. It also reached #25 in Germany and #115 in the United States.\nQuestion: Ultravox had many top 10 albums True, False, or Neither? Neither\n###\nA symphonic song cycle can either refer to a symphony composed of separate movements played consecutively or to a set of symphonic works linked by theme, common composer, or common conductor. 
A symphonic cycle should not be confused with the closely related song cycle.\nQuestion: Symphonic songs are very loud. True, False, or Neither? Neither\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006.\nQuestion: It was not starring Tim Van Patten. True, False, or Neither? Neither\n###\nTime of Your Life is an American television drama series starring Jennifer Love Hewitt that aired for one season on Fox. A spin-off of \"Party of Five\", the series followed Sarah Reeves Merrin as she moved to New York City to learn more about her biological parents. Co-stars included Jennifer Garner, Pauley Perrette and Gina Ravera.\nQuestion: Time of Your Life is an American television drama series starring jlo True, False, or Neither?", "doc_id": 152, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34971, 4631, 36231, 12929], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams.\nQuestion: The Gaming Control Act was highly contested before passing into law. True, False, or Neither? Neither\n###\nThe third season of \"Next Great Baker\" aired from November 26, 2012 to February 11, 2013. Like the previous season, this season was set at the Carlo's Bake Shop facility at Lackawanna Center in Jersey City, New Jersey. Unlike the previous two seasons, the finale for this season took place outside of the greater New York City area \u2013 in this case, in Las Vegas, Nevada at The Venetian Las Vegas.\nQuestion: Producers said US was the best location for a finale True, False, or Neither? Neither\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Tinker Field no longer exists True, False, or Neither? True\n###\nCaddyshack is a 1980 American comedy film directed by Harold Ramis and written by Brian Doyle-Murray, Ramis and Douglas Kenney. It stars Michael O'Keefe, Chevy Chase, Rodney Dangerfield, Ted Knight, and Bill Murray. Doyle-Murray also has a supporting role. The film was later dedicated to producer Douglas Kenney, who died shortly after the film's release.\nQuestion: Caddyshak a film that lost one of it's producers was dedicated to him. True, False, or Neither? 
True\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points.\nQuestion: Lincoln, Nebraska's Memorial Stadium is located on campus at the University of Nebraska- Lincoln. True, False, or Neither?", "doc_id": 83, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32058, 43861, 25647, 16950], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia.\nQuestion: Tasmania holds numerous different species of plants and animals, one specifically being the basil. True, False, or Neither? Neither\n###\nThe 198th Infantry Brigade, was first formed as part of the United States Army Reserve's 99th Division. It was active from 1967 through 1971 and has been active since 2007 as an Infantry Training Brigade as part of the US Army Infantry School at Fort Benning, Georgia.\nQuestion: The Brigade was inactive between 1972-2006. True, False, or Neither? True\n###\nThe Never-Before-Released Masters is 1987 compilation album containing unreleased recordings recorded by Motown girl-group The Supremes from 1961 to 1969. It was the second CD release of unreleased recordings by The Supremes, the first being disc two of the 2 disc \"25th Anniversary\" compilation. Several other unreleased tracks appeared on earlier various artists compilations.\nQuestion: The Supremes was Kobe Bryant's favorite group to train to. True, False, or Neither? Neither\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex.\nQuestion: Croton Lechleri is also known as dragon's breath. True, False, or Neither? False\n###\nAlbert Ernest Clifford \"Cliff\" Young, OAM (8 February 19222 November 2003) was an Australian potato farmer and athlete from Beech Forest, Victoria, best known for his unexpected win of the inaugural Sydney to Melbourne Ultramarathon in 1983 at 61 years of age.\nQuestion: Albert Ernest Clifford \"Cliff\" Young died ten years after 2000. True, False, or Neither?", "doc_id": 518, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7637, 2048, 8558, 18884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Good Night is a 2007 romantic comedy film written and directed by Jake Paltrow. The film stars his sister Gwyneth Paltrow, Pen\u00e9lope Cruz, Martin Freeman, Danny DeVito, Simon Pegg and others. The movie takes place in London and New York City, where a former pop star (Freeman) who now writes commercial jingles for a living experiences a mid-life crisis.\nQuestion: The Good Night was directed by the brother of Gwyneth Paltrow True, False, or Neither? True\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart.\nQuestion: Migos also released a single in the year 2013. True, False, or Neither? Neither\n###\n\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982).\nQuestion: \"King of the Jungle\" was a popular song in japan True, False, or Neither? Neither\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: Ashcroft House is made with concrete True, False, or Neither? False\n###\n\"May the Bird of Paradise Fly Up Your Nose\" is a 1965 novelty song performed by Little Jimmy Dickens. It was Dickens' most successful single on the U.S. country music chart. It spent two weeks at No. 1 that November, and stayed on the chart for a total of 18 weeks. On the overall \"Billboard\" Hot 100 the song peaked at No. 15.\nQuestion: \"May the Bird of Paradise Fly Up Your Nose\" was not on the charts for 6 months True, False, or Neither?", "doc_id": 889, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2129, 5543, 31100, 32292], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Florence MacLeod Harper was a Canadian journalist sent by U.S. newspaper \"Frank Leslie's Illustrated Newspaper\" as a staff reporter with an assignment to cover World War I on the Eastern front. 
She was an early female war correspondent and one of a handful of western journalists to leave a first-hand journalistic account of the early stages of the Russian revolution.\nQuestion: Florence MacLeod Harper stayed with the same company for 25 years True, False, or Neither? Neither\n###\nThe Ferry County Carousel is an operational wooden carousel located three miles (5\u00a0km) east of the town of Republic, Washington on the Ferry County Fairgrounds. This carousel features 24 horses in two rows and is fitted with a jumping mechanism. The Ferry County Carousel is one of only seven classic wooden carousels in Washington state and possibly the oldest.\nQuestion: There are more metal carousels than wooden in Washington state. True, False, or Neither? Neither\n###\nThe Trexler Nature Preserve is an 1,108 acre county park owned and maintained by Lehigh County, Commonwealth of Pennsylvania. The preserve is situated in Lowhill Township and North Whitehall Township and the land that comprises the preserve was originally purchased between 1901 and 1911 by local industrialist General Harry Clay Trexler.\nQuestion: The Trexler Nature Preserve will be turned into a Walmart in 2050 True, False, or Neither? Neither\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98.\nQuestion: Death Race is pie True, False, or Neither? False\n###\nJohn (Johnnie) White (died 2007) was a high-ranking staff officer of the Official Irish Republican Army (Official IRA) in Derry, Northern Ireland and later Adjutant General of the Irish National Liberation Army (INLA). He was a key figure in Derry in the early years of the Troubles, and played a prominent role in the events surrounding the creation and defence of Free Derry.\nQuestion: John White is serving time for his involvment with the IRA. True, False, or Neither?", "doc_id": 17, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3371, 29110, 22537, 2522], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In the middle of 1984 a Brazilian company called Prol\u00f3gica, which made its own versions of 8 bits US computers, brought to the Brazilian market a new equipment for its personal computer series called \"CP\" (shorten of Personal Computer in Portuguese).\nQuestion: Brazilian company Prol\u00f3gica is based out of Rio de Janeiro. True, False, or Neither? Neither\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Nathan MacKinnon plays the position of forward. True, False, or Neither? True\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. 
As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The police arrested the woman with the knife. True, False, or Neither? False\n###\nThe Corridor (Lithuanian: Koridorius ) is a 1995 Lithuanian drama film directed by \u0160ar\u016bnas Bartas. It has a fragmentary narrative without dialogue and depicts several people in Vilnius. According to the director, the title symbolizes \"the atmosphere of a corridor between yesterday and today, containing many doors\".\nQuestion: The Corridor portrays the everyday people in Vilnius True, False, or Neither? Neither\n###\nRoc-A-Fella Records Presents Teairra Mar\u00ed is the debut album by recording artist Teairra Mar\u00ed. It was released on August 2, 2005, by Roc-A-Fella Records. The album debuted in the top five selling 69,000 copies in the first week, eventually selling 248,000 units.\nQuestion: The album sold 69,000 copies per week True, False, or Neither?", "doc_id": 597, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21127, 11229, 9605, 38344], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011.\nQuestion: Hipmunk media attention was still quite low despite its increase after Google's announcement. True, False, or Neither? Neither\n###\nColdwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep.\nQuestion: Coldwater fish refers to species that prefer to live in waters cooler than 20*C when in the context of aquariums. True, False, or Neither? True\n###\nColdwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep.\nQuestion: Tropical fish prefer cooler water temperatures, typically below 20 degrees Celsius. True, False, or Neither? False\n###\nArt of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley.\nQuestion: Art of Dying is fronted by Hetherington. True, False, or Neither? 
True\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008.\nQuestion: Peter Franco won his first Grammy is 2014. True, False, or Neither?", "doc_id": 201, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25166, 22195, 3445, 2525], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts.\nQuestion: David Lee Murphy toured around the US True, False, or Neither? Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: The war caused significant damage. True, False, or Neither? Neither\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C.\nQuestion: Luton Town Ladies Football Club was founded in 1998. True, False, or Neither? False\n###\nManiac (stylized as MANIAC) is an American short slasher film, directed by Shia LaBeouf. It was released on October 31, 2011. The short film stars American rappers Scott \"Kid Cudi\" Mecudi and Chris \"Cage\" Palko, as French-speaking serial killers. Mescudi and Palko also co-wrote the film with LaBeouf.\nQuestion: 57% of the patrons seeing the film Maniac in the movie theater prefer their popcorn with extra butter. True, False, or Neither? Neither\n###\n\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012.\nQuestion: Adrian Galvin was born in nineteen hundred eighty three. 
True, False, or Neither?", "doc_id": 280, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28094, 8434, 19948, 7699], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: At some point the book changed titles True, False, or Neither? True\n###\nReckless is the third book in the The It Girl novels by the German American author Cecily von Ziegesar. The series is ghostwritten from the original idea by Ziegesar. The series, aimed toward young adults, is a spin-off from the bestselling \"Gossip Girl\" series. It was released in 2006 by Little, Brown.\nQuestion: Reckless was geared towards younger adult audiences True, False, or Neither? True\n###\nBlack Dahlia is a 2006 United States production horror film inspired by the mysterious unsolved murder of the\"Black Dahlia\", Hollywood actress Elizabeth Short. Instead of dramatizing the infamous 1947 murder of Short and the ensuing investigation, writer-director Ulli Lommel follows a series of contemporary L.A.-area homicides patterned after the 1947 slaying.\nQuestion: Elizabeth Short was an actress. True, False, or Neither? True\n###\nUni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats.\nQuestion: Uni\u00f3n Deportiva Vall de Ux\u00f3 is a popular team that always fills the stadium when they play True, False, or Neither? Neither\n###\nForever Lost is the second studio album by Norwegian recording artist A-Lee, released on October 5, 2012 in Norway, on EE Records and Columbia/Sony Music Norway. A-Lee worked with producers Ground Rules, Martin K, Bernt Rune Stray, BPM, Thomas Eriksen, Slipmats and The Products. The original album track list features Aleksander With, Elisabeth Carew and Marcus Only.\nQuestion: Forever Lost was not seen in South America True, False, or Neither?", "doc_id": 941, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15329, 9542, 21843, 44903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Castle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. 
It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel.\nQuestion: Castle Wolfenstein was developed for the Macintosh. True, False, or Neither? False\n###\nHeck's Department Store, a chain of West Virginia based discount department stores, was founded by Boone County natives and businessmen Fred Haddad, Tom Ellis, and Lester Ellis and wholesale distributor Douglas Cook. The Heck's name was a combination of the names Haddad, Ellis and Cook. Haddad served as President, Lester Ellis was Vice-President, and Tom Ellis was Secretary-Treasurer.\nQuestion: Heck's department stores are a a chain of West Virginia based discount grocery stores. True, False, or Neither? False\n###\nBoneyard Beach is a 1995 album by Raleigh, North Carolina band Dish, led by singer and pianist Dana Kletter, on Interscope Records. The album was produced by John Agnello at Ardent Studios in Memphis, Tennessee. Interscope's VP, Tom Whalley, told \"Billboard\" magazine that \"the high quality of songwriting in Dish and the sound of Dana's voice are two things that set this band apart.\"\nQuestion: Dish released an album in 1995 True, False, or Neither? True\n###\nThere have been four head coaches of the Houston Texans, a professional American football team based in Houston, Texas, United States. The Texans play in the South Division of the American Football Conference (AFC) in the National Football League (NFL).\nQuestion: The Houston Texans start with a C. True, False, or Neither? False\n###\nDean Young (born 1955) is a contemporary American poet in the poetic lineage of John Ashbery, Frank O'Hara, and Kenneth Koch. Often cited as a second-generation New York School poet, Young also derives influence and inspiration from the work of Andr\u00e9 Breton, Paul \u00c9luard, and the other French Surrealist poets.\nQuestion: Dean Young was a rapist True, False, or Neither?", "doc_id": 396, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [45369, 5856, 8613, 32096], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Heinrich Karl Ludwig Herault de Seigneur de Hautcharmoy (1689 in Wesel \u2013 11 May 1757 in Margaret monastery at Prague) was a Prussian Lieutenant-General, Knight of the Black Eagle and commander of Brieg. His family was originally from Kingdom of France, and his father served as subordinate to Friedrich von Schomberg, and was killed with him at the Battle of the Boyne.\nQuestion: He outlived his father. True, False, or Neither? False\n###\nRafael Cede\u00f1o Hern\u00e1ndez is an imprisoned Mexican drug trafficker who was a high-level leader of La Familia Michoacana, a drug cartel based in the Mexican state of Michoac\u00e1n. He was the successor of Alberto Espinoza Barr\u00f3n, a drug trafficker who was arrested on 31 December 2008 by the Mexican authorities.\nQuestion: Alberto Espinoza Barr\u00f3n was arrested last year. True, False, or Neither? 
False\n###\nThe Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds.\nQuestion: The Phu Quoc Ridgeback is the most popular breed of ridgeback dogs True, False, or Neither? Neither\n###\nRoy Denzil Hibbert (born December 11, 1986) is a Jamaican-American professional basketball player who last played for the Denver Nuggets of the National Basketball Association (NBA). He is a two-time NBA All-Star, and earned NBA All-Defensive Second Team honors in 2014.\nQuestion: Roy will continue to play on his current team until he retires. True, False, or Neither? Neither\n###\nThe Governor Nobre de Carvalho Bridge also known as the Macau-Taipa Bridge, is a dual-lane two-way bridge connecting Macau Peninsula near Casino Lisboa and the island of Taipa at the northern slope of Taipa Pequena (Small Taipa Hill) crossing the Ba\u00eda da Praia Grande. It is the first bridge in Macau, to connect the peninsula and Taipa. It is locally known as \"The Old Bridge\" ().\nQuestion: There is a bridge in Macau called \"The New Bridge.\" True, False, or Neither?", "doc_id": 213, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35089, 20959, 41011, 42435], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sea Wall (French: Un barrage contre le Pacifique ) is a 2008 film by Cambodian director Rithy Panh in a French/Cambodian/Belgian co-production. The film opened on 7 January 2009 in France. It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. The novel had previously been adapted as \"This Angry Age\" by Ren\u00e9 Cl\u00e9ment in 1958.\nQuestion: It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. 50s movies leave something to be desired. True, False, or Neither? Neither\n###\nPaul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas.\nQuestion: Paul Albert Raymond Barlatier de Mas was born in october True, False, or Neither? True\n###\nGame engine recreations are remade engine interpreters for video games that replace the original engine binary that came with the original game. A notable example of game engine recreation is ScummVM which successfully recreated the SCUMM engine of classical LucasArts' point and click adventures. For further examples, refer to the list of game engine recreations.\nQuestion: Game engine recreations are not video games. True, False, or Neither? False\n###\nThe San Pablo Reservoir is an open cut terminal water storage reservoir owned and operated by the East Bay Municipal Utility District (EBMUD). 
It is located in the valley of San Pablo Creek, north of Orinda, California and south of El Sobrante and Richmond, east of the Berkeley Hills between San Pablo Ridge and Sobrante Ridge.\nQuestion: San Pablo Reservoir is underneath Richmond on a map. True, False, or Neither? False\n###\nPrince Karl Alfred of Liechtenstein (Karl Alfred Maria Johannes Baptista Heinrich Aloys Georg Hartmann Ignatius; 16 August 1910 \u2013 17 November 1985) was a Liechtensteiner prince and brother of Franz Joseph II. He was the third child and second son of Prince Aloys of Liechtenstein and Archduchess Elisabeth Amalie of Austria.\nQuestion: Karl Alfred of Liechtenstein was a prince from Liechtenstein whose birth was trending all over social media from the moment he was born True, False, or Neither?", "doc_id": 660, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10840, 42733, 41356, 34543], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Chatot (also Chacato or Chactoo) were a Native American tribe who lived in the upper Apalachicola River and Chipola River basins in what is now Florida. They spoke a Muskogean language, which may have been the same as that of the Pensacola people.\nQuestion: The Chacato tribe once lived in a river basin in Florida. True, False, or Neither? True\n###\nThe Sydney/Melbourne Express was an overnight intercapital passenger train service that operated between the Australia's largest two cities, Sydney and Melbourne, between August 1986 and November 1993. Operated jointly by State Rail Authority and V/Line the name depended on the direction of travel, with the train nicknamed the 'Sex' or 'Mex'.\nQuestion: The rail line crossed the ocean. True, False, or Neither? Neither\n###\nMaps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\".\nQuestion: The album \"Companions\" was re-titled \"Maps & Companions.\" True, False, or Neither? False\n###\n\"Pour Me\" is a debut song recorded by American country music group Trick Pony. It was released in October 2000 as the first single from their debut album \"Trick Pony\". The song was written by group members Keith Burns, Ira Dean and Heidi Newfield with Rory Waters Beighley and Sammy Harp Wedlock.\nQuestion: Trick Pony never had a hit song. True, False, or Neither? Neither\n###\nDovyalis is a genus of shrubs and small trees. Recent genetic evidence has shown the genus to belong to the family Salicaceae; formerly it was classified in the family Flacourtiaceae. The 15 species are native to Africa (Ethiopia south to South Africa) and southern Asia (India, Sri Lanka). Some are cultivated for their fruit.\nQuestion: There are 15 species of Dovyalis that are cultivated for their fruit. 
True, False, or Neither?", "doc_id": 281, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44726, 28450, 25666, 28779], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Masquerade (Russian: \u041c\u0430\u0441\u043a\u0430\u0440\u0430\u0434 ) is a verse play written in 1835 by the Russian Romantic writer Mikhail Lermontov. The four-act play, set in 1830's St. Petersburg aristocratic society, highlights the rebellious spirit and noble mind of the protagonist, Yevgeny Arbenin. It is often compared with Shakespeare's \"Othello\" in its essential story line.\nQuestion: The play was written in 1830+8 True, False, or Neither? False\n###\nDemi Lovato: Live in Concert (also known as the Summer Tour 2009) was the debut headlining concert tour by American singer Demi Lovato, launched in support of her debut album \"Don't Forget\" (2008) and the second studio album \"Here We Go Again\" (2009).\nQuestion: Demi Lovato has only released one album. True, False, or Neither? False\n###\nSilent Scream (originally known as The Retreat) is an independent, horror film directed by Matt Cantu and Lance Kawas and starring Scott Vickaryous, Melissa Schuman and Shanti Lowry. It premiered at the Chicago Horror Film Festival on October 28, 2005 and was released on DVD on December 5, 2006.\nQuestion: Silent Scream made for a good christmas present True, False, or Neither? Neither\n###\nMonique Brumby (born 16 September 1974, Devonport) is an Australian Indie pop/rock singer-songwriter, guitarist and producer. Her debut single, \"Fool for You\", peaked into the top 40 in the Australian Recording Industry Association (ARIA) ARIA Singles Charts, and provided an ARIA Award for 'Best New Talent' in 1996. Her single, \"Mary\", won an ARIA Award in 1997 for 'Best Female Artist'.\nQuestion: Monique was born in the 20th century. True, False, or Neither? True\n###\nThere have been four head coaches of the Houston Texans, a professional American football team based in Houston, Texas, United States. The Texans play in the South Division of the American Football Conference (AFC) in the National Football League (NFL).\nQuestion: The Houston Texans have had less than sixty head coaches. True, False, or Neither?", "doc_id": 469, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21225, 26710, 42803, 35395], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. 
It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\".\nQuestion: The re-release occurred in the year after 2010. True, False, or Neither? True\n###\nRobert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983.\nQuestion: Robb Report started out as Twentieth Century Confederates while it's founder Robert L. \"Rusty\" White was an undergraduate in the University of Mississippi in the 1960s before it was sold some yeas later. True, False, or Neither? True\n###\n\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode.\nQuestion: Kristen Bell is not named Elle. True, False, or Neither? False\n###\n\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals.\nQuestion: Mark's birthday was the same day their hit single was released. True, False, or Neither? True\n###\nAlexander Vincent LoScialpo (born April 29, 1981) is an American actor. He is known for his role as Andy Barclay in the \"Child's Play\" franchise. He has appeared in \"Child's Play\" (1988), \"Child's Play 2\" (1990), \"Curse of Chucky\" (2013), and \"Cult of Chucky\" (2017).\nQuestion: Alexander Vincent LoScialpo starred in 5 Child's Play films True, False, or Neither?", "doc_id": 241, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5099, 6530, 3892, 30846], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Inbetweeners\" is a BAFTA Award-winning British sitcom created by Damon Beesley and Iain Morris, and broadcast on E4. The series follows the lives of four sixth form students \u2013 Will McKenzie (Simon Bird), Simon Cooper (Joe Thomas), Jay Cartwright (James Buckley) and Neil Sutherland (Blake Harrison). The series is narrated by Will, who acts as the programme's lead character.\nQuestion: The Inbetweeners was released in 199 True, False, or Neither? Neither\n###\nMatthew Wayne Darwin (born March 11, 1963 in Houston, Texas) is a former professional American football center in the National Football League (NFL) for the Philadelphia Eagles. He was drafted twice, first in the 1985 NFL Draft by the Dallas Cowboys and finally in the 1986 NFL Draft by the Eagles. 
He played college football at Texas A&M University.\nQuestion: Matthew Wayne Darwin was drafted by the Eagles after he was drafted by the Cowboys. True, False, or Neither? True\n###\nThe Winter Hill air disaster occurred on 27 February 1958 when the Silver City Airways Bristol 170 Freighter \"G-AICS\", traveling from the Isle of Man to Manchester, England, crashed into Winter Hill (also known as Rivington Moor) several hundred yards away from the Independent Television Authority's Winter Hill transmitting station.\nQuestion: The Winter Hill air disaster was said to be the worst flight crash of its time. True, False, or Neither? Neither\n###\nDjibouti, officially the Republic of Djibouti, is a country located in the Horn of Africa. It is bordered by Eritrea in the north, Ethiopia in the west and south, and Somalia in the southeast. The remainder of the border is formed by the Red Sea and the Gulf of Aden at the east.\nQuestion: It is bordered by a sea with a name that starts with an R True, False, or Neither? True\n###\nEast of West is a monthly comic book series published by Image Comics which debuted in March 2013. Created by writer Jonathan Hickman and artist Nick Dragotta, the book is a science fiction Western set in a dystopian version of the United States whose fate rests with the Four Horsemen of the Apocalypse.\nQuestion: East of West is a book that includes pictures True, False, or Neither?", "doc_id": 287, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18931, 35985, 22198, 21349], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sun Also Rises is a one-act opera by Webster A. Young, based on Ernest Hemingway's \"The Sun Also Rises\". It is one of a pair of Hemingway works that Young adapted into operas. The opera's libretto is by the composer, and includes direct quotations from the novel. It premiered on May 7, 2000 at the Long Island Opera.\nQuestion: Since Webster A. Young's The Sun Also Rises is based on Ernest Hemingway's, there is a marked improvement in interludes, composition, libretto, and quotations. True, False, or Neither? Neither\n###\nThe Vienna State Opera (German: Wiener Staatsoper ) is an Austria opera house and opera company based in Vienna, Austria. It was originally called the Vienna Court Opera (Wiener Hofoper). In 1920, with the replacement of the Habsburg Monarchy by the First Austrian Republic, it was renamed the Vienna State Opera. The members of the Vienna Philharmonic are recruited from its orchestra.\nQuestion: The Vienna State Opera currently has 5 opera singers. True, False, or Neither? Neither\n###\nThe Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships.\nQuestion: The 98100 square foot center will be expanded. True, False, or Neither? 
Neither\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017.\nQuestion: Justin Tinucci appears as the devil in the upcoming film Devil's Whisper True, False, or Neither? Neither\n###\nThe Oakland Athletics' 1985 season involved the A's finishing 4th in the American League West with a record of 77 wins and 85 losses. While the Athletics' on-field performance continued to disappoint, the debut of slugger Jose Canseco gave fans a measure of hope.\nQuestion: Jose Canseco will play baseball next year. True, False, or Neither?", "doc_id": 255, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20168, 27087, 10257, 36134], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\".\nQuestion: The 1980 album \"Monty Python's Contractual Obligation Album\" was released on January 19th. True, False, or Neither? Neither\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits.\nQuestion: Shades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple and contains rare edits and singles which are remastered along with album versions of their biggest and rarest hits. True, False, or Neither? Neither\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer.\nQuestion: Loui Jover knows what light aperture is True, False, or Neither? True\n###\nLamarck Island is a rocky island 250 m long, lying 300 m east of Petrel Island and 300 m north-east of Rostand Island in the G\u00e9ologie Archipelago, off the Ad\u00e9lie Coast of Antarctica. It was charted in 1951 by the French Antarctic Expedition and named by them after Jean-Baptiste Lamarck, the French naturalist.\nQuestion: Jean-Baptiste Lamarck often said he was proud to have the island named after him. True, False, or Neither? Neither\n###\nBaby Mine is a 1928 silent film comedy produced and distributed by MGM. This film is a remake of the 1917 film \"Baby Mine\" both being based on Margaret Mayo's 1910 Broadway comedy \"Baby Mine\". This film stars Karl Dane, George K. 
Arthur and Charlotte Greenwood and is her third feature film, she having made two previous films in 1916 and 1918.\nQuestion: The number of words spoken in Baby Mine is equal to the number of pigs who could fly True, False, or Neither?", "doc_id": 666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11010, 7715, 343, 38117], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vice Admiral Sir Timothy James Hamilton Laurence {'1': \", '2': \", '3': \", '4': \"} (born 1 March 1955) is a retired British naval officer and the second husband of Princess Anne, the only daughter of Queen Elizabeth II and Prince Philip. Laurence was Equerry to the Queen from 1986 to 1989.\nQuestion: Laurence is the first husband of Princess Anne True, False, or Neither? False\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006.\nQuestion: Oliver Francis O'Grady was born in the 19th century. True, False, or Neither? False\n###\nHidden City Entertainment was a game publisher founded in 2004 (as Hidden City Games, Inc.) by Jesper Myrfors and Paul Peterson to develop and market the chip-throwing game, \"Clout Fantasy.\" After Clout was developed the company recruited Peter Adkison as CEO.\nQuestion: The company that developed Clout Fantasy never had a CEO. True, False, or Neither? False\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Leonard and Mae Murray had three children. True, False, or Neither? Neither\n###\nConnacht Rugby (Irish: \"Rugba\u00ed Connachta\" ) is one of the four professional provincial rugby teams from the island of Ireland. Connacht competes in the Pro14 and the European Rugby Challenge Cup. The team represents the IRFU Connacht Branch, which is one of four primary branches of the IRFU, and is responsible for rugby union throughout the geographical Irish province of Connacht.\nQuestion: The team plays in the Republic of Ireland. True, False, or Neither?", "doc_id": 524, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12343, 45422, 5616, 44926], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ghost Notes is the fifth full-length studio album by American rock band Veruca Salt, released on July 10, 2015, through El Camino Records. Produced by Brad Wood, who also produced the band's debut \"American Thighs\", it is the first to feature the band's original lineup since their second album, \"Eight Arms to Hold You\" (1997).\nQuestion: Ghost Notes was released in the winter of 2015. True, False, or Neither? False\n###\nBeyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John LaZar, Michael Blodgett and David Gurian. The film was directed by Russ Meyer and co-written by Meyer and Roger Ebert.\nQuestion: Beyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John Blodgett, Michael LaZar and David Gurian. True, False, or Neither? False\n###\n\"Girl in a Country Song\" is the debut single by American country music duo Maddie & Tae, co-written with Aaron Scherz and released in July 2014. The song is an answer to the \"bro-country\" subgenre in contemporary country music, specifically in how women are portrayed by men, with lyrics containing references to a variety of popular recent country songs.\nQuestion: Maddie & Tae are country music performers. True, False, or Neither? True\n###\nThe Office is a British mockumentary sitcom, first broadcast in the United Kingdom on BBC Two on 9 July 2001. Created, written and directed by Ricky Gervais and Stephen Merchant, the programme is about the day-to-day lives of office employees in the Slough branch of the fictitious Wernham Hogg Paper Company. Gervais also stars in the series, playing the central character, David Brent.\nQuestion: Brent is not a fictitious character. True, False, or Neither? False\n###\nSvensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord.\nQuestion: Svensk Hyllningsfest started the year after 1939. True, False, or Neither?", "doc_id": 590, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14926, 7409, 22245, 14746], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A startup company (startup or start-up) is an entrepreneurial venture which is typically a newly emerged, fast-growing business that aims to meet a marketplace need by developing a viable business model around an innovative product, service, process or a platform. 
A startup is usually a company designed to effectively develop and validate a scalable business model.\nQuestion: A startup company has not been around for a while. True, False, or Neither? True\n###\nGloria Stavers (October 3, 1927 \u2013 April 1, 1983) was the editor in chief of \"16 Magazine\". Her personality gave this teen celebrity magazine its stamp for many years. Stavers is credited with being one of the first women rock and roll journalists, but male editors, detractors and those who scoffed at teen or celebrity magazines sometimes called her \"Mother Superior of the Inferior\".\nQuestion: There was no other female rock and roll journalists before Gloria Stavers True, False, or Neither? True\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style.\nQuestion: Those that wrestle with AAA use high flying. True, False, or Neither? True\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band.\nQuestion: The album was one of the best they ever produced. True, False, or Neither? Neither\n###\nMarion Anna Fischer (born July 18, 1986 in East Berlin) is a German actress and singer. Since 2003, she appeared in over 30 film and television roles in appearance. She is most recognised to international audiences as the innocent vampire \"Nora\" in Dennis Gansel's drama film \"We Are The Night\"\nQuestion: Marion Anna Fischer first appeared in films at the age of twelve True, False, or Neither?", "doc_id": 792, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21410, 1145, 33547, 16498], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Unternehmen Herbstnebel (\"Operation Autumn Mist\") was an offensive planned by German Field Marshal Walter Model and his Army Group B operational staff in late 1944 during World War II. It envisaged a German Army attack targeting the Allied forces in eastern Belgium and Luxembourg, east of the Meuse River.\nQuestion: Walter Model was the only Field Marshal who planned Operation Autumn Mist. True, False, or Neither? Neither\n###\nThe Sandlot is a 1993 American coming-of-age baseball film co-written and directed by David M. Evans, which tells the story of a group of young baseball players during the summer of 1962. It stars Tom Guiry, Mike Vitar, Karen Allen, Denis Leary and James Earl Jones. The filming locations were in Glendale, Midvale, Salt Lake City, and Ogden, Utah.\nQuestion: The filming locations were in Glendale, Seattle, Salt Lake City, and Ogden, Utah True, False, or Neither? 
False\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines is a non-profit news organization. True, False, or Neither? True\n###\nNo. 27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville.\nQuestion: No. 27 Sqaudron RAAF is a well known squadron. True, False, or Neither? Neither\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee\nQuestion: Peter Straub was an american author that wrote the novel Mystery just one year after his birth in 1990 True, False, or Neither?", "doc_id": 892, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42323, 22447, 44786, 35055], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stoked (stylized as \"St\u014dked\") is a Canadian animated series produced by Fresh TV that premiered on Teletoon on June 25, 2009 and ended on January 26, 2013. It formerly aired on Teletoon in Canada and ABC3 in Australia, and on Cartoon Network in the United States. The series is from the same creators as \"6teen\" and the \"Total Drama\" series.\nQuestion: Stoked was released in English True, False, or Neither? Neither\n###\nKinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California.\nQuestion: Santa Teresa is a fictional town True, False, or Neither? True\n###\nJ\u00fcrgen Melzer (born 22 May 1981 in Vienna) is an Austrian tennis player. He reached a career-high singles ranking of world No. 8 in April 2011, and a doubles ranking of world No. 6 in September 2010. He is a left-handed tennis player, but is right-handed in everyday life. He has a younger brother, Gerald Melzer, with whom he has played doubles in several tournaments.\nQuestion: J\u00fcrgen Melzer starts with an A. True, False, or Neither? False\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. 
It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017.\nQuestion: The Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, but they aren't the game's developer. True, False, or Neither? False\n###\nGenevieve LaCaze (born 4 August 1989) is an Australian athletics competitor who specialises in the 3000 metre steeplechase. She held an athletics scholarship at the University of Florida. She was selected to represent Australia at the 2012 Summer Olympics in London and Athletics at the 2016 Summer Olympics in Rio de Janeiro. LaCaze is of French, Italian and Spanish descent.\nQuestion: Genevieve LaCaze was born more than 1000 weeks ago. True, False, or Neither?", "doc_id": 588, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15066, 22050, 19984, 40132], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "My Dinner with Herv\u00e9 is an upcoming American television drama film directed and written by Sacha Gervasi based on the later days of actor Herv\u00e9 Villechaize. The film stars Peter Dinklage as Villechaize, Jamie Dornan as a struggling journalist, and Andy Garc\u00eda as Ricardo Montalb\u00e1n, Villechaize\u2019s \"Fantasy Island\" co-star.\nQuestion: The film is about Villechaize's senior years. True, False, or Neither? True\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris.\nQuestion: Yellow Ledbetter has acting. True, False, or Neither? True\n###\nJames Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\".\nQuestion: James Wyatt won a literary award for \"Dungeon Master's Guide\". True, False, or Neither? Neither\n###\nJacob (Jake) Ruppert Jr. (August 5, 1867\u00a0\u2013 January 13, 1939) was an American brewer, businessman, National Guard colonel and United States Congressman who served for four terms representing New York from 1899 to 1907. He also owned the New York Yankees of Major League Baseball from 1915 until his death in 1939.\nQuestion: Jake Ruppert crafted beverages made of barley and malt. True, False, or Neither? Neither\n###\nRed Earth, White Lies: Native Americans and the Myth of Scientific Fact is a book by Native American author Vine Deloria, originally published in 1995. 
The book's central theme is to criticize the scientific consensus which has, in his words, created \"a largely fictional scenario describing prehistoric North America\".\nQuestion: Red Earth, White Lies: Native Americans and the Myth of Scientific Fact has an A. True, False, or Neither?", "doc_id": 50, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4514, 37880, 45073, 19820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology.\nQuestion: Alexadrov was an american citizen True, False, or Neither? False\n###\nKdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity.\nQuestion: The company does not exist. True, False, or Neither? False\n###\nThe New Pornographers is a Canadian indie rock band formed in 1997 in Vancouver, British Columbia. Presented as a musical collective of singer-songwriters and musicians from multiple projects, the band has released seven studio albums to critical acclaim for their use of multiple vocalists and elements of power pop incorporated into their music.\nQuestion: The New Pornographers are amateur pornographers True, False, or Neither? False\n###\nWKKF \"(102.3 FM)\" - branded as Kiss 102.3 - is a Top 40 (CHR) station licensed to Ballston Spa, New York and serving the Capital District and Adirondacks. The station is owned by iHeartMedia and broadcasts at 102.3 FM at 4,100 watts ERP from a transmitter in Clifton Park, New York on a tower shared with WDCD-FM and WTMM-FM.\nQuestion: People in the Adirondacks can listen to Kiss True, False, or Neither? True\n###\nJohnny Kidd (born Frederick Albert Heath, 23 December 1935 \u2013 7 October 1966) was an English singer and songwriter, best remembered as the lead vocalist for the rock and roll band Johnny Kidd & the Pirates. He was one of the few pre-Beatles British rockers to achieve worldwide fame, mainly for his 1960 hit, \"Shakin' All Over\".\nQuestion: Kidd died before he could see his song Shakin' All Over become a hit. True, False, or Neither?", "doc_id": 154, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20097, 11211, 36689, 11642], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site.\nQuestion: The Cincinnati and Whitewater Canal Tunnel was constructed in 1832 True, False, or Neither? False\n###\nBovegno is a \"comune\" in the province of Brescia, in Lombardy. It borders the communes of Artogne, Berzo Inferiore, Bienno, Collio, Esine, Gianico, Irma, Marmentino and Pezzaze. It is pronounced B\u00f2vegno (\"B\u00f6egn\" in the local Eastern Lombard dialect). It is located in the valley named Val Trompia.\nQuestion: It is not located in a valley True, False, or Neither? False\n###\nWalcha Shire is a local government area located in the New England region of New South Wales, Australia. The Shire is situated adjacent to the junction of the Oxley Highway and Thunderbolts Way and is 20 km east of the Main North railway line passing through Walcha Road.\nQuestion: The Shire is situated adjacent to the junction of the Oxley Highway and Thunderbolts Way and is 204 km east of the Main North railway line passing through Walcha Road. True, False, or Neither? False\n###\nA Bad Girl in Harlem is the second studio album by Danish rock band New Politics, released on May 21, 2013 via RCA Records. The three members moved from Copenhagen to Brooklyn, where the material was recorded. Two singles were released, titled \"Harlem\" and \"Tonight You're Perfect\". Allmusic.com called the album 'hooky, infectious pop'.\nQuestion: New Politics released their first studio album on May 12, 2012. True, False, or Neither? Neither\n###\n\"Stagger Lee\", also known as \"Stagolee\" and other variants, is a popular American folk song about the murder of Billy Lyons by \"Stag\" Lee Shelton in St. Louis, Missouri at Christmas, 1895. The song was first published in 1911, and was first recorded in 1923 by Fred Waring's Pennsylvanians. A version by Lloyd Price reached number one on the \"Billboard\" Hot 100 in 1959.\nQuestion: The song Stagger Lee was first published over 100 years ago True, False, or Neither?", "doc_id": 465, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8919, 4516, 11035, 7534], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Three Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: The film was released last century. True, False, or Neither? 
True\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Sushil Kumar Shinde was an amazing vice president of India. True, False, or Neither? Neither\n###\nLe roi malgr\u00e9 lui (\"King in Spite of Himself\" or \"The reluctant king\") is an op\u00e9ra-comique in three acts by Emmanuel Chabrier of 1887 with an original libretto by Emile de Najac and Paul Burani. The opera is revived occasionally, but has not yet found a place in repertory.\nQuestion: The chamber is from 1885 True, False, or Neither? False\n###\nVixen! is a 1968 satiric softcore sexploitation film directed by American motion picture director Russ Meyer. It was the first film to be given an X rating for its sex scenes, and was a breakthrough success for Meyer. The film was developed from a script by Meyer and Anthony James Ryan, and starred Erica Gavin.\nQuestion: Meyer professionally suffered for making a smut film. True, False, or Neither? False\n###\nThe Wombats are an English rock band formed in Liverpool in 2003. The band is composed of lead vocalist and guitarist Matthew Murphy, drummer Dan Haggis, and bassist Tord \u00d8verland Knudsen, and has been since its inception. The band is signed to 14th Floor Records in the United Kingdom and Bright Antenna in the United States. The Wombats' albums have sold over 1 million copies worldwide.\nQuestion: The Wombats have 3 original band members. True, False, or Neither?", "doc_id": 564, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17414, 25138, 19606, 35516], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ludwig Ruckdeschel (15 March 1907 \u2013 8 November 1986) was the Nazi \"Gauleiter\" of Bayreuth during final month of the \"Gau's\" existence before the collapse of Nazi Germany in 1945. Before this, from 1933 to 1941, he served as the deputy of Gauleiter Fritz W\u00e4chtler, whom he had executed on orders by Martin Bormann. From 1933 to 1945 he was also a member of the German Parliament, the Reichstag.\nQuestion: Ruckdeschel zodiac sign was Pisces. True, False, or Neither? True\n###\nMads Wiel Nygaard's Endowment is an annually awarded literary prize from the publishing house Aschehoug. The prize is a recognition of superior literary work. The publisher's editorial management makes the award based on their collective judgement of merit. Applications are not accepted.\nQuestion: Aschehoug is a respected publishing house. True, False, or Neither? Neither\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Petasites is a type of cat True, False, or Neither? 
False\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway.\nQuestion: There were no inhabitants of Princess Ragnhild Coast when Capt. Riiser-Larsen and Capt. Larsen discovered it in 1931. True, False, or Neither? Neither\n###\nSilent Scream (originally known as The Retreat) is an independent, horror film directed by Matt Cantu and Lance Kawas and starring Scott Vickaryous, Melissa Schuman and Shanti Lowry. It premiered at the Chicago Horror Film Festival on October 28, 2005 and was released on DVD on December 5, 2006.\nQuestion: Silent Scream premiered at the Chicago Horror Film Festival more than 3000 days ago. True, False, or Neither?", "doc_id": 195, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7822, 44234, 30374, 38820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Oceanus ( ; Greek: \u1f68\u03ba\u03b5\u03b1\u03bd\u03cc\u03c2 \"\u014ckean\u00f3s\", ] ), also known as Ogenus (\"Ogenos\", \u03a9\u03b3\u03b7\u03bd\u03bf\u03c2) or Ogen (\u03a9\u03b3\u03b7\u03bd), was a divine figure in classical antiquity, believed by the ancient Greeks and Romans to be the divine personification of the sea, an enormous river encircling the world.\nQuestion: Greeks and Romans foresaw that in English we would change it to \"Ocean\". True, False, or Neither? Neither\n###\nCharles Dera is an American pornographic actor, dancer, and model. He has performed in hundreds of heterosexual pornographic movies and is also \u2018The Veteran\u2019 in the male strip troupe \u2018Men of the Strip\u2019. In 2016, Charles Dera played the role of Donald Trump and Cherie Deville played the role of Hillary Clinton in the parody American elections for Brazzers.\nQuestion: Charles Dera is anti-Trump True, False, or Neither? Neither\n###\nMadava Farms is an 800-acre certified organic maple syrup enterprise located primarily in Dover, New York. The farm is the maker of Crown Maple Syrup, and it is considered to be the largest maple syrup production facility in North America. \nQuestion: Dover, New York is in northern New York. True, False, or Neither? Neither\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The Gator Bowl was a game between the Arkansas Razorbacks and the Georgia Tech Yellow Jackets and it was played in cold weather. True, False, or Neither? 
Neither\n###\nCharlotte Marie Pomeline Casiraghi (born 3 August 1986) is the second child of Caroline, Princess of Hanover, and Stefano Casiraghi, an Italian industrialist. She is ninth in line to the throne of Monaco. Her maternal grandparents were Rainier III, Prince of Monaco, and American actress Grace Kelly. She is named after her maternal great-grandmother, Princess Charlotte, Duchess of Valentinois.\nQuestion: Casiraghi was born in the eighth month. True, False, or Neither?", "doc_id": 482, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31429, 21493, 37795, 18849], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The shooting of John Crawford III occurred on August 5, 2014. Crawford was a 22-year-old African-American man shot to death by Beavercreek police officer Sean Williams, in a Walmart store in Beavercreek, Ohio, near Dayton, while holding a toy BB gun.\nQuestion: The victim was not holding a real gun. True, False, or Neither? True\n###\nA Common Land Unit (CLU) is the smallest unit of land that has a permanent, contiguous boundary, a common land cover and land management, a common owner and a common producer in agricultural land associated with USDA farm programs. CLU boundaries are delineated from relatively permanent features such as fence lines, roads, and/or waterways.\nQuestion: A Common Land Unit is only a show. True, False, or Neither? False\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery.\nQuestion: One of the busiest taverns in London is The Anchor Bankside tavern. True, False, or Neither? Neither\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists.\nQuestion: Contra Conspiracy is a 1993 action film True, False, or Neither? False\n###\nThe Hun River (\u6e3e\u6cb3, \"the muddy river\") is a river in Liaoning Province, China, and was formerly one of the largest tributaries of the Liao River. It was also formerly known as Shen River (\u700b\u6c34). Two of Liaoning's most important cities, the provincial capital Shenyang and the seventh largest city Fushun, are located on the Hun River.\nQuestion: The Hun River is in the southern hemisphere True, False, or Neither?", "doc_id": 808, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31294, 17478, 10396, 15351], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "KnowledgeWare was a software company headquartered in Atlanta, Georgia co-founded by James Martin and run by Fran Tarkenton. It produced a Computer Aided Software Engineering (CASE) tool called IEW (Information Engineering Workbench). KnowledgeWare was sold to Sterling Software in 1994, which was in its turn acquired by Computer Associates.\nQuestion: KnowledgeWare was sold in 1995. True, False, or Neither? False\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty.\nQuestion: The Kilpatrick and Beatty text-messaging scandal involved police chief Gary Brown and Donald Trump True, False, or Neither? False\n###\nLive from the Gaiety is a live album by The Dubliners. It was recorded during the Irish leg of their tour celebrating forty years on the road. The double album was recorded at the Gaiety Theatre in Dublin in June 2002. All surviving members took part. A companion double DVD of the concert in its entirety was also released.\nQuestion: The album was recorded during the British leg of the tour True, False, or Neither? False\n###\nJurassic Park is a 1993 video game based on the film and novel of the same name. It was developed and published by Ocean Software and released for the Nintendo Entertainment System (NES). Ocean also released \"Jurassic Park\" on the handheld Game Boy console. The Game Boy version is a port of the NES version.\nQuestion: The Jurassic Park video game released for the Nintendo Entertainment System (NES) was widely popular. True, False, or Neither? Neither\n###\nThe Sea Hornet is a 1951 American adventure film directed by Joseph Kane and written by Gerald Drayson Adams. The film stars Rod Cameron, Adele Mara, Lorna Gray, Chill Wills, Jim Davis and Richard Jaeckel. The film was released on November 6, 1951, by Republic Pictures.\nQuestion: The Sea Hornet was released after Thanksgiving Day in 1951 True, False, or Neither?", "doc_id": 333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11026, 4936, 28, 39383], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kiss of the Spider Woman is a musical with music by John Kander and Fred Ebb, with the book by Terrence McNally. It is based on the Manuel Puig novel \"El Beso de la Mujer Ara\u00f1a\". The musical had runs in the West End (1992) and Broadway (1993) and won the 1993 Tony Award for Best Musical.\nQuestion: The music from Kiss of the Spider Woman was written over the course of 6 months. True, False, or Neither? Neither\n###\nThe 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. 
It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017.\nQuestion: The ATP Challenger Tour was cancelled in 2019 True, False, or Neither? Neither\n###\nCherry Tomato () is a 2008 South Korean film starring Shin Goo and Kim Hyang-gi. The family drama, a directorial debut by Jung Young-bae, depicts the poverty-stricken life of an old man and his granddaughter that evokes a strong sense of sympathy and helplessness. It was screened at the Busan Children\u2019s Film Festival in 2008.\nQuestion: Jung Young-bae directed one film in 2008 True, False, or Neither? Neither\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles.\nQuestion: Sadhu Kokila has won many awards. True, False, or Neither? Neither\n###\nThe Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue.\nQuestion: The Sky Bar at the Drake Hotel in Toronto has street-level access. True, False, or Neither?", "doc_id": 306, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20650, 37623, 5078, 15819], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arizona Business Magazine, based out of Phoenix, Arizona, is the state\u2019s leading monthly Business magazine. Published by AZ Big Media, the magazine covers a wide range of topics focusing on the Arizona business scene, and is aimed at high-level corporate executives and business owners.\nQuestion: People in Australia read this magazine. True, False, or Neither? Neither\n###\nFC Saturn-1991 Saint Petersburg (Russian: \u0424\u041a \u00ab\u0421\u0430\u0442\u0443\u0440\u043d\u20111991\u00bb \u0421\u0430\u043d\u043a\u0442\u2011\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433 ) was a Russian football team from Saint Petersburg. It played professionally from 1992 to 1995, including 3 seasons (1993\u20131995) in the second-highest Russian First Division. In 1996 it merged with FC Lokomotiv Saint Petersburg. Before 1995 it was called FC Smena-Saturn Saint Petersburg.\nQuestion: They were formed over 7 years ago True, False, or Neither? True\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr.\nQuestion: There was at least one comedy film in 1947. True, False, or Neither? 
True\n###\nAdwoa Aboah (born 18 May 1992) is a British fashion model and feminist activist, of Ghanaian origin In March 2017, she appeared on the cover of American Vogue with Liu Wen, Ashley Graham, Kendall Jenner, Gigi Hadid, Imaan Hammam, and Vittoria Ceretti. She has also been on the cover of Vogue Italia and i-D.\nQuestion: i-D magazine is an American magazine. True, False, or Neither? Neither\n###\nRylstone was a former electoral district of the Legislative Assembly in the Australian state of New South Wales, created in 1894 from part of Mudgee and named after and including Rylstone. It was abolished in 1904, with the downsizing of the Legislative Assembly after Federation.\nQuestion: Legislative Assembly was abolished 8 years after the founding True, False, or Neither?", "doc_id": 857, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31556, 40668, 99, 12898], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Newlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005.\nQuestion: Newlyweds was a show about a boyband member and his wife. True, False, or Neither? True\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology.\nQuestion: Pavel's brother wrote the three hundred papers attributed to Pavel. True, False, or Neither? Neither\n###\nGreed is the third studio album by American experimental rock band Swans. It was released in 1986, through record label K.422. \"Greed\" marks the slow turning point for Swans away from the harsh, brutal noise rock of prior releases, and is also the first Swans album to contain contributions from Jarboe.\nQuestion: Greed was released before 1985. True, False, or Neither? False\n###\nTerry Butler is an American bassist who currently performs with the death metal bands Obituary and Massacre. He was also a member of Six Feet Under and Death. He was credited on the Death album \"Spiritual Healing\", and band leader Chuck Schuldiner stated that on the latter Death album \"Terry contributed to the songwriting as well\".\nQuestion: Terry Butler loves dogs True, False, or Neither? Neither\n###\nBronwen (] ) is a Welsh feminine given name. It is closely associated with the similar name \"Branwen\", which appears in medieval Welsh literature. 
Used in Wales since the 19th century, it was introduced to the English-speaking public at large by a character in the Richard Llewellyn novel \"How Green Was My Valley\" (1939).\nQuestion: Bronwen was a named based on a novel True, False, or Neither?", "doc_id": 342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8825, 23792, 40962, 44612], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Texas Monthly v. Bullock 489 U.S. 1 (1989) was a case brought before the US Supreme Court in November 1988. The case (initiated by the publishers of \"Texas Monthly\", a well-known general-interest magazine in Texas) was to test the legality of a Texas statute that exempted religious publications from paying state sales tax.\nQuestion: Sales tax policy changed as a result of this action. True, False, or Neither? Neither\n###\nLeonard Edgcombe (died 1696) was a ship's captain with the Hudson's Bay Company who made a number of voyages into Hudson Bay and James Bay on behalf of the company. He had Henry Baley as a chief mate for a time prior to 1692 and this mariner became an important link with the area for the Hudson's Bay Company.\nQuestion: Leonard Edgcombe served in at least five jobs over the course of his life. True, False, or Neither? Neither\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent.\nQuestion: The electric chair position, in professional wrestling, involves flying on that prone opponent. True, False, or Neither? True\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th.\nQuestion: Vasili Vyacheslavovich Blagov starts with an A. True, False, or Neither? False\n###\nJonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films.\nQuestion: saturday night live was a show from 1985 to 1990 True, False, or Neither?", "doc_id": 74, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26396, 45089, 6826, 38680], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The John Coltrane Home is a house in the Dix Hills neighborhood of Huntington, Suffolk County, New York, where saxophonist John Coltrane lived from 1964 until his death in 1967. It was in this home that he composed his landmark work, \"A Love Supreme\".\nQuestion: John Coltrane was a saxophonist. True, False, or Neither? True\n###\nGeorge William Lyttelton, 4th Baron Lyttelton, {'1': \", '2': \", '3': \", '4': \"} (31 March 1817 \u2013 19 April 1876) was a British aristocrat and Conservative politician from the Lyttelton family. He was chairman of the Canterbury Association, which encouraged British settlers to move to New Zealand.\nQuestion: George William Lyttleton lived to be 72 years old. True, False, or Neither? False\n###\nThe Leader of the Opposition of Singapore is usually the leader of the second largest political party represented in the Parliament of Singapore. During the 1955 Legislative Assembly election, the late Lee Kuan Yew was the \"de facto\" Leader of the Opposition, as the People's Action Party was then the second largest political party represented in the Legislative Assembly.\nQuestion: The Leader of the Opposition of Singapore is a unpopular political party True, False, or Neither? Neither\n###\nLudovic (Ludo) Coeck (25 September 1955 \u2013 9 October 1985) was a Flemish-Belgian footballer who played as left winger or central midfielder. His clubs included Berchem Sport, Anderlecht, Internazionale and Ascoli Calcio. He was capped for the Belgian national team 46 times.\nQuestion: Ludovic Coeck spent the majority of his playing time as a left winger. True, False, or Neither? Neither\n###\nBosch is an American police procedural web television series produced by Amazon Studios and Fabrik Entertainment. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show was developed for Amazon by Eric Overmyer and the first season takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: Bosch is Mexican. True, False, or Neither?", "doc_id": 471, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20852, 37077, 22013, 4847], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel.\nQuestion: kiss and tell was an objectively bad song True, False, or Neither? 
Neither\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit.\nQuestion: Sandstone can be found near the top of a mountain in the Spring Mountain range. True, False, or Neither? True\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: Ralph D. Malone was tall. True, False, or Neither? Neither\n###\nHawthorne is a census-designated place (CDP) in Mineral County, Nevada, United States. At the 2010 census, the population was 3,269, a decrease since the 2000 census, when it was 3,311. It is the county seat of Mineral County. The nearby Hawthorne Army Depot is the primary economic base of the town.\nQuestion: Hawthorne Army Depot which is the primary economic base of the town,is located far away from the town. True, False, or Neither? False\n###\nTim Witherspoon (born December 27, 1957) is an American former professional boxer who competed from 1979 to 2003. He is a two-time world heavyweight champion, having held the WBC title in 1984, and the WBA title in 1986. Upon winning his second world title, Witherspoon joined Floyd Patterson and Muhammad Ali as the only boxers to win multiple world heavyweight championships.\nQuestion: Witherspoon was boxing while Jimmy Carter lived in the white house. True, False, or Neither?", "doc_id": 993, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4455, 1555, 4540, 19710], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "True as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: True as a Turtle has a different author or the screenplay and the original novel. True, False, or Neither? False\n###\nIdris Sultan (born January 1993) is a Tanzanian Actor and comedian, actor and radio host who won the Big Brother Africa-Hotshots in 2014. He hosts the biggest comedy news show called SIO HABARI, he also hosts a radio show called MWB(Mji wa burudani) on ChoiceFm Tanzania.\nQuestion: Idris Sultan is a Tanzanian actor, comedian, and radio host who hosts a music radio show called MWB on ChoiceFM Tanzania. True, False, or Neither? Neither\n###\nReturn to Paradise is a live album by Styx, released in 1997. It features songs from their successful reunion tour with Tommy Shaw, but without John Panozzo, who died in July 1996. 
It includes three new studio tracks, including \"Dear John\", which Shaw wrote as a tribute to Panozzo.\nQuestion: Styx both gained and lost a member between 1996 and 1997. True, False, or Neither? True\n###\nRecently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct.\nQuestion: Eighty mammal species have become extinct. True, False, or Neither? True\n###\nLathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson.\nQuestion: Lathan McKay is fluent in English. True, False, or Neither?", "doc_id": 28, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21140, 11234, 11318, 28000], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Little Fluffy Gigolo Pelu (Japanese: \u30d5\u30a1\u30f3\u30b7\u30fc\u30b8\u30b4\u30ed \u30da\u30eb , Hepburn: Fansh\u012b Jigoro Peru , a.k.a. \"Fancy Gigolo Pelu\") is a three \"tank\u014dbon\" manga series written and illustrated by Junko Mizuno and published by Enterbrain. The series has been licensed in North America and France where the first volume received mostly positive reviews.\nQuestion: Junko made a series of comics that got published True, False, or Neither? True\n###\nYissachar Dov Rokeach (born 19 January 1948) is the fifth and present Rebbe of the Hasidic dynasty of Belz. He is the son of Rabbi Mordechai of Bilgoray (1902 \u2013 1949), the grandson of the third Belzer Rebbe, Rabbi Yissachar Dov Rokeach, and the nephew of the fourth Belzer Rebbe, Rabbi Aharon Rokeach, who raised him. He has led Belz since 1966.\nQuestion: Yissachar Dov Rokeach is 71 years old. True, False, or Neither? True\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy.\nQuestion: Daniel Murphy was born February 24, 1996. True, False, or Neither? Neither\n###\nKew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge.\nQuestion: kew bridge railway station is in hounslow True, False, or Neither? True\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. 
He is currently the quarterback coach at Garinger High School in Charlotte, NC.\nQuestion: Tory Woodbury was born nine days after Independence Day. True, False, or Neither?", "doc_id": 787, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37496, 27613, 38868, 35031], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001.\nQuestion: Wanker Records is a bad record label True, False, or Neither? Neither\n###\nThe Benetton B188 was a Formula One racing car designed by Rory Byrne and raced by Benetton team in the 1988 Formula One season and in the first half of the 1989 Formula One season. Dating back to when the team started as Toleman in , the B188 was the first car produced by the team not to be powered by a turbocharged engine.\nQuestion: Rory Byrne only designed race cars True, False, or Neither? Neither\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: the narration in the book New Day used Jamacian venacular True, False, or Neither? True\n###\nAlbert Ernest Clifford \"Cliff\" Young, OAM (8 February 19222 November 2003) was an Australian potato farmer and athlete from Beech Forest, Victoria, best known for his unexpected win of the inaugural Sydney to Melbourne Ultramarathon in 1983 at 61 years of age.\nQuestion: Potato farming is more important than sports True, False, or Neither? Neither\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: Both parents and children love Amy Timberlake's books. True, False, or Neither?", "doc_id": 138, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40737, 31328, 20305, 29984], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robin Weaver is an English actress well known for playing the role of recurring character Pamela Cooper in the E4 sitcom \"The Inbetweeners\" and its feature-length films, \"The Inbetweeners Movie\" and \"The Inbetweeners 2\". She has also appeared in several TV commercials.\nQuestion: Robin Weaver acts with an english accent True, False, or Neither? Neither\n###\nSulakshana is an Indian actress born on August 1 ,1965 who has performed in Tamil, Telugu, Kannada and Malayalam films at the age of two and half in the movie Kaaviya Thalaivi as child Krishna in the name of Dolly . After that she acted in Thulabharam as child artist in Tamil,Telugu,Malayalam and Hindi (all version) in the name of Rajani .\nQuestion: She hates acting a lot True, False, or Neither? Neither\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style.\nQuestion: Real Fuerza A\u00e9rea is the top ranked mexican wrestling team. True, False, or Neither? Neither\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo.\nQuestion: Song'z U Can't Find did not include an artist named E040. True, False, or Neither? False\n###\nBen Marshall (born 8 June 1990) is a retired rugby union player from Ireland. He primarily played as a lock or in the back row. Marshall played for Irish provincial sides Leinster and Connacht in the Pro12, but was forced to retire in 2017 due to a concussion injury.\nQuestion: Marshall retired from the game while in his 20s. True, False, or Neither?", "doc_id": 101, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11651, 25973, 32066, 37154], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority.\nQuestion: Dickinson Municipal Airport is located in South Dakota. True, False, or Neither? False\n###\n\"Ellens dritter Gesang \" (\"Ellens Gesang III \", D. 839, Op. 52, No. 
6, 1825), in English: \"Ellen's Third Song\", was composed by Franz Schubert in 1825 as part of his Opus 52, a setting of seven songs from Walter Scott's popular epic poem \"The Lady of the Lake\", loosely translated into German.\nQuestion: Franz Schuber was bored ofThe Lady of the Lake True, False, or Neither? Neither\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km .\nQuestion: Jiaozhou Bay Bridge is located in a quiet part of China True, False, or Neither? Neither\n###\nDance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster.\nQuestion: Dance India Dance is similar to Dance Bangla Dance. True, False, or Neither? True\n###\nReturn to Paradise is a live album by Styx, released in 1997. It features songs from their successful reunion tour with Tommy Shaw, but without John Panozzo, who died in July 1996. It includes three new studio tracks, including \"Dear John\", which Shaw wrote as a tribute to Panozzo.\nQuestion: Return to Paradise was released before the death of John Panozzo. True, False, or Neither?", "doc_id": 824, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11603, 43388, 30402, 1373], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Eglinton Castle estate was situated at Irvine, on the outskirts of Kilwinning, North Ayrshire, Scotland (map reference NS 3227 42200) in the former district of Cunninghame. Eglinton Castle, was once home to the Montgomeries, Earls of Eglinton and chiefs of the Clan Montgomery. Eglinton Country Park now occupies part of the site.\nQuestion: Scotland contains many castles. True, False, or Neither? Neither\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions.\nQuestion: Project Gasbuggy was carried out more than 9000 days ago. True, False, or Neither? True\n###\nStuart is a city in, and the seat of, Martin County, Florida, United States. Located on Florida's Treasure Coast, Stuart is the largest of four incorporated municipalities in Martin County. The population was 15,593 in the 2010 census. It is part of the Port St. Lucie, Florida Metropolitan Statistical Area.\nQuestion: It has a higher population now True, False, or Neither? Neither\n###\nSherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 
4 in doubles on January 3, 1983.\nQuestion: Sherwood Stewart was born after 1940. True, False, or Neither? True\n###\nKhan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht.\nQuestion: \"Khan Kluay\" features a main character that doesn't live on land. True, False, or Neither?", "doc_id": 98, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25066, 29532, 8886, 364], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: Nathan MacKinnon wanted to become a businessman rather than play hockey professionally True, False, or Neither? Neither\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street.\nQuestion: Wellingore has been visited by george. True, False, or Neither? Neither\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: Atlanta was always the headquarters of Home Depot. True, False, or Neither? Neither\n###\nNo Devotion are a Welsh/American alternative rock band formed in 2014. They are composed of American vocalist Geoff Rickly (of the band Thursday) from New Jersey, and former band members of the Welsh band Lostprophets. The band formed in the wake of Lostprophets' dissolution in 2013.\nQuestion: Geoff Rickley was born in New Jersey True, False, or Neither? Neither\n###\nErik Jacobsen (born May 19, 1940) is an American record producer, song publisher and artist manager. He is best known for his work in the 1960s with Tim Hardin, The Lovin' Spoonful, The Charlatans, Sopwith Camel, and later with Norman Greenbaum and Chris Isaak. Though semi-retired, Jacobsen continues to manage many of his published songs and masters for various uses.\nQuestion: Erik Jacobsen worked in the 1950s. 
True, False, or Neither?", "doc_id": 670, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42835, 8928, 111, 11280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brian Wardle (born October 9, 1979) is an American college basketball coach and the current men's basketball coach at Bradley University. He was an assistant at Marquette from 2003\u20132005 and UW-Green Bay from 2005\u20132010. After the 2009-2010 season, Wardle was named head coach at UW-Green Bay. Upon his hiring, Wardle became the youngest head coach in NCAA Division I basketball.\nQuestion: Brian Wardle has only coached in NCAA Division I basketball. True, False, or Neither? False\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: Three Little Sisters was made to make people laugh. True, False, or Neither? True\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart.\nQuestion: the EP was his best work True, False, or Neither? Neither\n###\nJulia Goldani Telles (born March 18, 1995) is an American actress and ballet dancer. She is best known for her supporting role as Whitney Solloway on the Showtime original series \"The Affair\" and as Sasha Torres on the short-lived ABC Family series \"Bunheads\".\nQuestion: Julia Goldani Telles was born in Michigan on March 18, 1995. True, False, or Neither? Neither\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents.\nQuestion: Kapp Heights is in the Northern part of the United States. True, False, or Neither?", "doc_id": 607, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27091, 30316, 24524, 9250], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Yahoo Serious (born 27 July 1953), born Greg Gomez Pead (name-change by deed poll in 1980), is an Australian film actor, director, and score composer. 
He is best known for his 1988 comedy film \"Young Einstein\". He also created \"Reckless Kelly\" in 1993 and \"Mr. Accident\" in 2000. Serious writes, directs, produces, stars in, and has composed the scores for his movies.\nQuestion: Yahoo Serious is a man with too much time on his hands True, False, or Neither? Neither\n###\nThe Interview is a 1998 Australian thriller film from writer-director Craig Monahan, and is the first of two films directed by Monahan. Almost the entire film takes place in a police interrogation room, with some short flashback sequences, and the cast consists primarily of three key actors\u2014Hugo Weaving, Tony Martin, and Aaron Jeffery.\nQuestion: There are 4 actors in The Interview. True, False, or Neither? False\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Due to its large rhubarb-like leaves during the growing season it is mistaken for Rhubarb plant True, False, or Neither? Neither\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75.\nQuestion: Interstate 29 is under construction at this time. True, False, or Neither? Neither\n###\nThe Local Government (Northern Ireland) Act 1972 (1972 c. 9) was an Act of the Parliament of Northern Ireland that constituted district councils to administer the twenty-six local government districts created by the Local Government (Boundaries) Act (Northern Ireland) 1971, and abolished the existing local authorities in Northern Ireland.\nQuestion: Local authority was removed in favor of localized governance True, False, or Neither?", "doc_id": 372, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24561, 36924, 34387, 30931], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\".\nQuestion: Mixtapes is an Ohio/Detroit based pop punk band. True, False, or Neither? True\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label.\nQuestion: Bad Company has played with Led Zeppelin. True, False, or Neither? 
Neither\n###\nIn poker the term Triple Crown is used for winning a poker title on the three major poker tours: The World Series of Poker (WSOP), World Poker Tour (WPT) and up to 2016 the European Poker Tour (EPT). Since the EPT has been discontinued and rebranded as the PokerStars Championship, those wins are considered part of the crown.\nQuestion: EPT has not been discontinued True, False, or Neither? False\n###\nThe Tenpin Bowling Association of Wales (Welsh: \"Cymdeithas Bowlio Deg Cymru\" ) (TBAW) is the national governing body for tenpin bowling in Wales. It is a member of the F\u00e9d\u00e9ration Internationale des Quilleurs (English: International Bowling Federation ) and the European Tenpin Bowling Federation \u2013 the \"European Zone\" of the World Tenpin Bowling Association.\nQuestion: International bowling federation when abbreviated is IBF True, False, or Neither? True\n###\nHolly Weber (born September 20, 1984) is an American glamour model and actress. As a model, she has appeared in \"Maxim\", \"FHM\", \"Muscle & Fitness\", \"Glamour\", and as no. 66 on AskMen's Top 99 Most Desirable Women of 2009. She has made uncredited appearances in a number of movies and TV series.\nQuestion: Holly Weber was born more than 1000 weeks ago. True, False, or Neither?", "doc_id": 343, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25862, 36130, 18706, 33119], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character.\nQuestion: Jeffrey Reiner went on to direct Superman instead. True, False, or Neither? Neither\n###\nLamarck Island is a rocky island 250 m long, lying 300 m east of Petrel Island and 300 m north-east of Rostand Island in the G\u00e9ologie Archipelago, off the Ad\u00e9lie Coast of Antarctica. It was charted in 1951 by the French Antarctic Expedition and named by them after Jean-Baptiste Lamarck, the French naturalist.\nQuestion: Jean-Baptiste Lamarck was proud to have the island named after him. True, False, or Neither? Neither\n###\nThe Lonely Londoners is a 1956 novel by Tamil Trinidadian author Samuel Selvon. Its publication marked the first literary work focusing on poor, working-class blacks in the beat writer tradition following the enactment of the British Nationality Act 1948.\nQuestion: A Tamil Trinidadian has written a book before True, False, or Neither? True\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: The lead singer of Foals always plays guitar. True, False, or Neither? 
Neither\n###\nThe Hyundai Genesis Coup\u00e9 is a rear-wheel drive sports coupe from Hyundai Motor Company, released on October 13, 2008 for the Korean market. It is Hyundai's first rear-wheel drive sports coupe, and shares its basic platform with the Hyundai Genesis luxury sedan.\nQuestion: The Hyundai Genesis Coup\u00e9 was released on October 13, 2008 world wide True, False, or Neither?", "doc_id": 629, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26214, 31016, 23948, 12626], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster.\nQuestion: Mithun Chakraborty really likes being the Grandmaster True, False, or Neither? Neither\n###\nCharles Dera is an American pornographic actor, dancer, and model. He has performed in hundreds of heterosexual pornographic movies and is also \u2018The Veteran\u2019 in the male strip troupe \u2018Men of the Strip\u2019. In 2016, Charles Dera played the role of Donald Trump and Cherie Deville played the role of Hillary Clinton in the parody American elections for Brazzers.\nQuestion: Charles Dera has never had a gay experience True, False, or Neither? Neither\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team.\nQuestion: The Cleveland Browns were Superbowl champions while Malone played for them. True, False, or Neither? Neither\n###\nThe Perfect Gift is a 2009 spinoff of the 2005 Christian drama movie \"The Perfect Stranger\", and its first sequel, \"Another Perfect Stranger\". It stars Christina Fougnie, Amy Hess, Matt Wallace, and Jefferson Moore once again as Jesus Christ. It was filmed almost entirely in Kentucky, where the first two movies in the series were not.\nQuestion: The Perfect Stranger was released in 2005. True, False, or Neither? True\n###\nThe following details notable events from the year 2005 in Northern Ireland. Northern Ireland is a part of the United Kingdom in the north-east of the island of Ireland. It is variously described as a country, province or region of the UK, amongst other terms. Northern Ireland shares a border with the Republic of Ireland to the south and west.\nQuestion: Northern Ireland has a border with the United Kingdom. True, False, or Neither?", "doc_id": 514, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9605, 41025, 11875, 3614], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Coldwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep.\nQuestion: Tropical fish prefer cooler water temperatures, typically below 20 degrees Celsius. True, False, or Neither? False\n###\nLe roi malgr\u00e9 lui (\"King in Spite of Himself\" or \"The reluctant king\") is an op\u00e9ra-comique in three acts by Emmanuel Chabrier of 1887 with an original libretto by Emile de Najac and Paul Burani. The opera is revived occasionally, but has not yet found a place in repertory.\nQuestion: The opera is in French. True, False, or Neither? Neither\n###\nThe Mnet Asian Music Award for Best Collaboration is an award presented annually by CJ E&M Pictures (Mnet). It was first awarded at the 12th Mnet Asian Music Awards ceremony held in 2010; singers Ga-in & Jo Kwon won the award for their song \"We Fell in Love\", and it is given in honor for the artists with the most artistic achievement in collaboration performances in the music industry.\nQuestion: The Mnet Asian Music Award is a disliked show in South korea True, False, or Neither? Neither\n###\n\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: \"Bosch\" features events that take place in California. True, False, or Neither? True\n###\nThe \"Charleston\"-class amphibious cargo ships were a class of amphibious cargo ships in service with the United States Navy. These ships served in Amphibious Readiness Groups between 1968 and 1994. The ships were the last amphibious cargo ships built for the U.S. Navy, their role having been taken over by amphibious transport docks.\nQuestion: he \"Charleston\"-class was made more than 0 times True, False, or Neither?", "doc_id": 668, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9065, 36950, 39105, 43839], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central.\nQuestion: The Daily Show is also shown in Canada. True, False, or Neither? 
Neither\n###\nThomas \"Tommy\" Lucchese (pronounced ] ; born Gaetano Lucchese, December 1, 1899 \u2013 July 13, 1967) was a Sicilian-born American gangster and founding member of the Mafia in the United States, an offshoot of the \"Cosa Nostra\" in Sicily. From 1951 until 1967, he was the boss of the Lucchese crime family, one of the Five Families that dominates organized crime in New York City.\nQuestion: Thomas \"Tommy\" Lucchese (pronounced ] ; born Gaetano Lucchese, December 2, 1899 \u2013 July 13, 1967) was a Sicilian-born American gangster and founding member of the Mafia in the United States, an offshoot of the \"Cosa Nostra\" in Sicily. True, False, or Neither? False\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: The band added more group members which made them fall apart. True, False, or Neither? Neither\n###\nTight is the debut album by the American rock band Mindless Self Indulgence. The album was originally released on April 20, 1999 through Uppity Cracker Recording Group. After having been out of print for many years, the album was reissued as Tighter on April 26, 2011 through The End Records. The reissue features updated artwork and packaging, 12 previously unreleased tracks, and a bonus DVD.\nQuestion: The 1999 release features a dozen fewer songs than the 2011 release. True, False, or Neither? True\n###\nThe United Nations Peacekeepers Medal (Irish: \"An Bonn Chosant\u00f3ir\u00ed Sioch\u00e1na na N\u00e1isi\u00fan Aontaithe\" ) is awarded to those members of the Irish Defence Forces or Chaplaincy Service who have served overseas on a United Nation Mission or United Nations Mandated Mission.\nQuestion: All members of the Irish Defence Forces have received the United Nations Peacekeepers Medal. True, False, or Neither?", "doc_id": 720, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16650, 6174, 6087, 34002], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships.\nQuestion: Cashman Center only allows national events once per year. True, False, or Neither? Neither\n###\nThe Lei \u00c1urea (] ; English: Golden Law ), adopted on May 13, 1888, was the law that abolished slavery in Brazil. 
It was signed by Isabel, Princess Imperial of Brazil (1846\u20131921), an opponent of slavery, who acted as regent to Emperor Dom Pedro II, who was in Europe.\nQuestion: Isabel, Princess Imperial of Brazil did not like slavery after watching it in America True, False, or Neither? Neither\n###\nSuperior is a town in and the county seat of Mineral County, Montana, United States. The population was 812 at the 2010 census. Superior was named after its founders' hometown of Superior, Wisconsin in 1869. The post office was established in 1871 after Mineral County became the site of one of the largest gold strikes that helped settle the West.\nQuestion: The Post Office was important for Mineral County True, False, or Neither? Neither\n###\nCranborne Priory was a priory in Cranborne in Dorset, England. The priory church survives as Cranborne's parish church, the Church of St\u00a0Mary and St\u00a0Bartholomew, and is a Grade I listed building, with parts of the building dating back to the 12th century.\nQuestion: Cranborne is in Dorset England True, False, or Neither? True\n###\nDopamine is the fifth studio album by American rock band Third Eye Blind, released on June 16, 2015. It is the band's first studio album since 2009's \"Ursa Major.\" The album's first single, \"Everything Is Easy,\" was released on May 8, 2015, along with a cover version of the Beyonc\u00e9 song \"Mine.\" The album debuted at No. 13 on the Billboard 200, selling over 21,000 copies in its first week.\nQuestion: The album was released in the century after the century of the 1900's True, False, or Neither?", "doc_id": 526, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1718, 8189, 19139, 12503], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song.\nQuestion: The Cars have released at least four albums. True, False, or Neither? True\n###\nThe Bajo de la Carpa Formation is a geologic formation that outcrops in Patagonia, in the provinces of R\u00edo Negro and Neuqu\u00e9n, Argentina. It is the first of two formations belonging to the R\u00edo Colorado Subgroup within the Neuqu\u00e9n Group. Formerly that subgroup was treated as a formation, and the Bajo de la Carpa Formation was known as the Bajo de la Carpa Member.\nQuestion: Bajo de la Carpa Formation is the second of two formations belonging to the Rio Colorado Subgroup. True, False, or Neither? False\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\".\nQuestion: Crawling is a song by Linkin Park True, False, or Neither? 
True\n###\nThis is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists.\nQuestion: Some of the editorial cartoonists in the list received more pay in their careers than others. True, False, or Neither? Neither\n###\nRye St Antony School is an independent Roman Catholic boarding and day school for girls aged 3 to 18 and boys up to age 8 in Headington, Oxford, England. It is commonly abbreviated and referred to by both pupils and staff as 'Rye'. Rye is unique as a girls\u2019 independent Catholic school founded by lay women rather than by a religious order.\nQuestion: You must be Roman Catholic to attend Rye St Antony School. True, False, or Neither?", "doc_id": 872, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1678, 11329, 41015, 5025], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Have You Ever Met That Funny Reefer Man\", often known simply as \"The Reefer Man\", is a 1932 American jazz song composed by J. Russel Robinson, with lyrics by Andy Razaf. It was first recorded by Cab Calloway and his orchestra, with versions by others over the years, including by Harlan Lattimore, Murphy's Law and Big Bad Voodoo Daddy.\nQuestion: The song was released in 1931 True, False, or Neither? False\n###\nThe 2007 North Indian Ocean cyclone season was an event in the annual cycle of tropical cyclone formation. The North Indian Ocean cyclone season has no official bounds, but cyclones tend to form between April and December, with peaks in May and November. These dates conventionally delimit the period of each year when most tropical cyclones form in the northern Indian Ocean.\nQuestion: Cyclones exclusively form between april and december True, False, or Neither? False\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles.\nQuestion: The movie has a cult following True, False, or Neither? True\n###\nAnimation Domination was an animated programming block which originally aired from May 1, 2005, until September 21, 2014, on the Fox network. The block aired on Sunday evenings through the entirety of that night's primetime schedule (unless preempted, usually by sports telecasts).\nQuestion: Animation Domination aired in the evenings of the first day of the american week. True, False, or Neither? True\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. 
Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label.\nQuestion: Nightmare was created before Lance King reached 30 years of age. True, False, or Neither?", "doc_id": 449, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34266, 15138, 4905, 31612], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: She was forced to do pageants as a kid True, False, or Neither? Neither\n###\nUSFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico.\nQuestion: The USFC \"Fish Hawk\" was in operation for almost half a century. True, False, or Neither? True\n###\nXiaogan () is a prefecture-level city in east-central Hubei province, People's Republic of China, some 60 km northwest of the provincial capital of Wuhan. According to the 2010 census, its population totaled 4,814,542, of whom 908,266 lived in the built-up (\"or metro\") area of Xiaonan District.\nQuestion: Wuhan had a population of 4,814,543 in 2010. True, False, or Neither? Neither\n###\nBig Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism.\nQuestion: The population is now over 5 thousand True, False, or Neither? Neither\n###\n\"Stephen, Stephen\" is a song by American rock band, The Apples in Stereo. The song made its debut on December 20, 2006 on the Comedy Central program \"The Colbert Report\" where it was performed by Apples frontman, Robert Schneider during Episode number 193.\nQuestion: \"Stephen, Stephen\" was released the week before Christmas Day. True, False, or Neither?", "doc_id": 592, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7229, 28531, 34435, 1021], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Spring Fine Art Exhibition of Leningrad artists (Leningrad, 1954) (Russian: \"\"\u0412\u0435\u0441\u0435\u043d\u043d\u044f\u044f \u0432\u044b\u0441\u0442\u0430\u0432\u043a\u0430 \u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0435\u043d\u0438\u0439 \u043b\u0435\u043d\u0438\u043d\u0433\u0440\u0430\u0434\u0441\u043a\u0438\u0445 \u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a\u043e\u0432 1954 \u0433\u043e\u0434\u0430\"\" ) become one of the largest Soviet Art Exhibition of 1954 and one of the first Art Exhibitions after Stalin death. The Exhibition took place in Leningrad Union of Soviet Artists Exhibition Halls on Bolshaya Morskaya st. 38.\nQuestion: Spring Fine Art Exhibition of Leningrad was the most popular art show in Russia True, False, or Neither? Neither\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title.\nQuestion: The tournament are played in outdoors hard courts. True, False, or Neither? True\n###\nSusarion (Greek: \u03a3\u03bf\u03c5\u03c3\u03b1\u03c1\u03af\u03c9\u03bd) was an Archaic Greek comic poet, was a native of Tripodiscus in Megaris (see Megara) and is considered one of the originators of metrical comedy and, by others, he was considered the founder of Attic Comedy. Nothing of his work, however, survives except one iambic fragment (see below) and this is not from a comedy but instead seems to belong within the Iambus tradition.\nQuestion: The surviving works of Susarion are not of the sort from which he is renowned. True, False, or Neither? True\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced.\nQuestion: Splice is a sci fi horror movie about genetic engineering but it's also about a married couple True, False, or Neither? Neither\n###\n\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012.\nQuestion: \"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want! I Want!\". True, False, or Neither?", "doc_id": 665, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10969, 44247, 31920, 5668], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rock Star Supernova was a reality television-formed supergroup consisting of drummer Tommy Lee (M\u00f6tley Cr\u00fce), bassist Jason Newsted (ex-Metallica), guitarist Gilby Clarke (ex-Guns N' Roses) and singer Lukas Rossi. The band was formed during the second season of the Rock Star Reality TV series which was called .\nQuestion: Newsted used to be a part of Metallica True, False, or Neither? True\n###\nCloverdale Depot is a bus station and future intermodal station in Cloverdale, California. It is served by Amtrak Thruway and Sonoma County Transit buses. Additional service to Sonoma County Airport station is provided by Sonoma County Transit under contract by Sonoma\u2013Marin Area Rail Transit.\nQuestion: Sonoma\u2013Marin Area Rail Transit and Amtrak Thruway collaborate. True, False, or Neither? Neither\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949.\nQuestion: CUHK was established before 1964 True, False, or Neither? True\n###\nShehzad Sheikh or Shahzad Sheikh is a Pakistani film and television actor and model, known for playing the lead role in the 2015 film \"Karachi Se Lahore\". He also starred in the series \"Annie Ki Ayegi Baraat\", \"Mi Raqsam\", and \"Mere Hamrahi\", and a TV film \"Main Kukkoo Aur woh\". He is the son of well-known actor Javed Sheikh.\nQuestion: Shehzad Sheikh was born in Karachi. True, False, or Neither? Neither\n###\nThe 1998 Idaho Vandals football team represented the University of Idaho in the 1998 NCAA Division I-A football season. The Vandals, led by fourth-year head coach Chris Tormey, were members of the Big West Conference and played their home games at the Kibbie Dome, an indoor facility on campus in Moscow, Idaho.\nQuestion: The 1998 Idaho Vandals football team played games in 10 states. True, False, or Neither?", "doc_id": 284, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20270, 19198, 19666, 12484], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aubrey Morgan O'Day (born February 11, 1984) is an American singer-songwriter, member of the duo Dumblonde, reality television personality, and a former member of the girl group Danity Kane. Following discord among Danity Kane and with her mentor at the time, P. Diddy, O'Day was fired from the group in 2008, but reunited with them in 2013 before a second disbandment.\nQuestion: Aubrey Morgan O'Day is an Aquarius. True, False, or Neither? 
True\n###\nRoc-A-Fella Records Presents Teairra Mar\u00ed is the debut album by recording artist Teairra Mar\u00ed. It was released on August 2, 2005, by Roc-A-Fella Records. The album debuted in the top five selling 69,000 copies in the first week, eventually selling 248,000 units.\nQuestion: Rock-a-Fella records gross revenue increased dramatically after the release of Teairri Mari's successful first album. True, False, or Neither? Neither\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: Greatest Hits Volume 1 was not released in 1966 True, False, or Neither? True\n###\nPeter John \"P. J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\".\nQuestion: Carlesimo was a businessman in his spare time True, False, or Neither? Neither\n###\nLurianic Kabbalah is a school of kabbalah named after the Jewish rabbi who developed it: Isaac Luria (1534\u20131572; also known as the \"ARI'zal\", \"Ha'ARI\" or \"Ha'ARI Hakadosh\"). Lurianic Kabbalah gave a seminal new account of Kabbalistic thought that its followers synthesised with, and read into, the earlier Kabbalah of the Zohar that had disseminated in Medieval circles.\nQuestion: Isaac Luria was a scholar. True, False, or Neither?", "doc_id": 125, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37182, 8115, 26533, 23452], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: Orson Pratt published the book in the 1850s. True, False, or Neither? True\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles.\nQuestion: Dick Crealy won the woman's single title. True, False, or Neither? False\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Nick Harper is a songwriter. True, False, or Neither? 
True\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system.\nQuestion: ECSU also has a lot of frats True, False, or Neither? Neither\n###\nThe ostrich or common ostrich (\"Struthio camelus\") is either one or two species of large flightless birds native to Africa, the only living member(s) of the genus \"Struthio\", which is in the ratite family. In 2014, the Somali ostrich (\"Struthio molybdophanes\") was recognized as a distinct species.\nQuestion: The ostrich is a large bird that hates flying. True, False, or Neither?", "doc_id": 504, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29406, 24327, 16041, 6976], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Intervilles was a French comedy game show first broadcast in 1962. The show was aired since July 17, 1962 on RTF, then on ORTF. After stopping for 20 years, it reappeared on July 10, 1985 on FR3, then from July 4, 1986 to September 6, 1999 on TF1. France 3 aired the show since July 5, 2004, then France 3 from June 23, 2006 to August 26, 2009.\nQuestion: intervilles was a french drama show on games True, False, or Neither? False\n###\nLimnocharis flava (commonly known as yellow velvetleaf, sawah flower rush, sawah lettuce) is a species of aquatic flowering plant which is native to Mexico, Central America, South America, Cuba, Haiti and the Dominican Republic but widely naturalized in southern and southeastern Asia: India, Sri Lanka, Cambodia, Burma, Thailand, Vietnam, Indonesia, Malaysia and southern China (Guangdong, Yunnan).\nQuestion: Limnocharis flava is more popular in Asia then in Central and South America True, False, or Neither? Neither\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: The cultural evolution of Pakistan is portrayed in the book End of the Past. True, False, or Neither? True\n###\nDicksonia is a genus of tree ferns in the order Cyatheales. It is regarded as related to \"Cyathea\", but is considered more primitive, dating back at least to the Jurassic and Cretaceous periods. The fossil record includes stems, pinnules, and spores.\nQuestion: The Dicksonia genus includes a dozen different species within it True, False, or Neither? Neither\n###\nNannina de' Medici (14 February 1448 \u2013 14 May 1493), born Lucrezia de' Medici, was the second daughter of Piero di Cosimo de' Medici and Lucrezia Tornabuoni. She was thus the elder sister of Lorenzo de' Medici. She married Bernardo Rucellai. 
Her father's name was Piero, so she is sometimes known as Lucrezia di Piero de' Medici.\nQuestion: Nannina de' Medici did not have any brothers True, False, or Neither?", "doc_id": 657, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25740, 3115, 15029, 35758], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\".\nQuestion: Dostluk Spor Kul\u00fcb\u00fc plans to celebrate its 50th anniversary in 2023 by adding a third color. True, False, or Neither? Neither\n###\nSherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983.\nQuestion: Stewart has been ranked in both singles and doubles in the ATP rankings. True, False, or Neither? True\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress.\nQuestion: Tsewang Rigzin was re-elected to serve through August 2013. True, False, or Neither? True\n###\n\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough.\nQuestion: Merry Christmas, Charlie Manson! aired over 2 years ago True, False, or Neither? True\n###\nThe 2012 SEC Women\u2019s Basketball Tournament took place at the Bridgestone Arena in Nashville, Tennessee from March 1-4, 2012. The Tennessee Lady Volunteers won the tournament and received the SEC\u2019s automatic bid to the 2012 NCAA Women\u2019s Basketball Tournament by defeating the LSU Lady Tigers 70-58 in the championship game.\nQuestion: The Lady Volunteers won the NCAA tournament only thirteen years after the penultimate year of the twentieth century. True, False, or Neither?", "doc_id": 706, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23697, 2615, 20669, 33915], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: The final of the 1983 Prudential Cup was the most exciting game of the century. True, False, or Neither? Neither\n###\nJefferies LLC is an American global investment bank and institutional securities firm headquartered in New York. The firm provides clients with capital markets and financial advisory services, institutional brokerage, securities research, and asset management. This includes mergers and acquisitions, restructuring, and other financial advisory services.\nQuestion: Institutional Brokerage is a hard field to get into when developing a company. True, False, or Neither? Neither\n###\n\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel.\nQuestion: Bryan Ferry is a back up singer. True, False, or Neither? False\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\".\nQuestion: Paint It Black was released by The Rolling Stones in 1966 True, False, or Neither? True\n###\n\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione.\nQuestion: The Flirts's members live in New York. True, False, or Neither?", "doc_id": 479, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8532, 24214, 5654, 43723], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. 
He played college football at the University of Pittsburgh.\nQuestion: Although Stanley Anthony Woods has played for more than one professional football team, he played for the Washington Redskins the longest. True, False, or Neither? Neither\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy.\nQuestion: The Toronto FC came close to getting into the playoffs but knocked out a few games to the qualifier. True, False, or Neither? Neither\n###\n\"Klaatu barada nikto\" is a phrase that originated in the 1951 science fiction film \"The Day the Earth Stood Still\". The humanoid alien protagonist of the film, Klaatu (Michael Rennie), instructs Helen Benson (Patricia Neal) that if any harm befalls him, she must say the phrase to the robot Gort (Lockard Martin). In response Gort relents from destroying the Earth and resurrects Klaatu from death.\nQuestion: Most of the actors from \"The Day the Earth Stood Still\" are still alive today. True, False, or Neither? Neither\n###\nA madrigal is a secular vocal music composition of the Renaissance and early Baroque eras. Traditionally, polyphonic madrigals are unaccompanied; the number of voices varies from two to eight, and most frequently from three to six. It is quite distinct from the Italian Trecento madrigal of the late 13th and 14th centuries, with which it shares only the name.\nQuestion: The number of voices can be 5 True, False, or Neither? True\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award.\nQuestion: The Panama Canal was completed before David McCullough was born. True, False, or Neither?", "doc_id": 400, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2387, 4590, 9486, 11388], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Big Bad Voodoo Daddy is a contemporary swing revival band from Southern California. Their notable singles include \"Go Daddy-O\", \"You & Me & the Bottle Makes 3 Tonight (Baby)\", and \"Mr. Pinstripe Suit\". The band played at the Super Bowl XXXIII half-time show in 1999.\nQuestion: Big Voodoo Daddy played at superbowl True, False, or Neither? True\n###\nThe Australia national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the South Africa national cricket team. The tournament was won by England. Australia were captained by Syd Gregory.\nQuestion: The Australia national cricket team won the 1912 Triangular Tournament True, False, or Neither? 
False\n###\nMark Donovan (born 12 October 1968) is a Welsh character actor best known for his roles in productions such as \"Shaun of the Dead\", \"Black Books\", \"In Bruges\", and \"Murder Investigation Team\". He also played a brief scene of Hamlet in an episode of the David Renwick comedy-drama, \"Love Soup\". His stage roles include Gozark in \"Singin' in the Rain\" and Inspector Clay in \"Plan 9 from Outer Space\".\nQuestion: Mark Donovan is German character that starred in \"Shaun of the Dead\". True, False, or Neither? False\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\".\nQuestion: Antonio Lewis is from North America. True, False, or Neither? True\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr.\nQuestion: Things Happen at Night was released more than 1000 days ago. True, False, or Neither?", "doc_id": 239, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8186, 2632, 44584, 18589], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017.\nQuestion: Happy Valley, Australia provides hard courts for tennis tournaments, which it regularly hosts. True, False, or Neither? Neither\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006.\nQuestion: \"Live Free or Die\" was the final episode of the six season of \"The Sopranos\". True, False, or Neither? Neither\n###\nRafael Cede\u00f1o Hern\u00e1ndez is an imprisoned Mexican drug trafficker who was a high-level leader of La Familia Michoacana, a drug cartel based in the Mexican state of Michoac\u00e1n. He was the successor of Alberto Espinoza Barr\u00f3n, a drug trafficker who was arrested on 31 December 2008 by the Mexican authorities.\nQuestion: Rafael Cede\u00f1o Hern\u00e1ndez was arrested by Mexican authorities True, False, or Neither? Neither\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978.\nQuestion: Gary Sutherland no longer plays professional baseball True, False, or Neither? 
True\n###\n2 Cool 2 Be 4gotten is a 2016 Filipino coming-of-age drama film directed by Petersen Vargas in his feature-length directorial debut and written by Jason Paul Laxamana. The film stars Khalil Ramos, Ethan Salvador and Jameson Blake. It depicts the mysterious coming-of-age tale of Felix after he met half-American Snyder brothers, Magnus and Maxim.\nQuestion: the most well known actor in the film is natalie portman True, False, or Neither?", "doc_id": 488, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37274, 10615, 32915, 19172], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Far from the Madding Crowd is a 2015 British romantic drama film directed by Thomas Vinterberg and starring Carey Mulligan, Matthias Schoenaerts, Michael Sheen, Tom Sturridge and Juno Temple. It is an adaptation of the 1874 novel of the same name by Thomas Hardy, the fourth time this novel has been filmed.\nQuestion: Far from the Madding Crowd is a thriller True, False, or Neither? False\n###\nThe Battle of Rio de Janeiro was a battle in 1558 on the French town at Rio de Janeiro, called Henriville. The Portuguese, though in far smaller numbers, defeated the French and made them flee to the jungle. The French town was then burnt by Mem de S\u00e1, the Portuguese governor.\nQuestion: Hennville is where a fight took place between the French and Portuguese True, False, or Neither? True\n###\nFabio Ochoa V\u00e1squez (born May 2, 1957) is a former leading member of the Medell\u00edn cocaine trafficking cartel, along with his older brothers Juan David and Jorge Luis. His role briefly made him a billionaire. After serving a brief prison term in Colombia, he was arrested and extradited to the US in 1999 and is serving a 30 year term in US federal prison.\nQuestion: Fabio is no longer a billionaire. True, False, or Neither? True\n###\nHipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011.\nQuestion: Most consumer-oriented companies headquartered in San Francisco, California are co-founded by Goldstein and Huffman in 2010. True, False, or Neither? Neither\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain.\nQuestion: The cable car takes you up and down the mountain. 
True, False, or Neither?", "doc_id": 193, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1633, 20658, 10234, 26104], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Paul Annacone and Christo van Rensburg were the defending champions. Annacone participated with John Fitzgerald, and lost in the quarterfinals to Scott Davis and David Pate, while Van Rensburg played with Kevin Curren, and lost in the semifinals to Grant Connell and Glenn Michibata.
Rick Leach and Jim Pugh defeated Connell and Michibata 3\u20136, 6\u20134, 6\u20132, in the final.\nQuestion: Leach was defeated by the current champions Annacone and Rensburg. True, False, or Neither? False\n###\nJosef Matthias Hauer (March 19, 1883 \u2013 September 22, 1959) was an Austrian composer and music theorist. He is most famous for developing, independent of and a year or two before Arnold Schoenberg, a method for composing with all 12 notes of the chromatic scale. Hauer was also an important early theorist of twelve-tone music and composition.\nQuestion: Josef Matthias Hauer died in September 22, 1959 in Austria True, False, or Neither? Neither\n###\nThe second series of the British television sketch comedy premiered on BBC Two on 21 July 2005. This series included six episodes with the concluding episode broadcast on 25 August 2005. A Christmas Special followed the second series and was screened on BBC Two on 20 December 2005.\nQuestion: On December 20, 2005, a Christmas Special followed the second series. True, False, or Neither? True\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars.\nQuestion: X X X X is brewed in Japan. True, False, or Neither? False\n###\nLike the Roman: The Life of Enoch Powell is a 1998 book by the English writer Simon Heffer. It is a biography of the politician Enoch Powell. The title is taken from Powell's 1968 Rivers of Blood speech when Powell quoted Virgil's \"Aeneid\": \"As I look ahead, I am filled with foreboding; like the Roman, I seem to see the River Tiber foaming with much blood\".\nQuestion: Like the Roman: The Life of Enoch Powell is based on real events. True, False, or Neither?", "doc_id": 593, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26015, 16230, 24609, 20659], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\").\nQuestion: When Sanation was created there was someone other than J\u00f3zef Pi\u0142sudski in power True, False, or Neither? True\n###\nRoss Dawson (born 1962) is an Australian author, futurist, entrepreneur and former stockbroker. Best known for his 2002 book 'Living Networks', Dawson founded the futures think tank Future Exploration Network and consults on digital futures to various big organisations such as Ernst & Young, Macquarie Bank, Microsoft and News Corp.\nQuestion: Ross Dawson bought stocks. True, False, or Neither? 
True\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif.\nQuestion: Ihsan Abdel Quddous is also of Lebanese origin. True, False, or Neither? Neither\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry.\nQuestion: Obama was not the POTUS when Chuck Berry's debut album was released. True, False, or Neither? True\n###\nPhacelia coerulea is a species of phacelia known by the common name skyblue phacelia. It is native to the California and the Southwestern United States and northern Mexico, where it grows in desert and plateau habitat types, such as scrub and woodland.\nQuestion: Phacelia cant grow during summer. True, False, or Neither?", "doc_id": 69, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14218, 24716, 2671, 30494], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frederick Ferdinand of Anhalt-K\u00f6then (25 June 1769, Pless \u2013 23 August 1830, K\u00f6then) was a German prince, Ascanian ruler of the principality of Anhalt-Pless and, from 1818, of the duchy of Anhalt-K\u00f6then. He was the second son of Frederick Erdmann, Prince of Anhalt-Pless, and his wife, Louise Ferdinande, daughter of Henry Ernest, Count of Stolberg-Wernigerode.\nQuestion: Frederick Ferdinand of Anhalt-K\u00f6then was born the same day as his wife True, False, or Neither? Neither\n###\nKnightriders, also known as George A. Romero's Knightriders, is a 1981 American drama film written and directed by George A. Romero and starring Ed Harris, Gary Lahti, Tom Savini, Amy Ingersoll, Patricia Tallman and Ken Foree. It was filmed entirely on location in the Pittsburgh metro area, with major scenes in suburban Fawn Township and Natrona.\nQuestion: Knightriders only took 2 weeks to film True, False, or Neither? Neither\n###\nIdris Sultan (born January 1993) is a Tanzanian Actor and comedian, actor and radio host who won the Big Brother Africa-Hotshots in 2014. He hosts the biggest comedy news show called SIO HABARI, he also hosts a radio show called MWB(Mji wa burudani) on ChoiceFm Tanzania.\nQuestion: The radio show,MWB, will soon no longer be on the air for people to enjoy. True, False, or Neither? Neither\n###\nThe Chattenden and Upnor Railway (later known as the Lodge Hill and Upnor Railway) was a narrow gauge railway serving the military barracks and depot at Upnor and associated munitions and training depots. 
It was built in 1873 as a gauge railway, converted to narrow gauge around 1885, and continued in use until 1961.\nQuestion: The Chattenden and Upnor railway is still in use today. True, False, or Neither? False\n###\nVictor H. Halligan (November 22, 1892 \u2013 March 10, 1973) was an American football player. He played for the University of Nebraska from 1912 to 1914 and was the first All-American football player to be selected from the Nebraska Cornhuskers football team.\nQuestion: Halligan played football until he was thirty. True, False, or Neither?", "doc_id": 113, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21772, 20393, 14511, 37817], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Semonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781.\nQuestion: The base is located in Lesotho True, False, or Neither? True\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old.\nQuestion: John Cameron Urschel played for penn state True, False, or Neither? True\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: Gabriele Muccino also had a small character role in The Pursuit of Happyness. True, False, or Neither? Neither\n###\nDiaspora studies is an academic field established in the late 20th century to study dispersed ethnic populations, which are often termed diaspora peoples. The usage of the term diaspora carries the connotation of forced resettlement, due to expulsion, coercion, slavery, racism, or war, especially nationalist conflicts.\nQuestion: Diaspora studies is a fascinating topic of discussion. True, False, or Neither? Neither\n###\nDaniel James Shellabarger (known as Daniel Suelo, or simply Suelo, and The Man Who Quit Money, born 1961) is an American simple living adherent who stopped using money in the autumn of 2000. 
He was born in Arvada, Colorado, a suburb of Denver, and currently lives part-time in a cave near Moab, Utah when he is not wandering the country.\nQuestion: Daniel James Shellabarger was born in a different state than the one he currently lives in True, False, or Neither?", "doc_id": 624, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40897, 11714, 28455, 5576], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Les Soir\u00e9es de Nazelles, FP 84, is a set of variations for piano written by the French composer Francis Poulenc. During the evenings, the composer used to sit at the piano and improvise \"portraits\" of his friends, all based on a given theme. The work was begun in 1930, and completed at Noizay on October 1, 1936. At the beginning of the score, it reads:\nQuestion: The work was begun in 1936 True, False, or Neither? False\n###\nTake Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\".\nQuestion: The album Take Two had more than three hits. True, False, or Neither? Neither\n###\nSonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m).\nQuestion: There is a community who live along local roads northwest of the town of Broadus in along local roads northwest of the town of Broadus. This Montana community, is not governed by a local municipal corporation and it is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. True, False, or Neither? True\n###\nVictor Ebubedike (born February 1, 1966), also known as Victor X Ebubedike and Victor Muhammad, is an English former American football player who played as a running back for London Ravens, from 1983-1990, then onto the NFL Europe's London Monarchs from 1991\u20131992 and 1995-1998.\nQuestion: Europe had a popular NFL team known as the London Ravens. True, False, or Neither? Neither\n###\nStainer & Bell Limited is a British publisher of classical sheet music and books, based in London. Stainer, founded in 1907, publish the works of a number of significant twentieth-century composers, including Charles Villiers Stanford, Gustav Holst, Ralph Vaughan Williams, and Herbert Howells. They also publish a number of earlier composers, including Henry VIII, William Byrd, and Henry Purcell.\nQuestion: Stainer & Bell Limited was founded after the War of 1812. 
True, False, or Neither?", "doc_id": 423, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30217, 27821, 28524, 23469], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ahmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: Ahmad Kemal Idri was not the Indonesian National Revolution, but was the Yogyakarta itself after continued resistance True, False, or Neither? False\n###\nEl\u00ednr\u00f3s L\u00edndal is an entrepreneur in Fashion design. She established ELLA fashion label in 2008, one of the first Slow Fashion brands in the world. Elinr\u00f3s was the brands creative director and CEO. ELLA launched] it\u00b4s first fashion line in April 2011.\nQuestion: Ella first fashion line was launched in April 26, 2011 True, False, or Neither? Neither\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: There were no acoustic songs on the album \"Smithereens.\" True, False, or Neither? False\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: Ashcroft was built in 1861. True, False, or Neither? False\n###\nAm\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics.\nQuestion: Amelie Simone Mauresmo is forty years old at the time of this statement. True, False, or Neither?", "doc_id": 41, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18084, 12124, 28219, 4652], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nina Ellen \u00d8deg\u00e5rd (born 2 November 1979 in Stavanger) is a Norwegian actress. She made her stage debut at Rogaland Teater in 2002 in a play by Brian Friel. Among her films are \"Play\" from 2003 and \"Alt for Egil\" from 2004. 
Her role as \"Josie\" in O'Neill's play \"M\u00e5ne for livets stebarn\" in 2005 earned her the Hedda Award for best stage performance.\nQuestion: Nina Ellen \u00d8deg\u00e5rd made her debut in the same century that she was born. True, False, or Neither? False\n###\nThe Mission Viejo Vigilantes were a minor league baseball team located in Mission Viejo, California. The team played in the independent Western Baseball League, and was not affiliated with any Major League Baseball team. Their home stadium was Mission Viejo Stadium near Saddleback College.\nQuestion: The Vigilantes' home town and home stadium share the same name True, False, or Neither? True\n###\nTurnagain, also called Buru Island, is an island of the \"Western Islands\" region of the Torres Strait Islands archipelago, located in the northern section of Torres Strait, Queensland, Australia. Turnagain is located within the Torres Strait Island Region Local government area.\nQuestion: Turnagain is located within the Region Local government area. True, False, or Neither? True\n###\nSophie Charlene Akland Monk (born 14 December 1979) is an English-born Australian singer, songwriter, actress, model and radio personality. Monk was a member of the girl group Bardot and released a solo album called \"Calendar Girl\" (2003). She has appeared in films such as \"Date Movie\" (2006), \"Click\" (2006), and \"Spring Breakdown\" (2009).\nQuestion: Sophie was born in 1989. True, False, or Neither? False\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan.\nQuestion: Mike Bossy the Scoring Machine is a pinball machine, it is a one of a kind machine which is pretty rare, since they only made the prototype True, False, or Neither?", "doc_id": 679, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12042, 11563, 17877, 2156], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "This is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists.\nQuestion: An editorial cartoonist makes shapes. True, False, or Neither? True\n###\nAlec Holowka is a Canadian indie game developer, co-founder of an independent game companies Bit Blot, and Infinite Ammo, where he works as lead programmer, musician and game designer. He collaborated with Derek Yu to create the award-winning game \"Aquaria\" and freeware game \"I'm O.K - A Murder Simulator\". He collaborated with Scott Benson to create \"Night in the Woods\".\nQuestion: Derek Yu collaborated with him to write Night in the Woods True, False, or Neither? 
False\n###\nHistorical period drama is a film genre in which stories are based on historical events and famous persons. Some historical dramas attempt to accurately portray a historical event or biography, to the degree that the available historical research will allow. Other historical dramas are fictionalised tales that are based on an actual person and their deeds.\nQuestion: Historical period dramas are hard to make accurate True, False, or Neither? Neither\n###\nA Qualified Person Responsible for Pharmacovigilance, or QPPV, is an individual named by a pharmaceutical company as the main person responsible for ensuring that the company (the product's Marketing Authorisation Holder or MAH) meets its legal obligations for the monitoring of the safety of a medicinal product on the market.\nQuestion: Medications go through a four step process with the QPPV regulation. True, False, or Neither? Neither\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums).\nQuestion: Lars Iversen is a European composer and producer. True, False, or Neither?", "doc_id": 845, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44318, 25028, 43614, 41835], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center.\nQuestion: Tango is a dance inspired by classical music True, False, or Neither? Neither\n###\nArt History is a 2011 American drama film directed by Joe Swanberg, written by Swanberg, Josephine Decker, and Kent Osborne. It stars Decker, Swanberg, Osborne, Adam Wingard, and Kris Swanberg as filmmakers whose lives are complicated by a graphic sex scene in an arthouse film.\nQuestion: There are two actors with the last name Swanberg in Art History. True, False, or Neither? True\n###\nBride of Chucky is a 1998 American supernatural comedy slasher film, the fourth installment of the \"Child's Play\" franchise and sequel to 1991's \"Child's Play 3\". The film is written by Don Mancini and directed by Ronny Yu, and stars Jennifer Tilly (who plays and voices the title character Tiffany) and Brad Dourif (who voices Chucky), as well as John Ritter, Katherine Heigl and Nick Stabile.\nQuestion: Bride of Chucky was a sequel in the Chucky series. True, False, or Neither? True\n###\n\"That's the Beat of a Heart\" is a song recorded by American country music duo The Warren Brothers featuring Sara Evans. It was released in March 2000 as the first single from their album \"King of Nothing\". It was also included on the soundtrack to the 2000 film \"Where the Heart Is\". 
The song was written by Tena Clark and Tim Heintz.\nQuestion: The Warren Brothers are not siblings True, False, or Neither? Neither\n###\nA political decoy is a person employed to impersonate a politician, to draw attention away from the real person or to take risks on that person's behalf. This can also apply to military figures, or civilians impersonated for political or espionage purposes.\nQuestion: People who work in government affairs might hire a political decoy. True, False, or Neither?", "doc_id": 721, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33022, 31347, 17293, 21760], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\".\nQuestion: The Call features music True, False, or Neither? True\n###\nThe Linkou Power Plant () is a coal-fired power plant in Linkou District, New Taipei, Taiwan. With the previous total installed capacity of 600 MW, the power plant used to be the smallest coal-fired power plant in Taiwan. The power plant is currently undergoing retrofitting to increase its installed generation capacity to 2.4 GW.\nQuestion: The Linkou Power Plant will be more efficient after the retrofitting. True, False, or Neither? Neither\n###\nCharles Rashad Jamaal Brown (born April 10, 1987) is a former American football offensive tackle in the National Football League (NFL) for the New Orleans Saints, New York Giants and Dallas Cowboys. He played college football at USC, where he won the Morris Trophy, recognizing the best offensive and defensive linemen on the West Coast in 2009.\nQuestion: Charles Rashad Jamaal Brown was born in the 80s. True, False, or Neither? True\n###\nHector and the Search for Happiness is a 2014 German-British-Canadian comedy-drama film directed by Peter Chelsom and co-written with Tinker Lindsay and Maria von Heland, based on Fran\u00e7ois Lelord's novel of the same name. The film stars Simon Pegg and Rosamund Pike.\nQuestion: Hector and the Search for Happiness has no story. True, False, or Neither? False\n###\nYou Can Be Anyone This Time Around is an album by Timothy Leary released in 1970. The disc features three \"raps\" by Leary backed with psychedelic music. The purpose of the album was to raise funds for Leary's political candidacy for Governor of California.\nQuestion: Leary just released a new album. True, False, or Neither?", "doc_id": 717, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3437, 21360, 19040, 20896], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Many science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list.\nQuestion: Many science fiction works have been set in the 21st century (years 2100 to 2100) True, False, or Neither? False\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017.\nQuestion: Justin Tinucci was injured while filming the Devi's Whisper True, False, or Neither? Neither\n###\nPrincess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld (Coburg, 23 September 1781 \u2013 Elfenau, near Bern, Switzerland, 15 August 1860), also known as Grand Duchess Anna Feodorovna of Russia (Russian: \u0410\u043d\u043d\u0430 \u0424\u0451\u0434\u043e\u0440\u043e\u0432\u043d\u0430 ), was a German princess of the ducal house of Saxe-Coburg-Saalfeld (after 1826, the house of Saxe-Coburg-Gotha) who became the wife of Grand Duke Konstantin Pavlovich of Russia.\nQuestion: Princess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld only loved her husband Grand Duke Konstantin Pavlovich of Russia. True, False, or Neither? Neither\n###\nThe Global Food Security Act of 2016 (Pub.L. 114\u2013195 ), is a law introduced on March 24, 2015 in the 114th Congress by Representative Christopher Henry \"Chris\" Smith (New Jersey-R) and on May 7, 2015 by Senator Robert Patrick \"Bob\" Casey Jr. (Pennsylvania-D), and signed by President Barack Obama on July 20, 2016.\nQuestion: The Global Food Security Act of 2016 was later dismissed by President Trump. True, False, or Neither? Neither\n###\nThe Volkswagen Citi Golf was a car produced by Volkswagen in South Africa from 1984 until 21 August 2009. It was a face-lifted version of the original Volkswagen Golf Mk1 hatchback, which ceased production in Germany in 1983. The car was produced only with right-hand drive.\nQuestion: The Volkswagen Citi Golf was designed to drive on the left side of the road True, False, or Neither?", "doc_id": 30, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18027, 15918, 37469, 39423], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gun Bow (1960 \u2013 December 1979) was an American Thoroughbred racehorse. He was one of America's leading older male racehorses in 1964 and 1965 and was later inducted into the Hall of Fame. 
Gun Bow was noted for his rivalry with five-time American Horse of the Year Kelso.\nQuestion: Gun Bow was noted for his rivalry with six-time American Horse of the Year Kelso. True, False, or Neither? False\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Chris McKendry speaks multiple languages True, False, or Neither? Neither\n###\nMakri (Greek: \u039c\u03ac\u03ba\u03c1\u03b7) is a village and a municipal district of the city of Alexandroupoli, Evros regional unit, Greece. In 2011 its population was 924 for the village, and 1,919 for the municipal district. It is situated on the Aegean Sea coast, 12\u00a0km west of downtown Alexandroupoli. Makri has an exit on the Egnatia Odos motorway, that passes north of the village.\nQuestion: It is west of downtown Alexandroupoli. True, False, or Neither? True\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Sheree Victoria Murphy lives in an apartment. True, False, or Neither? Neither\n###\nThe following is a list of ongoing civil unrest or ongoing protests that are taking place around the world. This list is for the sole purpose of identifying present-day civil unrest and protests and the death toll and number of protesters associated with each event.\nQuestion: This is a bar graph. True, False, or Neither?", "doc_id": 630, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28957, 8279, 30235, 36053], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The NME Awards 2017 were held in London, England, on 15 February 2017, at the Brixton Academy and was hosted by English comedian Huw Stephens. Beyonc\u00e9 led the nominations with five, followed by The 1975, Bastille, Christine And The Queens and Skepta with four nominations each.\nQuestion: The NME was held in London at night. True, False, or Neither? Neither\n###\nThe Communaut\u00e9 de communes des Trois Rivi\u00e8res (before January 2017: \"Communaut\u00e9 de communes du Pays des Trois Rivi\u00e8res\") is a federation of municipalities (\"communaut\u00e9 de communes\") in the Aisne \"d\u00e9partement\" and in the Hauts-de-France \"region\" of France.\nQuestion: Aisne is the smallest d\u00e9partement in France True, False, or Neither? Neither\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. 
It was completed in 1984, and is 1412 m in length.\nQuestion: There are currently no plans to shorten the The San Nicolao Tunnel True, False, or Neither? Neither\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. He is currently the quarterback coach at Garinger High School in Charlotte, NC.\nQuestion: Tory Woodburn will die in Charlotte, NC. True, False, or Neither? Neither\n###\nPixote: a Lei do Mais Fraco (] , lit. \"Pixote (small child): The Law of the Weak\") is a 1980 Brazilian drama film directed by H\u00e9ctor Babenco. The screenplay was written by Babenco and Jorge Dur\u00e1n, based on the book \"A Inf\u00e2ncia dos Mortos\" (\"The Childhood of the Dead Ones\") by Jos\u00e9 Louzeiro.\nQuestion: Jos\u00e9 Louzeiro was inspired to write \"A Inf\u00e2ncia dos Mortos\" after watching \"Pixote: a Lei do Mais Fraco\". True, False, or Neither?", "doc_id": 537, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36182, 43027, 17633, 29931], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Master of Revenge () is a 2016 South Korean television series starring Chun Jung-myung, Jo Jae-hyun, Jeong Yoo-mi, Lee Sang-yeob and Gong Seung-yeon. It aired on KBS2 from April 27, 2016 to June 30, 2016 on Wednesdays and Thursdays at 21:55 for 20 episodes.\nQuestion: The Master of Revenge airs on KBS2 True, False, or Neither? True\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label.\nQuestion: Bad Company was recorded at Headley Grange with Ronnie Lane's Mobile Studio in month after Halloween in the year that equals 2073 minus 100. True, False, or Neither? True\n###\nLive at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. It has only been made available at live Van Morrison concerts and at the Van Morrison Official website.\nQuestion: The Austin City Limits Festival concert took place 6 years before 2012. True, False, or Neither? True\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\".\nQuestion: Selvaraj died in January True, False, or Neither? True\n###\nDennis Princewell Stehr (born 15 May 1984), better known by his stage name Mr Probz, is a Dutch singer, musician and actor. In 2013, he released the song \"Waves\", which was remixed in 2014 by Robin Schulz, becoming an international hit. 
He has released one album and featured in the film Bolletjes Blues.\nQuestion: By the time Mr Probz was 30 years old, he had an international hit. True, False, or Neither?", "doc_id": 602, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35555, 4593, 28382, 26502], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Castaways Hotel and Casino, formerly the Showboat Hotel and Casino was a hotel and casino located at the north end of the Boulder Strip in Las Vegas, Nevada. The hotel consisted of a 19 story tower containing 445 rooms, a casino and an adjacent RV park. The Castaways hotel was demolished on January 11, 2006 to make way for an unknown project.\nQuestion: The project that replaces the The Castaways Hotel and Casino will be better True, False, or Neither? Neither\n###\nThe Fondation Prince Pierre was established by Prince Rainier III of Monaco in February 1966 to promote culture and the arts through the creation and the awarding of prizes. Prince Rainier III created the foundation in tribute to his father, Pierre de Polignac a great patron of the arts.\nQuestion: The Fondation Prince Pierre promotes culture and the arts. True, False, or Neither? True\n###\nBrenda Fricker (born 17 February 1945) is an Irish actress of theatre, film and television. She has appeared in more than 30 films and television roles. In 1989, she became the first Irish actress to win an Oscar, earning the Academy Award for Best Supporting Actress for \"My Left Foot\". As of July 2014, she has tentatively retired from acting.\nQuestion: Fricker was the first female Irish actress to win an Oscar. True, False, or Neither? True\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single.\nQuestion: \"Come Back in One Piece\" was a major urban contemporary hit. True, False, or Neither? False\n###\nJohan Niemann (born 26 June 1977) is best known for being the current bass player for Evergrey and co-founding the band Mind's Eye, for membership of Swedish heavy metal band Therion and as a member of the Scandinavian theatrical metal band Evil Masquerade. He is also currently live guitarist for Tiamat. He is a brother of Kristian Niemann.\nQuestion: Johan Niemann's brother is a musician True, False, or Neither?", "doc_id": 107, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19496, 37602, 15998, 22158], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"That's the Beat of a Heart\" is a song recorded by American country music duo The Warren Brothers featuring Sara Evans. It was released in March 2000 as the first single from their album \"King of Nothing\". It was also included on the soundtrack to the 2000 film \"Where the Heart Is\". The song was written by Tena Clark and Tim Heintz.\nQuestion: That's the Beat of a Heart has been in a movie. True, False, or Neither? True\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions.\nQuestion: Project Gasbuggy was created by Obama. True, False, or Neither? False\n###\nA Merry Friggin' Christmas is a 2014 American black comedy film directed by Tristram Shapeero and written by Phil Johnston. The film stars an ensemble cast featuring Joel McHale, Lauren Graham, Clark Duke, Oliver Platt, Wendi McLendon-Covey, Tim Heidecker, Candice Bergen and Robin Williams. The film was released on November 7, 2014, by Phase 4 Films.\nQuestion: A Merry Friggin' Christmas cast included Joel Mchale. True, False, or Neither? True\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race.\nQuestion: He used to be a woman True, False, or Neither? Neither\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\".\nQuestion: They are a popuylar writing duo. True, False, or Neither?", "doc_id": 104, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7566, 43144, 2143, 19486], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frederick William Chesson (1833\u20131888) was an English journalist and prominent anti-slavery campaigner. He was active in the London Aborigines' Protection Society and Emancipation Committee, and met Harriet Ann Jacobs when she was in England in 1858; and was a vocal supporter of the Union side during the American Civil War.\nQuestion: Frederick William Chesson never met Harriet Ann Jacobs True, False, or Neither? False\n###\nWings over America is a triple live album by Wings, released in December 1976. The album was recorded during American leg of the band's acclaimed 1975\u201376 Wings Over the World tour. 
It peaked at number 8 on the UK Albums Chart and reached number 1 on the US \"Billboard\" Top LPs & Tape chart.\nQuestion: Wings over America was released over 3000 days ago. True, False, or Neither? True\n###\nKate Saunders (born 4 May 1960 in London) is an English writer, actress and journalist. The daughter of the early public relations advocate Basil Saunders and his journalist wife Betty (n\u00e9e Smith), Saunders has worked for newspapers and magazines in the UK, including \"The Sunday Times\", \"Sunday Express\", \"Daily Telegraph\", \"She\" and \"Cosmopolitan\".\nQuestion: Kate Saunders travels in very affluent circles in the UK. True, False, or Neither? Neither\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp.\nQuestion: Sidney is a fun place to swim. True, False, or Neither? Neither\n###\nAjay Nagrath is an Indian television and movie actor and is the son of Bollywood actor Anil Nagrath. Currently, he plays the role of \"Pankaj\" in C.I.D. He has done many roles in many TV shows and even films, but there came a point in his life when he was unhappy that his weight had become his identity in the industry. He said \"I used to be a couch potato.\"\nQuestion: Ajay Nagrath was always unhappy with his weight. True, False, or Neither?", "doc_id": 363, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1063, 32778, 24884, 24414], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: Harry Bosch is a fictional character. True, False, or Neither? True\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums).\nQuestion: Although The Asteroids Galaxy Tour had Mette Lindberg as a vocalist, it was unsure if they needed another True, False, or Neither? Neither\n###\nJeon Do-yeon (born February 11, 1973) is a South Korean actress. She has won many awards in her career, including best actress at the 2007 Cannes Film Festival for her portrayal of a broken woman who has lost everything in Lee Chang-dong's \"Secret Sunshine\".\nQuestion: Jeon Do-yeon as internationally popular True, False, or Neither? 
Neither\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph Ernst Friedrich von Forcade de Biaix retired at age 65 True, False, or Neither? Neither\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: Idris was 80 when he retired. True, False, or Neither?", "doc_id": 8, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21328, 8760, 13392, 40483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tamanna (Hindi: \u0924\u092e\u0928\u094d\u0928\u093e , translation: Desire) is a 1997 Indian drama film directed by Mahesh Bhatt. It stars Paresh Rawal, Pooja Bhatt, Sharad Kapoor and Manoj Bajpayee in the lead roles The screenplay was written by Tanuja Chandra. The story was written by Tanuja Chandra and Mahesh Bhatt. It was produced by Pooja Bhatt.\nQuestion: Tamanna is a 1997 Indian drama which was actually written in 1995 . True, False, or Neither? Neither\n###\nThe Nariphon (Thai: \u0e19\u0e32\u0e23\u0e35\u0e1c\u0e25 ), also known as Makkaliphon (Thai: \u0e21\u0e31\u0e01\u0e01\u0e30\u0e25\u0e35\u0e1c\u0e25 , from Pali \"makkaliphala\"), is a tree in Buddhist mythology which bears fruit in the shape of young female creatures. The maidens grow attached by their head from the tree branches. This tree grows at the Himaphan, a mythical forest where the female fruits are enjoyed by the Gandharvas who cut the fruits and take them away.\nQuestion: Buddhist mythology has a tree named Makkaliphon True, False, or Neither? True\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award.\nQuestion: The Path Between the Seas takes place between 1870 through 1914 True, False, or Neither? True\n###\nThe 1973 Atlanta Braves season was the eighth season in Atlanta along with the 103rd season as a franchise overall. The highlight of the season was Hank Aaron finishing the season just one home run short of Babe Ruth as baseball's all-time home run king. The 1973 Atlanta Braves were the first team to boast three 40 home run hitters. They were Aaron, Darrell Evans, and Davey Johnson.\nQuestion: The Atlanta Braves are a third rate symphony. True, False, or Neither? 
False\n###\nGray Cowan Boyce (19 February 1899 - 14 May 1981) was an American medieval historian and historical bibliographer whose masterwork was his five volume \"Literature of Medieval History, 1930-1975: A Supplement to Louis John Paetow's \"A Guide to the Study of Medieval History\"\" (1981).\nQuestion: Gray Boyce lived through both world wars. True, False, or Neither?", "doc_id": 135, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25818, 25739, 4747, 33453], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The following is a list of female cabinet ministers of Thailand. Thailand is a country located at the centre of the Indochina peninsula in Southeast Asia. It is bordered to the north by Burma and Laos, to the east by Laos and Cambodia, to the south by the Gulf of Thailand and Malaysia, and to the west by the Andaman Sea and the southern extremity of Burma.\nQuestion: Malaysia boarders Thailand to the south True, False, or Neither? True\n###\nAlbert Levitt (March 14, 1887 \u2013 June 18, 1968) was a judge, law professor, attorney, and candidate for political office. While he was a memorable teacher at Washington and Lee University, and as judge of the United States District Court for the Virgin Islands ordered that woman voters must be registered, he later came to hold what some thought were eccentric views on religion.\nQuestion: Albert Levitt enjoyed golf in his spare time, was an animal rights activist, had a vegetarian diet, and identified as Jewish despite having a Catholic background. True, False, or Neither? Neither\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999.\nQuestion: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e was screenwriter during his first job. True, False, or Neither? Neither\n###\nPhua Chu Kang Pte Ltd, also known as PCK Pte Ltd or Phua Chu Kang for short (\u9b3c\u99ac\u5bb6\u65cf in Chinese), was a Singaporean sitcom on MediaCorp TV Channel 5. The show debuted in Singapore in 1997. A sequel, \"Phua Chu Kang Sdn Bhd\" debuted on NTV7 on 25 March 2009 and aired in Singapore's MediaCorp TV Channel 5 on 6 October 2009.\nQuestion: Phua Chu Kang Pte Ltd has multiple sequels including \"Phua Chu Kang Sdn Bhd\" which debuted on NTV7 on 25 March 2009 True, False, or Neither? Neither\n###\n\"Big Jet Plane\" is a song written by Australian singer/songwriter Angus Stone, and originally recorded by Stone, under the name \"Lady of the Sunshine\", on his 2009 album \"Smoking Gun\". 
It was then re-recorded by Stone and his sister Julia, as the duo Angus & Julia Stone, and released on their 2010 album \"Down the Way\", as well as their 2010 EP \"Big Jet Plane\".\nQuestion: Big Jet Plane was not released for the first time in 2009. True, False, or Neither?", "doc_id": 674, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35681, 35989, 30057, 29596], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ra\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina.\nQuestion: Ra\u00fal Alberto Osella is a heartthrob. True, False, or Neither? Neither\n###\nFC Spartak Vladikavkaz (Russian: \u0424\u0443\u0442\u0431\u043e\u043b\u044c\u043d\u044b\u0439 \u043a\u043b\u0443\u0431 \u00ab\u0421\u043f\u0430\u0440\u0442\u0430\u043a\u00bb \u0412\u043b\u0430\u0434\u0438\u043a\u0430\u0432\u043a\u0430\u0437 , Ossetian: \u0424\u0443\u0442\u0431\u043e\u043b\u043e\u043d \u043a\u043b\u0443\u0431 \"\u0410\u043b\u0430\u043d\u0438\" ) is a Russian football club based in Vladikavkaz (formerly Ordzhonikidze), North Ossetia\u2013Alania. Founded in 1921, the club played in the Soviet Top League during the communist era, and won its first and only league title in the 1995 Russian Top League.\nQuestion: They played hockey since 1921 True, False, or Neither? False\n###\nThe Chingford branch line is a railway line between Clapton Junction (just west of Clapton station) and Chingford station. Services currently operate between Liverpool Street station and Chingford. The branch is currently part of the Lea Valley Lines network.\nQuestion: Chingford will be in operation for many years to come. True, False, or Neither? Neither\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street.\nQuestion: The population was over 300 True, False, or Neither? True\n###\nThe Maloof family is a prominent American family based in Las Vegas, Nevada, who are owners of numerous business properties in the Western United States. The origin of the family name is Maalouf and is of Lebanese descent via their paternal grandfather.\nQuestion: The number of businesses owned by the Maloof family is large. True, False, or Neither?", "doc_id": 713, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14775, 42138, 38710, 34871], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Paul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation.\nQuestion: Paul Hausser had much sympathy for Jews True, False, or Neither? Neither\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012.\nQuestion: Four different people won the Melodi Grand Prix junior 2012. True, False, or Neither? Neither\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: The author wrote the book from the perspective of a young boy True, False, or Neither? True\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex.\nQuestion: Muccan Station makes pies True, False, or Neither? False\n###\nJeon Do-yeon (born February 11, 1973) is a South Korean actress. She has won many awards in her career, including best actress at the 2007 Cannes Film Festival for her portrayal of a broken woman who has lost everything in Lee Chang-dong's \"Secret Sunshine\".\nQuestion: As of 2019, Jeon Do-yeon would be 50 years old. True, False, or Neither?", "doc_id": 676, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5012, 29617, 23380, 3546], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "August Perk (October 25, 1897, Lohne / Lingen, Germany; \u2013 May 12, 1945, Braunschweig, Germany) was a German Resistance fighter against the National Socialism. His brief friendship with Erich Maria Remarque influenced Remarque's novel \"All Quiet on the Western Front\".\nQuestion: August Perk was a German resistance fighter who lived with Erich Maria Remarque. True, False, or Neither? Neither\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. 
It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street.\nQuestion: The village starts with a W True, False, or Neither? True\n###\nThe Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site.\nQuestion: The Cincinnati and Whitewater Canal Tunnel has bronze. True, False, or Neither? Neither\n###\nMarguerite Aimee Rosine Coppin (2 February 1867 \u2013 1931) was born in Brussels and became woman Poet Laureate of Belgium and a noted feminist and pioneer in female emancipation and equal rights for women. She was compared with women's rights activists Amelia Bloomer and Emmeline Pankhurst.\nQuestion: Marguerite Aimee Rosine Coppin lived in Europe. True, False, or Neither? True\n###\nDelivery Man is a 2013 American comedy-drama film written and directed by Ken Scott, produced by DreamWorks Pictures and Reliance Entertainment. A remake of Scott's 2011 French-Canadian film, \"Starbuck\", the film stars Vince Vaughn, Chris Pratt and Cobie Smulders.\nQuestion: Delivery Man had to be written True, False, or Neither?", "doc_id": 467, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18025, 35900, 22452, 18614], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Euphorbia pubentissima, known by the common names false flowering spurge and southeastern flowering spurge, is a species of plant in the spurge family. It is native to the Southeastern United States where it is found in areas of sandy, open woodlands.\nQuestion: Euphorbia pubentissima is a flower True, False, or Neither? True\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft.\nQuestion: At the time of the 2013 NHL Entry Draft, Nathan was at least 17 years old. True, False, or Neither? True\n###\nThe Girl from Jones Beach is a 1949 American comedy film directed by Peter Godfrey and written by I. A. L. Diamond. The film stars Ronald Reagan, Virginia Mayo, Eddie Bracken, Dona Drake, Henry Travers and Lois Wilson. The film was released by Warner Bros. on July 16, 1949.\nQuestion: The Girl from Jones Beach stars a future president True, False, or Neither? True\n###\nAlice Geraldine Farrar (February 28, 1882 \u2013 March 11, 1967) was an American soprano opera singer and film actress, noted for her beauty, acting ability, and \"the intimate timbre of her voice.\" She had a large following among young women, who were nicknamed \"Gerry-flappers\".\nQuestion: Alice Geraldine Farrar was an actress True, False, or Neither? 
True\n###\nSouthern Methodist University (SMU) is a private research university in Dallas, University Park, and Highland Park, Texas. Founded in 1911 by the Methodist Episcopal Church, South, SMU operates satellite campuses in Plano, Texas, and Taos, New Mexico. SMU is owned by the South Central Jurisdiction of the United Methodist Church. Of the university's 11,643 students, 6,411 are undergraduates.\nQuestion: SMU has 11,644 students True, False, or Neither?", "doc_id": 862, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38651, 19098, 12645, 4860], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund.\nQuestion: Since 1965 The University of Maryland Eastern Shore has been a historically black university. True, False, or Neither? Neither\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television.\nQuestion: Sebastian Bach has a solo career True, False, or Neither? True\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\".\nQuestion: Antonio Lewis was the highest paying member of Flatbush ZOMBIES True, False, or Neither? Neither\n###\nHawthorne is a census-designated place (CDP) in Mineral County, Nevada, United States. At the 2010 census, the population was 3,269, a decrease since the 2000 census, when it was 3,311. It is the county seat of Mineral County. The nearby Hawthorne Army Depot is the primary economic base of the town.\nQuestion: Hawthorne is located in new york True, False, or Neither? False\n###\nThomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330.\nQuestion: Thomas Cooper was born over 15,000 days ago. True, False, or Neither?", "doc_id": 920, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8586, 17835, 36203, 15334], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire.\nQuestion: The 100 people are on the same team against the outcast player True, False, or Neither? Neither\n###\nLegoland Discovery Center Dallas Fort Worth is an indoor family entertainment center located at Grapevine Mills mall in Grapevine, Texas, which is situated between the cities of Dallas and Fort Worth, Texas. The attraction includes Lego-theme rides, a soft play area, a 4D cinema and a gift shop. The center is owned and operated by British leisure group Merlin Entertainments.\nQuestion: Legoland Discovery Center Dallas Fort Wort is in Texas True, False, or Neither? True\n###\nDennis Gordon Patterson (born January 9, 1950) is a Canadian former professional ice hockey defenceman who played three seasons in the National Hockey League (NHL) for the Kansas City Scouts and Philadelphia Flyers and also played one season in the World Hockey Association (WHA) for the Edmonton Oilers. He is currently a scout with the Flyers.\nQuestion: Dennis Gordon Patterson was born on the first of the month True, False, or Neither? True\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region.\nQuestion: The station employees 300 people True, False, or Neither? Neither\n###\nBlanche Barrow (born Bennie Iva Caldwell; January 1, 1911 \u2013 December 24, 1988) was a fringe member of Bonnie and Clyde's gang and the wife of Clyde Barrow's brother Buck. Brought up by her father, she had a poor relationship with her mother, who arranged for Blanche to be married to an older man. Blanche ran away and met Buck Barrow. He was 8 years older, and a fugitive.\nQuestion: blanche barrow died as a fugitive True, False, or Neither?", "doc_id": 698, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29701, 22396, 33080, 30912], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Henry Pelham Fiennes Pelham-Clinton, 4th Duke of Newcastle-under-Lyne {'1': \", '2': \", '3': \", '4': \"} (31 January 1785 \u2013 12 January 1851) was a British nobleman and politician who played a leading part in British politics in the late 1820s and early 1830s. 
He was styled Lord Clinton from birth until 1794 and Earl of Lincoln between 1794 and 1795.\nQuestion: It was a British nobleman and politician. True, False, or Neither? True\n###\nJonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films.\nQuestion: jonathan michael lovitz acted in numeroues television series and films while he was in saturday night live True, False, or Neither? Neither\n###\nDennis Gordon Patterson (born January 9, 1950) is a Canadian former professional ice hockey defenceman who played three seasons in the National Hockey League (NHL) for the Kansas City Scouts and Philadelphia Flyers and also played one season in the World Hockey Association (WHA) for the Edmonton Oilers. He is currently a scout with the Flyers.\nQuestion: Dennis Patterson played more than 2 seasons in the NHL. True, False, or Neither? True\n###\nBaar is a railway station in the Swiss canton of Zug, situated in the municipality of Baar. The station is located on the Z\u00fcrich to Lucerne railway line and is an intermediate stop for InterRegio trains from Z\u00fcrich to Lucerne and on Z\u00fcrich S-Bahn line S9.\nQuestion: Zug has one railway station. True, False, or Neither? Neither\n###\nThe Cit\u00e9 du Cin\u00e9ma is a film studio complex supported by the film director and producer Luc Besson, located in Saint-Denis, north of Paris. The studio complex is intended to be a competitor of Cinecitt\u00e0 in Rome, Pinewood in London and Babelsberg in Berlin. It was inaugurated on 21 September 2012.\nQuestion: Bebelsburg is older than Cite du Cinema. True, False, or Neither?", "doc_id": 376, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33282, 3015, 39199, 10631], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gwendoline See-Hian Yeo (; born July 10, 1977) is a Singaporean-born American actress, voice actress and musician, best known for her recurring guest-star role as Xiao-Mei in the hit television series \"Desperate Housewives\", and as Dr. Kelly Lee in \"General Hospital\".\nQuestion: Dr. Kelly Lee is well liked. True, False, or Neither? Neither\n###\nKilimanjaro Native Cooperative Union (KNCU) is a cooperative federation in Tanzania and the oldest cooperative in Africa, founded in 1930 by Charles Dundas. KNCU is owned by the farmers of the 90 primary cooperative societies which buy coffee from the farmers on Kilimanjaro. Offices for the cooperative are located in Moshi.\nQuestion: Dundas lives in Moshi. True, False, or Neither? Neither\n###\nFraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. 
He is also an occasional radio and television commentator.\nQuestion: Fraser Wishart quit professional football to become Secretary of the Scottish Professional Footballers' Association. True, False, or Neither? Neither\n###\nThe Girdler sulfide (GS) process, also known as the GeibSpevack (GS) process, is an industrial production method for filtering out of natural water the heavy water (deuterium oxide = DO) which is used in particle research, in Deuterium NMR spectroscopy, deuterated solvents for proton NMR spectroscopy, in heavy water nuclear reactors (as a coolant and moderator) and in deuterated drugs.\nQuestion: Heavy water nuclear reactors have no use for the Girdler Sulfide process. True, False, or Neither? False\n###\nJohnson College Prep is a public four-year charter high school located in the Englewood neighborhood on the south side of Chicago, Illinois, United States. It is a part of the Noble Network of Charter Schools. The school is named for African-American businessman and founder of the Chicago-based Johnson Publishing Company John H. Johnson and his wife Eunice Johnson.\nQuestion: \"Prep\" is a common abbreviation for \"preparatory\". True, False, or Neither?", "doc_id": 544, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16804, 44972, 33964, 22090], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Roland Buerk (born 1973), was a journalist working for the BBC. He was the Tokyo Correspondent for BBC News and is best known for his coverage of the 2011 T\u014dhoku earthquake and tsunami. He is the son of former BBC newsreader and current BBC Radio 4 presenter Michael Buerk. He left the BBC in mid-2012, to work for Nissan in the United Arab Emirates.\nQuestion: Roland Buerk has an R. True, False, or Neither? True\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Weltenbrand was formed in nineteen hundred ninety six. True, False, or Neither? False\n###\nJohns Creek is a city located in Fulton County in the U.S. state of Georgia. According to the 2010 U.S. Census, the population was 76,728. The city is an affluent northeastern suburb of Atlanta. In 2017 Johns Creek ranked third on the \"USA TODAY\" list of \"50 best cities to live in.\"\nQuestion: Johns Creek is a tiny village located in Fulton County in the U.S. state of Georgia. True, False, or Neither? False\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008.\nQuestion: Laura Elena Z\u00fa\u00f1iga Huizar met Bush. True, False, or Neither? 
Neither\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band.\nQuestion: Gang of Four had eight albums released by Metropolis Records. True, False, or Neither?", "doc_id": 463, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21860, 17591, 1127, 5398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "View from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant.\nQuestion: View from the Top was seen by Obama. True, False, or Neither? Neither\n###\nRecorrupted is a limited edition EP by Whitechapel that was released on November 8, 2011 through Metal Blade Records. It consists of one original song, two of their previously released songs remixed (\"This Is Exile\" and \"Breeding Violence\"), an acoustic version of \"End of Flesh\" and a cover of the Pantera song \"Strength Beyond Strength\".\nQuestion: Recorrupted is primarily original material for that album. True, False, or Neither? False\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points.\nQuestion: The Cornhuskers scored more points than they ever had before in the 1994 game True, False, or Neither? Neither\n###\nFrank John Gorshin, Jr. (April 5, 1933 \u2013 May 17, 2005) was an American character actor, impressionist, and comedian. He was perhaps best known as an impressionist, with many guest appearances on \"The Ed Sullivan Show\" and \"Tonight Starring Steve Allen\". His most famous acting role was as the Riddler on the live-action television series \"Batman\".\nQuestion: Frank John Gorshin, Jr. played the Riddler in the Batman movie. True, False, or Neither? False\n###\n169th Street is a local station on the IND Queens Boulevard Line of the New York City Subway. Located at the intersection of 169th Street and Hillside Avenue in Queens, it is served by the F train at all times. This is the closest subway station to the 165th Street Bus Terminal after the closure of the nearby 168th Street BMT Station on Jamaica Avenue in 1977.\nQuestion: 169th Street is served by the train that is the sixth letter of the alphabet. 
True, False, or Neither?", "doc_id": 810, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41359, 11693, 44467, 20858], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In the Ugric mythology, Kaltes-Ekwa (Khanty, Kaltes Ankw) was the mother of the hero Mir-Susne-Hum and the wife of the god Num-Torum, who defeated her in heaven. She was also a goddess of the moon associated with the month April; a birth giving goddess (she is called upon by women in child-birth); goddess of fate; goddess of dawn and a shape-shifter, often shown manifested as a hare.\nQuestion: Khanty was the grand mother of the hero True, False, or Neither? Neither\n###\nCeres\u2013Negros Football Club, commonly referred to as Ceres\u2013Negros or just Ceres, is a Filipino football club based in the city of Bacolod, Negros Occidental that plays in the Philippines Football League. The club is a member of the Negros Occidental Football Association. It was previously known as the Ceres\u2013La Salle Football Club.\nQuestion: Ceres Negros Football Club has no sponsors True, False, or Neither? Neither\n###\nThe St. Louis Cardinals 1984 season was the team's 103rd season in St. Louis, Missouri and the 93rd season in the National League. The Cardinals went 84-78 during the season and finished 3rd in the National League East, 12\u00bd games behind their arch-rivals, the Chicago Cubs. It was also the final season of the Columbia blue road uniforms for the Cardinals.\nQuestion: The cardinals were once in the American League True, False, or Neither? Neither\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold.\nQuestion: Tillia tepe is the site of dinosaur bones. True, False, or Neither? Neither\n###\nWest Palm Beach Municipal Stadium, referred to as \"Municipal Stadium\", located at 755 Hank Aaron Drive, was a ballpark in West Palm Beach, Florida and the long-time spring training home for the Milwaukee and Atlanta Braves and Montreal Expos. The Braves played spring training games at the stadium from 1963 to 1997, while the Expos played there from 1969 to 1972 and from 1981 to 1997.\nQuestion: The Braves played at Municipal Stadium in the fall. True, False, or Neither?", "doc_id": 436, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31397, 18154, 23444, 13667], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 18th Annual Latin Grammy Awards will be held on Thursday, November 16, 2017 at the MGM Grand Garden Arena in Las Vegas. It will be broadcast on Univision at 8PM ET\\PT. This will mark the tenth year Las Vegas hosts the Latin Grammy Awards and will also mark the telecasts return to the MGM Grand Garden Arena.\nQuestion: The 18th Annual Latin Grammy awards will be simulcast on American television as well. True, False, or Neither? Neither\n###\nNosopsyllus fasciatus, the northern rat flea, is a species of flea found on domestic rats and house mice. Northern rat fleas are external parasites, living by hematophagy off the blood of rodents. It is the most widely spread of its genus, having originated in Europe, but has been transported to temperate regions all over the world.\nQuestion: Nosopsyllus fasciatus originated in Italy which later sped across all of Europe and World. True, False, or Neither? Neither\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system.\nQuestion: ECSU has more undergrad than grad degrees True, False, or Neither? Neither\n###\nAm\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics.\nQuestion: Am\u00e9lie Simone Mauresmo is good at badminton. True, False, or Neither? Neither\n###\nKimberly Ane Peirce (born September 8, 1967) is an American feature film director, best known for her debut feature film, \"Boys Don't Cry\" (1999). Her second feature, \"Stop-Loss\", was released by Paramount Pictures in 2008. Her most recent feature film, \"Carrie\", was released on October 18, 2013.\nQuestion: Kim Peirce was born on the eight day of September in the early 1960s. True, False, or Neither?", "doc_id": 499, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16772, 31974, 44579, 13549], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California.\nQuestion: Kinsey Millihone currently lives in California True, False, or Neither? 
False\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single.\nQuestion: Come Back in One Piece was a massive hit. True, False, or Neither? False\n###\nTommy Stewart is an American trumpeter, arranger, composer, and record producer. He has been a member of the Magic City Jazz Orchestra, Cleveland Eaton and the Alabama All-Stars, the Alabama Jazz Hall of Fame All-Stars, and Ray Reach and Friends. He was a 1988 inductee into the Alabama Jazz Hall of Fame.\nQuestion: Stewart was actually a terribly unskilled trumpeter and got by using an electronic attachment that played his trumpet for him. True, False, or Neither? Neither\n###\nHarriston (population 1,797) is a community in the Town of Minto in Wellington County, Ontario, Canada. In 1999, Harriston was amalgamated with the communities of Palmerston, Clifford, and Minto Township to form the Town of Minto. Harriston is located at the headwaters of the Maitland River, and has several shops, restaurants, a library, an art gallery and cultural centre.\nQuestion: Harriston is part of a town that has several other communities. True, False, or Neither? True\n###\nValentino D. B. Mazzia (February 17, 1922 \u2013 March 10, 1999) was an American physician who served as chairman of the department of anesthesiology at the New York University School of Medicine and was a pioneer in the forensic analysis of deaths occurring during surgical procedures. He testified in many criminal cases about the use and presence of anesthesia products in cases of death.\nQuestion: Valentino was very knowlegabel about anesthesia. True, False, or Neither?", "doc_id": 31, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17061, 41631, 6486, 37400], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa.\nQuestion: Jara is a European language spoken by 46000 people True, False, or Neither? False\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: The 2002 Indian vice-presidential election has an A. True, False, or Neither? Neither\n###\nTexas Monthly v. Bullock 489 U.S. 1 (1989) was a case brought before the US Supreme Court in November 1988. 
The case (initiated by the publishers of \"Texas Monthly\", a well-known general-interest magazine in Texas) was to test the legality of a Texas statute that exempted religious publications from paying state sales tax.\nQuestion: Texas Monthly v. Bullock was a case initiated against Texas Monthly. True, False, or Neither? False\n###\nChristian Darcy Bisson (born August 14, 1989) is a Canadian professional baseball second baseman in minor league baseball organization of the San Diego Padres of Major League Baseball. Prior to beginning his professional career, he played college baseball at the University of Kentucky. Bisson has also competed for the Canadian national baseball team.\nQuestion: Christian Darcy Bisson is a Canadian professional baseball second baseman and a fat man True, False, or Neither? Neither\n###\nLloyd Newton Morrisett, Jr. (born November 2, 1929) is an American experimental psychologist with a career in education, communications, and philanthropy. He is one of the founders of the Sesame Workshop, the organization famous for the creation of the children's television shows \"Sesame Street\" which was also co-created by him, \"The Electric Company\", and many others.\nQuestion: Lloyd Newton Morrisett, Jr. is dead True, False, or Neither?", "doc_id": 114, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8482, 8933, 26042, 5486], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Port Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521.\nQuestion: The census took place the year before 2012. True, False, or Neither? True\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central.\nQuestion: The Daily Show has had more then one permanent host. True, False, or Neither? Neither\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene.\nQuestion: \"Something from Nothing\" will be the main theme of the next Marvel movie True, False, or Neither? Neither\n###\nHudson Valley Community College, a SUNY associated two-year college, is located in Troy in Rensselaer County, New York. 
Although about eighty percent of the students are from the local area, the remainder are from other parts of New York, other states and from some 30 countries around the world.\nQuestion: Hudson Valley Community College has students from all countries in the world True, False, or Neither? False\n###\nThe Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park.\nQuestion: New South Wales, Australia is west and east of the Indian Ocean. True, False, or Neither?", "doc_id": 232, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6695, 15803, 4877, 16919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Prom Night IV: Deliver Us from Evil is a 1992 Canadian slasher horror film directed by Clay Borris and starring Nicole de Boer and J.H. Wyman. The film follows a deranged Catholic priest who begins murdering teenagers on their prom night. It is the fourth and final film in the \"Prom Night\" franchise. Like the previous , it was released briefly in theaters before later being released to video.\nQuestion: Clay Borris is a Catholic Priest. True, False, or Neither? Neither\n###\nJango is a crime-comedy series produced in 1961 by Associated Rediffusion for British television. It starred Robert Urquhart in the lead role of Jango Smith, with Moira Redmond as Dee Smith, his wife. The show also featured performances by Peter Sallis and Brian Wilde.\nQuestion: Jango is Brian Wilde's debut role. True, False, or Neither? Neither\n###\nMatsuri Mizuguchi (\u6c34\u53e3 \u307e\u3064\u308a , Mizuguchi Matsuri , born October 28, 1986 in Yamagata Prefecture) is a Japanese voice actress who started her career in 2007. She is affiliated with Yellowtail. This voice actress shares the same exact date of birth and age as another unrelated Japanese voice actress and singer, Aki Toyosaki.\nQuestion: Matsuri Mizuguchi shares the exact date of birth and age as Christopher Lee True, False, or Neither? False\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) .\nQuestion: Sabanc\u0131 University has an impressive alumni. True, False, or Neither? Neither\n###\nHudepohl Brewing Company is a brewery established in Cincinnati, Ohio in 1885 by founder Ludwig Hudepohl II. Hudepohl was the son of Bavarian immigrants and had worked in the surgical tool business before starting his brewery. Hudepohl combined with Schoenling Brewing Company in 1986. 
Today, the Hudepohl-Schoenling Brewing Company is a wholly owned subsidiary of Christian Moerlein Brewing Co..\nQuestion: The Schoenling Brewing Company was established after the Hudepohl Brewing Company. True, False, or Neither?", "doc_id": 223, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36270, 27963, 20409, 34755], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In electromagnetism, charge density is a measure of electric charge is the amount of electric charge per unit length, surface area, or volume, called the linear, surface, or volume charge density, respectively. The respective SI units are C\u22c5m, C\u22c5m or C\u22c5m.\nQuestion: electromagnetism has to do with charge True, False, or Neither? True\n###\nHard Landing is a 2004 thriller novel by British author Stephen Leather. Published in 2004 by Hodder & Stoughton, it is the first book in the Dan \u2018Spider\u2019 Shepherd series. \"Hard Landing\" is an international bestseller and is available in ebook and paperback.\nQuestion: Hard Landing is said to be his most successful novel. True, False, or Neither? Neither\n###\nCharles Rashad Jamaal Brown (born April 10, 1987) is a former American football offensive tackle in the National Football League (NFL) for the New Orleans Saints, New York Giants and Dallas Cowboys. He played college football at USC, where he won the Morris Trophy, recognizing the best offensive and defensive linemen on the West Coast in 2009.\nQuestion: Brown was drafted by the Saints. True, False, or Neither? Neither\n###\nMakri (Greek: \u039c\u03ac\u03ba\u03c1\u03b7) is a village and a municipal district of the city of Alexandroupoli, Evros regional unit, Greece. In 2011 its population was 924 for the village, and 1,919 for the municipal district. It is situated on the Aegean Sea coast, 12\u00a0km west of downtown Alexandroupoli. Makri has an exit on the Egnatia Odos motorway, that passes north of the village.\nQuestion: The population of Makri was 1919 in the viilliage True, False, or Neither? False\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013.\nQuestion: Brandon Tyler McManus was born after the 17th century True, False, or Neither?", "doc_id": 270, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24815, 32749, 21977, 14471], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Margaret Lucille Jeanne Parker (born 24 July 1943) is a Member of the European Parliament (MEP) for the East Midlands region for the UK Independence Party. She was elected in 2014. She was born in Grantham and educated at Kesteven and Grantham Girls' School and De Montfort University where she read Law.\nQuestion: Margaret Lucille Jeanne Parker is an independent True, False, or Neither? False\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers.\nQuestion: Seven Ways from Sundown was released before 1961 True, False, or Neither? True\n###\nMemento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano.\nQuestion: Memento has been seen by everybody. True, False, or Neither? Neither\n###\nPolarbr\u00f6d is a Swedish bread company. Their head office is in \u00c4lvsbyn in northern Sweden. Polarbr\u00f6d is Sweden's third-largest bread company. Its typical product is a soft compact bread formed into round, flat shapes. It is also noted for ready-made sandwiches produced from such bread and reindeer meat, which was introduced as a product in the 1960s under the name \"renkl\u00e4mma\".\nQuestion: Polarbr\u00f6d was started in the 1960s. True, False, or Neither? Neither\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: Joe Tinker was named after the baseball Hall of Famer, Joe Tinker. True, False, or Neither?", "doc_id": 779, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [25294, 36011, 30946, 24949], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio.\nQuestion: Christopher Lawrence works on ABC Radio. True, False, or Neither? 
True\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway.\nQuestion: It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1932 True, False, or Neither? False\n###\nSierpe River (Spanish: \"Rio Sierpe\") is a river of Costa Rica. Boat traffic is common with both locals and tourists. A broad range of wildlife can be seen from the American Crocodile, various other reptile species, and exotic fish and birds. It joins the Rio Terraba.\nQuestion: Rio Sierpe has more crocs than birds True, False, or Neither? Neither\n###\nAnastasija Sevastova (born 13 April 1990) is a professional tennis player from Latvia. Having retired in 2013 due to recurring injuries, Sevastova returned to competition in 2015 and became known for her campaign at the 2016 US Open, where she defeated third-seeded Garbi\u00f1e Muguruza as well as Johanna Konta en route to her first ever Grand Slam quarterfinal.\nQuestion: Anastasija Sevastova defeated Garbine Muguruza in the 2016 US Open True, False, or Neither? True\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position.\nQuestion: The CTA has always had a male leader until recently. True, False, or Neither?", "doc_id": 275, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30834, 28978, 16739, 44887], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "102 Squadron \"\"Panchos\"\" (\"Esquadra 102\") was an elementary flight training squadron of the Portuguese Air Force disbanded in 1992. Formed in 1962, the squadron administered air force training and performed at air shows throughout Portugal. Between 1963 and its disbandment in 1992, the squadron lost nine pilots.\nQuestion: Panchos was let go in the early 80s True, False, or Neither? False\n###\nThe Arkansas Mountain AVA is an American Viticultural Area located in the Ozark Mountains of northwestern Arkansas. It is part of the larger Ozark Mountain AVA, which also includes regions in Missouri and Oklahoma. The smaller Altus AVA is entirely contained within the Arkansas Mountain AVA. The Arkansas Mountain AVA includes 2880000 acre , making it the ninth largest AVA as of 2008.\nQuestion: The Ozark Mountains are inhabited with Native Americans. True, False, or Neither? Neither\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. 
He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer.\nQuestion: Joseph Maurice Ravel was skinny. True, False, or Neither? Neither\n###\nAnti-D\u00fchring (German: \"Herrn Eugen D\u00fchrings Umw\u00e4lzung der Wissenschaft\" , \"Herr Eugen D\u00fchring's Revolution in Science\") is a book by Friedrich Engels, first published in German in 1878. It had previously been serialised in a periodical. There were two further German editions in Engels' lifetime. \"Anti-D\u00fchring\" was first published in English translation in 1907.\nQuestion: Anti-D\u00fchring starts with C. True, False, or Neither? False\n###\nVia Dante is an important and elegant pedestrian street in central Milan, Italy, connecting Piazzale Cordusio (Cordusio (Milan Metro)) and Largo Cairoli (Cairoli (Milan Metro)). It is very near to the city's Castello Sforzesco and is named after the Florentine poet Dante Alighieri. It is known for containing several theatres, shops, restaurants, caf\u00e9s, palaces and bars.\nQuestion: Milan's streets are named after poets. True, False, or Neither?", "doc_id": 580, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27761, 39195, 4378, 24550], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Kingfisher Ultra Indian Derby, or simply the Indian Derby, is an Indian annual Thoroughbred horse race. It is a 2,400-metre race held on the first Sunday of February on the Mahalaxmi Racecourse in Mumbai and is one of the premier sporting activities in the city.\nQuestion: The Indian Derby is the first horse race held in Mumbai True, False, or Neither? Neither\n###\nDave Ward, born 12 July 1959, is a British Trade Unionist and General Secretary of the Communication Workers\u2019 Union (CWU), which was formed through the merger of the Union of Communication Workers and the National Communications Union in 1995. The CWU is the largest Trade Union in the United Kingdom for people working in the Postal and Telecommunications industry with over 200,000 members.\nQuestion: Dave Ward was born on the same month as the holiday July 4th, where America celebrated its independence. True, False, or Neither? True\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: Hope depot is based in kansas True, False, or Neither? False\n###\nRewire (formerly RH Reality Check, long name Reproductive Health Reality Check) is a website focused on reproductive and sexual health from a pro-reproductive rights perspective. The website began as a UN Foundation blog in 2006, and became its own nonprofit organization in January 2012. In 2016, it was renamed \"Rewire\".\nQuestion: Reproductive Health Reality Check is a longer version of RH Reality Check. 
True, False, or Neither? True\n###\n\"The Dog Said Bow-Wow\" is a science fiction short story by American writer Michael Swanwick, published in 2001. It won the 2002 Hugo Award for Best Short Story and was nominated for the 2002 Nebula Award for Best Short Story. \"The Dog Said Bow-Wow\" is the title story of his 2007 short story collection, published by Tachyon Publications, and was reprinted in the same year in \"\".\nQuestion: \"The Dog Said Bow-Wow\" is about the rapper Bow Wow. True, False, or Neither?", "doc_id": 699, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20983, 18837, 12723, 7514], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Creation of Adam is a fresco painting by Michelangelo, which forms part of the Sistine Chapel's ceiling, painted c. 1508\u20131512. It illustrates the Biblical creation narrative from the Book of Genesis in which God breathes life into Adam, the first man. The fresco is part of a complex iconographic scheme and is chronologically the fourth in the series of panels depicting episodes from Genesis.\nQuestion: The Creation of Adam is a fresco painting by Michelangelo painted in the 1500's True, False, or Neither? True\n###\nHe is a member of the Royal Shakespeare Company and later joined the Renaissance Theatre Company. He has appeared in many of Kenneth Branagh's films, most recently as Corin in the 2006 film \"As You Like It\". Yuill was also the music composer for \"A Midwinter's Tale\" and \"Swan Song\".\nQuestion: Yuill only composed for \"Swan Song,\" and none other. True, False, or Neither? False\n###\nPunjab Control of Organised Crime Act, (PCOCA) is law to be enacted by Punjab state in India to combat organised crime. It is in process of approval as the Punjab Cabinet has yet not given its approval on account of few reservations about various clauses of the Act.The Act is designed on the pattern of Maharashtra Control of Organised Crime Act enacted in 1999.\nQuestion: The Punjab Control of Organised Crime Act was enacted in the state of Iraq. True, False, or Neither? False\n###\nDr. Edward Vivian Scobie (1918 \u2013 14 November 1996) was a Dominican-born journalist, magazine publisher and historian. He is best known for his research into the black history of Western Europe and his 1972 seminal book \"Black Britannia: A History of Blacks in Britain\".\nQuestion: Dr. Edward Vivian Scobie was over 50 when he died True, False, or Neither? True\n###\nMatsuri Mizuguchi (\u6c34\u53e3 \u307e\u3064\u308a , Mizuguchi Matsuri , born October 28, 1986 in Yamagata Prefecture) is a Japanese voice actress who started her career in 2007. She is affiliated with Yellowtail. This voice actress shares the same exact date of birth and age as another unrelated Japanese voice actress and singer, Aki Toyosaki.\nQuestion: Matsuri Mizuguchi and Aki Toyosaki were born in the same place on the same day. 
True, False, or Neither?", "doc_id": 190, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28516, 40993, 10194, 8558], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Goodlettsville is a city in Davidson and Sumner counties, Tennessee. Goodlettsville was incorporated as a city in 1958 with a population of just over 3,000 residents; at the 2010 census, the city had a total population of 15,921 and in 2015 the population was 16,994. Goodlettsville chose to remain autonomous in 1963 when the city of Nashville merged with the government of Davidson County.\nQuestion: Goodlettsville was the only place to remain autonomous during the merger True, False, or Neither? Neither\n###\nQingtongxia () literally, \"Bronze Gorge\" is a city in the province of Ningxia in the north of China. Administratively, Qingtongxia is a county-level city within the prefecture-level city of Wuzhong. It is located on the left (northwestern) bank of the Yellow River, opposite and a bit upstream of Wuzhong main urban area.\nQuestion: Qingtongxia is a city within the prefecture-level city of Wuzhong. True, False, or Neither? True\n###\nLes Soir\u00e9es de Nazelles, FP 84, is a set of variations for piano written by the French composer Francis Poulenc. During the evenings, the composer used to sit at the piano and improvise \"portraits\" of his friends, all based on a given theme. The work was begun in 1930, and completed at Noizay on October 1, 1936. At the beginning of the score, it reads:\nQuestion: Les Soir\u00e9es de Nazelles is Francis Poulenc's most famous song. True, False, or Neither? Neither\n###\n\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982).\nQuestion: \"King of the Jungle\" was a popular song in japan True, False, or Neither? Neither\n###\nThe Florida Board of Regents was from 1965 to 2001 the governing body for the State University System of Florida, which includes all public universities in the state of Florida, United States. It was created to replace a predecessor body called the Florida Board of Control, which had existed from 1905. Its powers are now held by the Florida Board of Governors.\nQuestion: The Florida Board of Regents spanned into two centuries. True, False, or Neither?", "doc_id": 672, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32269, 17812, 669, 29784], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "weRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo.\nQuestion: weRead started out as an application in 2007 True, False, or Neither? True\n###\nGeorge Corrie (born 16 September 1973) is an English footballer, born in Workington, who played for ten years as a midfielder for American USL Second Division side Wilmington Hammerheads, of which he was the captain. He joined the Hammerheads in 1999 after six seasons with Conference North team Workington A.F.C..\nQuestion: George Corrie (born 16 September 1973) is an English footballer who played baseball for the Wilmington Hammerheads. True, False, or Neither? False\n###\nKilimanjaro Native Cooperative Union (KNCU) is a cooperative federation in Tanzania and the oldest cooperative in Africa, founded in 1930 by Charles Dundas. KNCU is owned by the farmers of the 90 primary cooperative societies which buy coffee from the farmers on Kilimanjaro. Offices for the cooperative are located in Moshi.\nQuestion: The KNCU stands for the Kilimanjaro Native Cooperative Union. True, False, or Neither? True\n###\nThe Golden Fetter is a 1917 American romance silent film directed by Edward LeSaint and written by Charles Tenney Jackson and Charles Maigne. The film stars Wallace Reid, Anita King, Tully Marshall, Guy Oliver, Walter Long and Mrs. Lewis McCord. The film was released on January 25, 1917, by Paramount Pictures.\nQuestion: The Golden Fetter did not do well in the box office. True, False, or Neither? Neither\n###\nOperation Mojo is part documentary and part rock-mockumentary musical comedy of the TEENick series The Naked Brothers Band. It's the sixth television movie of \"The Naked Brothers Band\", and the second of season 3. The movie aired on Nickelodeon on November 22, 2008\nQuestion: The Naked Brothers Band was popular for teens. True, False, or Neither?", "doc_id": 490, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43569, 38988, 9621, 20168], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Stranger Left No Card (1952) is a British short film directed by Wendy Toye. The film won the Best Fiction award at the 1953 Cannes Film Festival, where it was described as \"a masterpiece\" by Jean Cocteau. It marked the film debut of actor Alan Badel.\nQuestion: Alan used the film to kickstart his career True, False, or Neither? True\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. 
He works as an environmental consultant and journalist and publishes the Gallon Newsletter.\nQuestion: Colin Francis Weeber Isaacs is a Canadian politician. True, False, or Neither? True\n###\nVitacost.com, Inc is an American e-commerce company based in Boca Raton, Florida, that sells vitamins, supplements and organic grocery products. The company was bought by Kroger, in 2014. Vitacost was inducted into Inc Magazine's \"Inc. 500 Lifetime Hall of Fame,\" in 2006 as one of the US's 500 fastest-growing privately held businesses for five consecutive years (2001\u20132005).\nQuestion: Vitacost.com sells vitamins and energy drinks True, False, or Neither? Neither\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\".\nQuestion: The 1980 album \"Monty Python's Contractual Obligation Album\" was released on January 19th. True, False, or Neither? Neither\n###\nBig Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism.\nQuestion: Bozeman is not in Montana with Gallatin and Madison True, False, or Neither?", "doc_id": 288, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2525, 41858, 39057, 36204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maniac (stylized as MANIAC) is an American short slasher film, directed by Shia LaBeouf. It was released on October 31, 2011. The short film stars American rappers Scott \"Kid Cudi\" Mecudi and Chris \"Cage\" Palko, as French-speaking serial killers. Mescudi and Palko also co-wrote the film with LaBeouf.\nQuestion: 57% of the patrons seeing the film Maniac in the movie theater prefer their popcorn with extra butter. True, False, or Neither? Neither\n###\nRemember the Daze is a 2007 drama film released in theaters in April 2008. The film was directed by Jess Manafort. The plot of the movie has been described as \"a glimpse into the teenage wasteland of suburbia 1999 that takes place over 24-hours, and the teenagers who make their way through the last day of high school in the last year of the past millennium.\"\nQuestion: Remember the Daze was released in 2008 True, False, or Neither? False\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Five more movies came out in 1922. True, False, or Neither? 
Neither\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city.\nQuestion: The population increased by 153 people True, False, or Neither? True\n###\nThe South Africa national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the Australia national cricket team. The tournament was won by England. South Africa were captained by Frank Mitchell and Louis Tancred.\nQuestion: Frank Mitchell captained a test match without Louis Tanced. True, False, or Neither?", "doc_id": 349, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30622, 25408, 43152, 35696], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Eva Strong is a character in Hollyoaks True, False, or Neither? True\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden.\nQuestion: The Kyrkog\u00e5rden Runestones are Incan monument True, False, or Neither? False\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006.\nQuestion: Cardinal Roger Mahony was eventually unsuccessful in hiding the crimes. True, False, or Neither? True\n###\nEllon Castle is a scheduled monument within the town of Ellon, Aberdeenshire. Only ruins survive of the 16th-century structure that may incorporate sections from the 15th century together with 18th-century renovations. The ruins form a focal point in a formal 6 acre garden planted in 1745; an older Category A listed sundial dating from c. 1700 forms the centrepiece to the garden.\nQuestion: The garden at Ellon Castle was conceived in 1743. True, False, or Neither? Neither\n###\nThe Tampa Bay Buccaneers season was the franchise's 40th season in the National Football League and the second under head coach Lovie Smith. The offseason was marked by the draft selection of All-American Florida State quarterback Jameis Winston first overall in the 2015 NFL Draft.\nQuestion: The Tampa Bay Buccaneers have played in 46 NFL seasons. 
True, False, or Neither?", "doc_id": 404, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15150, 32493, 9222, 17526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico.\nQuestion: The USFC \"Fish Hawk\" was not in operation in 1962. True, False, or Neither? True\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city.\nQuestion: The population was the sum 800 + 53 in 2010 True, False, or Neither? True\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: Coates was inspired to write a story about the ketch because of the sailing trips he took with his father. True, False, or Neither? Neither\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues.\nQuestion: The Centre for Policy Studies focuses on business issues the most. True, False, or Neither? Neither\n###\nGlaiza Herradura-Agullo (born February 24, 1978) is a Filipino former child actress. She was the first-ever grand winner of the Little Miss Philippines segment of \"Eat Bulaga!\" in 1984. She starred in RPN-9's television series \"Heredero\" with Manilyn Reynes and Richard Arellano. She won the 1988 FAMAS Best Child Actress award for her role in \"Batas Sa Aking Kamay\" starring Fernando Poe, Jr..\nQuestion: Herradura-Agullo was eight years old in 1984. True, False, or Neither?", "doc_id": 916, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32239, 6140, 7965, 44993], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Junoon (Hindi: \u091c\u0941\u0928\u0942\u0928, translation: \"The Obsession\") is a 1978 Indian Hindi language film produced by Shashi Kapoor and directed by Shyam Benegal. The film is based on Ruskin Bond's fictional novella, \"A Flight of Pigeons\", set around the Indian Rebellion of 1857. The film's soundtrac was composed by Vanraj Bhatia, and cinematography by Govind Nihalani.\nQuestion: A flight of pigeons is a multi part novella True, False, or Neither? Neither\n###\nThe 2000 Family Circle Cup was the 28th edition of the Family Circle Cup tennis tournament. This WTA Tier I Event was held at the Family Circle Tennis Center in Hilton Head, South Carolina, United States. First-seeded Mary Pierce won the singles title and earned $166,000 first-prize money.\nQuestion: The 2000 Family Circle Cup was the 28th time Mary Pierce played in the event. True, False, or Neither? Neither\n###\n\"The Day the Earth Stood Stupid\" is the seventh episode in season three of \"Futurama\". It originally aired on the Fox network in the United States on February 18, 2001. The title of this episode is a play on the title of the 1951 science fiction film, \"The Day the Earth Stood Still\".\nQuestion: The Day the Earth Stood Stupid was the only time the show used early sci fi films as inspiration that year. True, False, or Neither? Neither\n###\nOleg Smirnov (born April 8, 1980) is a Russian professional ice hockey right winger currently playing for HC Ryazan in the Russian Major League. He played in the Russian Superleague for Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk. He was drafted 144th overall in the 1998 NHL Entry Draft by the Edmonton Oilers.\nQuestion: Smirnov was drafted while a woman was the president of the United States. True, False, or Neither? False\n###\nUp the River (1938) is a prison comedy film starring Preston Foster and Arthur Treacher and featuring Bill \"Bojangles\" Robinson. The movie was directed by Alfred L. Werker and is a remake of a 1930 film with the same title directed by John Ford and starring Spencer Tracy and Humphrey Bogart in the roles subsequently played by Foster and Tony Martin.\nQuestion: Up the River stars Preston Fost True, False, or Neither?", "doc_id": 534, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29472, 4771, 45002, 16323], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County.\nQuestion: Elmira has a low population due to hard to live in weather True, False, or Neither? 
Neither\n###\nTiggy (born 1970 as Charlotte Vigel) is a Danish bubblegum/Eurodance artist. She is perhaps best known for her remix of the Sandy Fox song \"Freckles\" in \"\", originally the English version of the song \"Sobakasu\" by Judy and Mary from the anime \"Rurouni Kenshin\" and she's also popular in parts of Southeast Asia with the song \"Why\".\nQuestion: Tiggy is a famous painter True, False, or Neither? False\n###\nThe Joint Special Operations University (JSOU) is the designated agency within USSOCOM to conduct joint Special Operations Force (SOF) education and thus is tasked with and directed to provide relevant, realistic, leading-edge education opportunities to military and civilian special operations forces personnel around the world, located at MacDill Air Force Base, Florida, USA.\nQuestion: The S in SOF stands for Super True, False, or Neither? False\n###\n2009, Year of Us is the third extended play (EP) by South Korean boy group Shinee. It consists of six tracks and it incorporates alternative rock and hip-hop music genres. The digital version of the album was released on October 19, 2009, with a physical release on October 22. The title track, \"Ring Ding Dong\" was released on October 14, 2009 through various music sites.\nQuestion: Shinee doesn't speak Korean. True, False, or Neither? False\n###\n\"Trap Queen\" is the debut single by American rapper Fetty Wap from his self-titled debut album (2015). Following its online premiere in March 2014, it was released independently on April 22, 2014 before being re-released in conjunction with 300 Entertainment on December 15, 2014. The song was well received by critics who praised the vocals and production for being cheery and infectious.\nQuestion: Trap Queen was a popular first release by Fetty Wap. True, False, or Neither?", "doc_id": 305, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12122, 12323, 7569, 33328], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release.\nQuestion: Shameless Self-Promotion was the band's most popular album True, False, or Neither? Neither\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel.\nQuestion: Kasey Peters now has a career in another field. True, False, or Neither? Neither\n###\nThe Portezuelo Formation is a geologic formation outcropping in the Mendoza, R\u00edo Negro and Neuqu\u00e9n provinces of Argentina. 
It is the fourth-oldest formation in the Neuqu\u00e9n Group and the older of the two formations in the R\u00edo Neuqu\u00e9n Subgroup. Formerly, that subgroup was treated as a formation, and the Portezuelo Formation was known as the Portezuelo Member.\nQuestion: There are ten other formations in the Neuqu\u00e9n Group. True, False, or Neither? Neither\n###\nEdward Gibbon FRS ( ; 8 May 173716 January 1794) was an English historian, writer and Member of Parliament. His most important work, \"The History of the Decline and Fall of the Roman Empire\", was published in six volumes between 1776 and 1788 and is known for the quality and irony of its prose, its use of primary sources, and its open criticism of organised religion.\nQuestion: Edward Gibbon FRS was a known athiest. True, False, or Neither? Neither\n###\nThe Mercedes-Benz W221 is a chassis code of S-Class, the successor of the Mercedes-Benz S-Class (W220) and the predecessor of the Mercedes-Benz S-Class (W222). The S-Class are the flagship vehicles of Mercedes-Benz and each generation typically introduces a range of technical innovations and developments that over time will find their way into smaller cars.\nQuestion: The The Mercedes-Benz W220 was a great influence W221 True, False, or Neither?", "doc_id": 286, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1501, 8410, 44951, 29838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983.\nQuestion: Stewart was ranked as high as number 60 in the world of singles in 1978 because his dad paid someone a good amount of money True, False, or Neither? Neither\n###\nThe Canyons is a 2013 American erotic thriller-drama film directed by Paul Schrader and written by Bret Easton Ellis. The film is set in Los Angeles and stars Lindsay Lohan, James Deen, Nolan Funk, Amanda Brooks, and Gus Van Sant. It received a limited release on August 2, 2013 at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms.\nQuestion: Lindsay Lohan starred in the movie The Canyons. True, False, or Neither? True\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017.\nQuestion: Justin Tinucci was born in South America. True, False, or Neither? False\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. 
Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\".\nQuestion: Kidsty Pike has been seen by earl. True, False, or Neither? Neither\n###\nAziyad\u00e9 (1879; also known as Constantinople) is a novel by French author Pierre Loti. Originally published anonymously, it was his first book, and along with \"Le Mariage de Loti\" (1880, also published anonymously), would introduce the author to the French public and quickly propel him to fame; his anonymous persona did not last long.\nQuestion: Aziyad\u00e9 was published more than 50 months ago. True, False, or Neither?", "doc_id": 944, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9424, 42867, 3704, 26057], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Choirboys is a 1977 American comedy-drama film directed by Robert Aldrich, written by Christopher Knopf and Joseph Wambaugh based on Wambaugh's novel of the same title. It features an ensemble cast including Charles Durning, Louis Gossett, Jr., Randy Quaid, and James Woods. The film was released to theaters by Universal Pictures on December 23, 1977.\nQuestion: Louis was the youngest of the cast True, False, or Neither? Neither\n###\nGladys Leslie (March 5, 1899 \u2013 October 2, 1976) was an American actress in silent film, active in the 1910s and 1920s. Though less-remembered than superstars like Mary Pickford, she had a number of starring roles from 1917 to the early 1920s and was one of the young female stars of her day.\nQuestion: Audiences loved Gladys' voice. True, False, or Neither? Neither\n###\n\"Up All Night\" is an American television sitcom created by Emily Spivey that airs on NBC. The series stars Christina Applegate and Will Arnett as Regan and Chris Brinkley, a couple who struggle to balance their home lives (especially with their newborn child, Amy) and their work lives.\nQuestion: Katey Sagal will play Christina Applegate's mom, as she did before on Married With Children. True, False, or Neither? Neither\n###\nAlma Ros\u00e9's father was the violinist Arnold Ros\u00e9 (n\u00e9 Rosenblum; 1863\u20131946) who was the leader of the Vienna Philharmonic Orchestra for 50 years: from 1881-1931 as well as leader of the Vienna State Opera orchestra and leader of the legendary Ros\u00e9 String Quartet. Her mother, Justine (died 22 August 1938), was Gustav Mahler's sister. Alma was named for Alma Mahler.\nQuestion: Gustav hated his sister Justine True, False, or Neither? Neither\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017.\nQuestion: The tournament was organised by the game publisher. 
True, False, or Neither?", "doc_id": 689, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42312, 22102, 13675, 14664], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Web of Passion (also released as Leda, original French title: \u00c0 double tour) is a 1959 French suspense thriller film directed by Claude Chabrol and based on the novel \"The Key to Nicholas Street\" by American writer Stanley Ellin. It was Chabrol's first film in the thriller genre, which would be his genre of choice for the rest of his career. The film had a total of 1,445,587 admissions in France.\nQuestion: Before Web of Passion, Chabrol didn't direct other movies. True, False, or Neither? False\n###\nUSNS \"Lone Jack\" (T-AO-161) was a type Type T2-SE-A1 tanker laid down under Maritime Commission contract (USMC number 1783) by the Sun Shipbuilding & Dry Dock Co. of Chester, Pennsylvania (hull number 450) on 11 July 1944. The ship was launched on 21 October 1944, sponsored by Mrs. Julia W. Bruwier, and delivered to Cities Service Oil Co. of New York City on 31 October 1944.\nQuestion: The ship contained 150 soldiers True, False, or Neither? Neither\n###\nSamat (Kyrgyz: \u0421\u0430\u043c\u0430\u0442 ) is a small village located in Leilek District of Batken Region, Kyrgyzstan. The village is subordinated to the town of Isfana. According to the 2009 Population and Housing Census of Kyrgyzstan, at the time the population of Samat was 2,076.\nQuestion: Samat's citizens are mostly male. True, False, or Neither? Neither\n###\nAn election campaign was held ahead of a general election for the 54th Parliament of New South Wales on Saturday, 24 March 2007. The result\u2014a win for the social-democratic Australian Labor Party and its new leader Morris Iemma\u2014was widely perceived as a foregone conclusion, with opposition leader Peter Debnam conceding as much the week before the poll.\nQuestion: Moris Iemma was the leader of the Parliament of New South Wales in 2007. True, False, or Neither? True\n###\n\"Champions\" is a song by American singer Usher and Panamanian singer Rub\u00e9n Blades, recorded for the biographical sports film, \"Hands of Stone\" and is also included on his eight studio album \"Hard II Love\". It was released by RCA on August 26, 2016, available for digital download and online streaming. The song was written by Usher, Rub\u00e9n Blades, Raphael Saadiq and Taura Stinson.\nQuestion: \"Champions\" by Usher and Ruben Blades was released less than a decade ago True, False, or Neither?", "doc_id": 296, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12119, 9450, 24326, 21288], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tony Rena Snell Jr. (born November 10, 1991) is an American professional basketball player for the Milwaukee Bucks of the National Basketball Association (NBA). Snell played college basketball for the New Mexico Lobos before declaring for the NBA draft after his junior year. He was drafted with the 20th overall pick in 2013 NBA draft by the Chicago Bulls.\nQuestion: He remained with the Chicago Bulls for a period of four years. True, False, or Neither? Neither\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh.\nQuestion: Telephone Shilpa Sangstha launched the first Laptop made/assembled in Bangladesh, Doel, in October, 2011. True, False, or Neither? Neither\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808\nQuestion: The area where Marks was located has never been built on again. True, False, or Neither? Neither\n###\nPersuasion was the planned fifth studio solo album by Adam Ant, planned for 1992-3 but never released. The album has however surfaced as bootlegs, and nowadays circulates on file sharing networks. This album is one of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy, a book that revealed information on the lost recordings of many famous musicians.\nQuestion: This album is the best selling of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy. True, False, or Neither? Neither\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy.\nQuestion: Daniel Murphy was born in February. True, False, or Neither?", "doc_id": 749, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33327, 22619, 37693, 7376], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The UK Overseas Territories Conservation Forum (UKOTCF) is a UK-based non-governmental organisation which promotes coordinated conservation in the UK Overseas Territories and Crown Dependencies (UKOTs and CDs). 
It is a not-for-profit organisation supported by grants, donations and subscriptions, and a registered charity and company.\nQuestion: UKOTCF is a trustworthy charity True, False, or Neither? Neither\n###\nArizona Business Magazine, based out of Phoenix, Arizona, is the state\u2019s leading monthly Business magazine. Published by AZ Big Media, the magazine covers a wide range of topics focusing on the Arizona business scene, and is aimed at high-level corporate executives and business owners.\nQuestion: Arizona Business Magazine is successful in the business scene. True, False, or Neither? Neither\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class.\nQuestion: young people in Guatemala go to church. True, False, or Neither? Neither\n###\n\"The Inbetweeners\" is a BAFTA Award-winning British sitcom created by Damon Beesley and Iain Morris, and broadcast on E4. The series follows the lives of four sixth form students \u2013 Will McKenzie (Simon Bird), Simon Cooper (Joe Thomas), Jay Cartwright (James Buckley) and Neil Sutherland (Blake Harrison). The series is narrated by Will, who acts as the programme's lead character.\nQuestion: The series is narrated by Damon, who acts as the programme's lead character. True, False, or Neither? False\n###\nSea Lion Park was a 16 acre amusement park started in 1895 on Coney Island by Paul Boyton. He fenced the property and charged admission, the park becoming the first enclosed and permanent amusement park in North America. Up until the establishment of this park, amusement areas around the country consisted of pay-as-you-go concessions. In 1903, Sea Lion Park was replaced by Luna Park.\nQuestion: It is less than 150 years old True, False, or Neither?", "doc_id": 606, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8870, 13488, 1218, 14915], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982).\nQuestion: Bananarama later released a third Japanese-only single. True, False, or Neither? Neither\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6.\nQuestion: Nydala Abbey was situated in London, England. True, False, or Neither? 
False\n###\nNatasha Choufani is a Lebanese actress. Born and raised in the UAE, she grew up in a multi-cultural society. Her ability to act in different dialects and languages had helped open many doors to playing diverse characters in theater, film and TV at home and abroad.\nQuestion: natasha choufani is beautiful True, False, or Neither? Neither\n###\nMIT Technology Review is a magazine published by the Massachusetts Institute of Technology. It was founded in 1899 as The Technology Review, and was re-launched without \"The\" in its name on April 23, 1998 under then publisher R. Bruce Journey. In September 2005, it underwent another transition under its then editor-in-chief and publisher, Jason Pontin, to a form resembling the historical magazine.\nQuestion: MIT Technology Review was founded in 1899 with the word \"The\" in front. True, False, or Neither? True\n###\nMiranda May Kerr (born 20 April 1983) is an Australian model. Kerr rose to prominence in 2007 as one of the Victoria's Secret Angels. Kerr was the first Australian Victoria's Secret model and also represented the Australian department store chain David Jones. Kerr has launched her own brand of organic skincare products, KORA Organics, and has written a self-help book.\nQuestion: Miranda May Kerr was born in the 8th decade of the 20th century. True, False, or Neither?", "doc_id": 595, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37055, 37756, 29527, 11791], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Asian Institute is a research centre at the Munk School of Global Affairs at the University of Toronto, and is located in the historical Devonshire House, a former residential hall of the university's Trinity College. Ritu Birla is the Richard Charles Lee Director of the Asian Institute.\nQuestion: The Devonshire House is located on the outskirts of Toronto. True, False, or Neither? Neither\n###\nWuqiang County () is county of southeastern Hebei province, China. It is under the administration of Hengshui City, with a population of 210,000 residing in an area of 442 km2 . Both China National Highway 307 and G1811 Huanghua\u2013Shijiazhuang Expressway pass through the county.\nQuestion: Wuqiang County is a southeastern province True, False, or Neither? True\n###\nAn opening act, warm-up act, or supporting act is an entertainment act (musical, comedic, or otherwise), that performs at a concert before the featured act, or \"headliner\". Rarely, an opening act may perform again at the end of the event, or perform with the featured act after both have had a set to themselves.\nQuestion: Headliners don't perform at the same time as the opening act True, False, or Neither? True\n###\nCecilia Makiwane Hospital (CMH) is a large, provincial, government funded hospital situated in the Mdantsane township of East London, Eastern Cape in South Africa. It is a tertiary teaching hospital and forms part of the East London Hospital Complex with Frere Hospital. 
It is named after Cecilia Makiwane, the first African woman to become a professional nurse in South Africa.\nQuestion: CMH is the acronym for a hospital named after a person. True, False, or Neither? True\n###\nLaura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture.\nQuestion: Laura Warholic; or, The Sexual Intellectual starts with Eugene and Laura already being divorced. True, False, or Neither?", "doc_id": 505, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43077, 14064, 42490, 34958], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rana amurensis (Khabarovsk frog, Siberian wood frog, Heilongjiang brown frog or Amur brown frog) is a species of true frog found in northern Asia. It ranges across western Siberia, as well as northeastern China, northeastern Mongolia, and on the northern Korean Peninsula and on Sakhalin. \"Rana coreana\" was previously included in this species as a subspecies.\nQuestion: Rana amurenis can be found in the Eastern Hemisphere. True, False, or Neither? True\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\".\nQuestion: Aniket Vishwasrao has appeared on TV shows. True, False, or Neither? Neither\n###\nClub Deportivo Cajamadrid was a professional basketball and handball team in Spain. It was founded in 1979 and the basketball team played in Liga ACB from 1983 to 1986. The club was sponsored by Caja Madrid until 1991, when the bank decided to retire its support and continued as a different club called Juventud Alcal\u00e1.\nQuestion: Club Deportivo Cajamadrid was famous for more than one sport. True, False, or Neither? True\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams.\nQuestion: Gaming crime has decreased since 1992. True, False, or Neither? Neither\n###\nMick Napier (born December 12, 1962) is an American director, actor, teacher and author living in Chicago. He is the founder and artistic director of the Annoyance Theatre and an award-winning director at The Second City. He has directed Stephen Colbert, Tina Fey, Rachel Dratch, Horatio Sanz, Nia Vardalos, Andy Richter, Jeff Garlin, and David Sedaris, amongst others.\nQuestion: Mack Napier has won awards for acting. 
True, False, or Neither?", "doc_id": 355, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42135, 41685, 30846, 42519], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hudson is a town in Hillsborough County, New Hampshire, United States. It is located along the Massachusetts state line. The population was 24,467 at the 2010 census, with an estimated population of 24,645 in 2013. It is the ninth-largest municipality (town or city) in the state, by population.\nQuestion: The Massachusetts state line runs through Hudson True, False, or Neither? True\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000.\nQuestion: No one joined the September 2012 Sun Life financial Players' Championship. True, False, or Neither? False\n###\nDjibouti, officially the Republic of Djibouti, is a country located in the Horn of Africa. It is bordered by Eritrea in the north, Ethiopia in the west and south, and Somalia in the southeast. The remainder of the border is formed by the Red Sea and the Gulf of Aden at the east.\nQuestion: It is bordered by a sea with a name that starts with an R True, False, or Neither? True\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Tricia and Eva are played by the same actress. True, False, or Neither? True\n###\nRichard Colson Baker (born April 22, 1990), better known by his stage names MGK and Machine Gun Kelly, is an American rapper and actor, from Cleveland, Ohio. MGK embarked on a musical career as a teenager, releasing a mixtape in 2006. He went on to release four more mixtapes.\nQuestion: Richard Colson Baker was known for being gentle. True, False, or Neither?", "doc_id": 489, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20214, 12781, 24772, 22092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Carlos Agravante Yap Sr. (born February 15, 1982) is a Filipino professional basketball player for the Rain or Shine Elasto Painters of the Philippine Basketball Association (PBA). 
Known by his nickname Big Game James, he had played for the Star Hotshots for twelve seasons winning seven PBA championships before being traded on 2016. He is also a twelve-time PBA All-Star through 2004 to 2015.\nQuestion: Yap had parents who were farmers. True, False, or Neither? Neither\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock.\nQuestion: Besides being known as a power chord or fifth chord, there are other names for this style of guitar playing. True, False, or Neither? Neither\n###\nPLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA).\nQuestion: PLU Crew consists of 25 members True, False, or Neither? Neither\n###\nA conjectural portrait is a portrait made of a historical figure for whom no authentic contemporary portrait is available. The depiction, then, may be variously informed by written accounts of physical appearance, conjecture based on the subject's culture and background, and/or the artist's conception of the subject's inner essence.\nQuestion: A conjectural portrait is more popular in Asia than in the West True, False, or Neither? Neither\n###\nSaid bin Salim Al Shaksy (Arabic: \u0633\u0639\u064a\u062f \u0628\u0646 \u0633\u0627\u0644\u0645 \u0627\u0644\u0634\u0642\u0635\u064a) (born Zanzibar in 1934 - 2015) was the founder and chairman of The Shaksy Group. Al Shaksy has been a member and Managing Director of several Joint-Stock Companies, including Al Bank Al Ahli Al Omani SAOG, Oman Fisheries Co. SAOG and Oman Hotels Co. SAOG.\nQuestion: Oman Fisheries Co was based in Suadi Arabia True, False, or Neither?", "doc_id": 96, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2084, 23930, 31300, 27769], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996).\nQuestion: She was born in 1956 True, False, or Neither? True\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. 
He played collegiately for the Georgia Tech football team.\nQuestion: Ralph D. Malone later became a sports announcer. True, False, or Neither? Neither\n###\nPlatylesches lamba, the Neave's banded hopper, is a butterfly in the Hesperiidae family. It is found in Ivory Coast, Ghana, Cameroon, the Democratic Republic of the Congo (Shaba), western Uganda, Malawi and northern Zambia. The habitat consists of woodland and open places in the forest zone.\nQuestion: Platylesches lamba, the Neave's banded hopper, is a butterfly in the Hesperiidae family. It is found in many parts of Africa. They like to live in forests. True, False, or Neither? Neither\n###\nTadpoles is the third album by the Bonzo Dog Band. It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\".\nQuestion: The TV show, Do Not Adjust Your Set, had a lot of songs from the Bonzo Dog Band. True, False, or Neither? True\n###\nLemoyne is an unincorporated community and census-designated place in northern Keith County, Nebraska, United States. It lies along Nebraska Highway 92 on the northern shore of Lake C.W. McConaughy, north of the city of Ogallala, the county seat of Keith County. Its elevation is 3,333\u00a0feet (1,016\u00a0m). Although Lemoyne is unincorporated, it has a post office, with the ZIP code of 69146.\nQuestion: Lemoyne is part of the census for statistical purposes. True, False, or Neither?", "doc_id": 953, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11121, 26979, 32833, 21097], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "William Lang Denholm \"Bill\" McCue OBE (1934\u20131999) was a Scottish singer known for his performances in opera, musical theatre and traditional Scottish folk music. In 1982 he was awarded an OBE for his contribution to Scottish music. In 1999 he died aged 65.\nQuestion: Traditional Scottish folk music became very popular in nineteen hundred fifty five. True, False, or Neither? Neither\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night.\nQuestion: Benjamin Stoloff thought Bela Lugosi played the most important character in Night of Terror. True, False, or Neither? Neither\n###\nThe Nanking Massacre was an episode of mass murder and mass rape committed by Japanese troops against the residents of Nanjing (\"Nanking\"), then the capital of the Republic of China, during the Second Sino-Japanese War. 
The massacre is also known as the Rape of Nanking or, using Pinyin romanization, the Nanjing Massacre or Rape of Nanjing.\nQuestion: Nanjing is still the capital of the Republic of China True, False, or Neither? Neither\n###\nSwaay is the debut EP by American band DNCE. The EP was released worldwide on October 23, 2015, by Republic Records. The majority of the EP was co-produced and co-written by lead singer and frontman Joe Jonas. The EP debuted at number 39 on the US \"Billboard\" 200.\nQuestion: DNCE won many awards for the EP True, False, or Neither? Neither\n###\nCherry, Harry & Raquel! is a 1970 softcore exploitation film produced and directed by American film director Russ Meyer. Following the success of \"Vixen!\" (1968), the film is notable for the first appearance of actor (and Meyer regular) Charles Napier playing Harry Thompson, a California border sheriff and marijuana smuggler who makes a reappearance in 1975's \"Supervixens\".\nQuestion: Supervixens premiered before 1974. True, False, or Neither?", "doc_id": 269, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12903, 28227, 27216, 39361], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Identification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\".\nQuestion: Jerzy Skolimowski has directed 10 other feature films, since Identification Marks: None. True, False, or Neither? Neither\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia.\nQuestion: The Big 12 Conference has the fewest members of any conference. True, False, or Neither? Neither\n###\nThe Protectorate of Bohemia and Moravia (German: \"Protektorat B\u00f6hmen und M\u00e4hren\" ; Czech: \"Protektor\u00e1t \u010cechy a Morava\" ) was a protectorate of Nazi Germany established following the German occupation of Czechoslovakia. Earlier in 1938, with the Munich Agreement, Sudetenland territory of Czech Lands was incorporated into Nazi Germany as a Reichsgau.\nQuestion: Germany was occupied by Czechoslovakia during the 1930's. True, False, or Neither? False\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Broadway Rose has a short middle. True, False, or Neither? 
Neither\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation.\nQuestion: Paul Hausser was born in eighteen hundred eighty one. True, False, or Neither?", "doc_id": 686, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13787, 5957, 18093, 1284], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Doomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent.\nQuestion: Doomsday Device was performed by bozo True, False, or Neither? Neither\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910.\nQuestion: Yi Bangja did not end up being the empress of the Empire of Korea. True, False, or Neither? True\n###\nBarry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions.\nQuestion: Barry and Stuart like each other True, False, or Neither? Neither\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C.\nQuestion: Luton Town Ladies Football Club formed a partnership with Luton Town F.C. 3 years after they had been established. True, False, or Neither? True\n###\nClub Deportivo Cajamadrid was a professional basketball and handball team in Spain. It was founded in 1979 and the basketball team played in Liga ACB from 1983 to 1986. The club was sponsored by Caja Madrid until 1991, when the bank decided to retire its support and continued as a different club called Juventud Alcal\u00e1.\nQuestion: caja madrid started sponshorship of Club Deportivo Cajamadrid in 1979. True, False, or Neither?", "doc_id": 601, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36728, 31827, 24775, 31638], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line.\nQuestion: The Purple Line will be an improvement to mass transit in the D.C. metropolitan area. True, False, or Neither? Neither\n###\nCorn crab soup is a dish found in Chinese cuisine, American Chinese cuisine, and Canadian Chinese cuisine. The soup is actually cream of corn soup with egg white and crab meat or imitation crab meat added. It is most likely of southern Chinese origin.\nQuestion: Corn crab soup is definitely of northern Chinese origin. True, False, or Neither? False\n###\nPLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA).\nQuestion: PLU Crew has won many championshps True, False, or Neither? Neither\n###\nMadava Farms is an 800-acre certified organic maple syrup enterprise located primarily in Dover, New York. The farm is the maker of Crown Maple Syrup, and it is considered to be the largest maple syrup production facility in North America. \nQuestion: Madava Farms is at least 1 acre True, False, or Neither? True\n###\nJay Ferguson (born John Arden Ferguson; May 10, 1947) is an American rock/pop musician, known for his work with Spirit and Jo Jo Gunne, and his 1978 solo hit \"Thunder Island\". His later career has been as a composer of music for television programs and films.\nQuestion: Jay Ferguson is friends with Spirit and Jo Jo Gunn True, False, or Neither?", "doc_id": 964, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24774, 43027, 4695, 15421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "PLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA).\nQuestion: PLU Crew consists of 40 members True, False, or Neither? Neither\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. 
The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label.\nQuestion: Bad Company was recorded at Headley Grange with Ronnie Lane's Mobile Studio in month after Halloween in the year that equals 2073 minus 100. True, False, or Neither? True\n###\nThe iHeartRadio Much Music Video Awards (also known as the MMVAs, and originally known as the Canadian Music Video Awards until 1995, and formerly and commonly known as the MuchMusic Video Awards) are annual awards presented by the Canadian television channel Much to honour the year's best music videos.\nQuestion: The MuchMusic Video Awards are held annually. True, False, or Neither? True\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: Chris Gardner wanted this movie to be made about his life. True, False, or Neither? Neither\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide.\nQuestion: Mutual Friends had 8 protagonists True, False, or Neither?", "doc_id": 197, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26362, 9741, 5014, 25571], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Progress in Materials Science is a journal publishing review articles covering most areas of materials science, published by the Pergamon imprint of Elsevier. It was started in 1949 with the title \"Progress in Metal Physics\" with Bruce Chalmers serving as first editor. It was changed to the current title in 1961.\nQuestion: The Pergamon imprint evolved from Butterworth Springer but both are seeking the same audience. True, False, or Neither? Neither\n###\nPrince Louis Ferdinand Oskar Christian of Prussia (German: \"Louis Ferdinand Oskar Christian Prinz von Preu\u00dfen\"; 25 August 1944 \u2013 11 July 1977), also called Louis Ferdinand II or Louis Ferdinand Jr., nicknamed \"Lulu\", was a member of the House of Hohenzollern and the fifth of seven children of Prince Louis Ferdinand of Prussia and his wife, Grand Duchess Kira of Russia.\nQuestion: The Grand Duchess Kira of Russia was born on 25 August 1914. True, False, or Neither? Neither\n###\nAn experience point (often abbreviated to exp or XP) is a unit of measurement used in tabletop role-playing games (RPGs) and role-playing video games to quantify a player character's progression through the game. 
Experience points are generally awarded for the completion of quests, overcoming obstacles and opponents, and for successful role-playing.\nQuestion: XP is only used in tabletop games. True, False, or Neither? False\n###\nTripoli Municipal Stadium is a 22,000 capacity multi-use stadium in Tripoli, Lebanon. It is located near the city center. It was recently rehabilitated to welcome Arabic competitions as well as Asian and International ones. It is also the home ground of Tripoli SC.\nQuestion: Recently, Tripoli Municipal Stadium has seen less revenue than in past years. True, False, or Neither? Neither\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: Yannis Philippakis was born in nineteen hundred seventy five. True, False, or Neither?", "doc_id": 210, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6537, 7721, 10527, 25526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label.\nQuestion: Lance King started Nightmare in 1962 True, False, or Neither? False\n###\nResorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming.\nQuestion: Resorts Casino Tunica has gone through a couple name changes over the years. True, False, or Neither? True\n###\n\"Emigrante del Mundo\" is the debut single of Lucenzo. It was released in France initially in 2007 and a second time in 2010 after the success of the kuduro music promoted by Lucenzo's new hits. It also appears in the debut album of Lucenzo of the same title \"Emigrante del Mundo\".\nQuestion: Lucenzo wrote more than one song. True, False, or Neither? True\n###\nTripoli Municipal Stadium is a 22,000 capacity multi-use stadium in Tripoli, Lebanon. It is located near the city center. It was recently rehabilitated to welcome Arabic competitions as well as Asian and International ones. It is also the home ground of Tripoli SC.\nQuestion: The stadium began with an 11,000 capacity, but was later expanded to 22,000. True, False, or Neither? 
Neither\n###\nBoleslav William Felix Robert Sienkiewicz ( ; born May 3, 1958), better known as Bill Sienkiewicz, is an Eisner Award-winning American artist who produces comic book art, primarily for Marvel Comics' \"The New Mutants\" and \"\". Sienkiewicz often utilizes oil painting, collage, mimeograph, and other forms generally uncommon in comic books.\nQuestion: Boleslav William Felix Robert Sienkiewicz has won an Oscar award. True, False, or Neither?", "doc_id": 182, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35853, 8724, 10616, 10421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forestville Commonwealth is an archaeological site and national historic district located at Earlton in Greene County, New York. The district contains seven contributing sites. It represents the remains of a utopian community built in 1826-1827 as one of three Owenite experiments in New York State.\nQuestion: Forestville Commonwealth is one of four Owenite experiments in New York. True, False, or Neither? False\n###\nAvani Modi is an Indian model and film actress, a well-known face in Indian movies and theatre plays in Gujarati theatre She made her Bollywood debut in Madhur Bhandarkar's drama film \"Calendar Girls\", which is scheduled to release on 25 September 2015. The movie is based upon the story of five girls and their journey as an annual calendar girl.\nQuestion: Calendar Girls only has five actresses in it. True, False, or Neither? Neither\n###\nVladislav Adolfovitch Rusanov (Russian: \u0412\u043b\u0430\u0434\u0438\u0441\u043b\u0430\u0432 \u0410\u0434\u043e\u043b\u044c\u0444\u043e\u0432\u0438\u0447 \u0420\u0443\u0441\u0430\u043d\u043e\u0432 ) is a fantasy writer, candidate of technical sciences (1999). Writes in Russian language. Also is known for translations of fantasy and romantic poetry into Russian. Formerly a Ukrainian citizen he now identifies with the Donetsk People's Republic.\nQuestion: Rusanov is Russian. True, False, or Neither? True\n###\nEdwin John Ellis (1848 \u2013 1916) was a British poet and illustrator. He is now remembered mostly for the three-volume collection of the works of William Blake he edited with W. B. Yeats. It is now criticised, however, for weak scholarship, and preconceptions.\nQuestion: Edwin John Ellis died of consumption. True, False, or Neither? Neither\n###\nThe 2016 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the second edition of the tournament which was part of the 2016 ATP Challenger Tour. It took place in Happy Valley, Australia between 2\u201310 January 2016.\nQuestion: A professional tennis tournament played on soft courts took place in Australia. True, False, or Neither?", "doc_id": 930, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7184, 13567, 39570, 39949], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pixote: a Lei do Mais Fraco (] , lit. \"Pixote (small child): The Law of the Weak\") is a 1980 Brazilian drama film directed by H\u00e9ctor Babenco. The screenplay was written by Babenco and Jorge Dur\u00e1n, based on the book \"A Inf\u00e2ncia dos Mortos\" (\"The Childhood of the Dead Ones\") by Jos\u00e9 Louzeiro.\nQuestion: the book \"A Inf\u00e2ncia dos Mortos\"is translated to mean \"The Childhood of the Dead Ones\" True, False, or Neither? True\n###\nAjay Nagrath is an Indian television and movie actor and is the son of Bollywood actor Anil Nagrath. Currently, he plays the role of \"Pankaj\" in C.I.D. He has done many roles in many TV shows and even films, but there came a point in his life when he was unhappy that his weight had become his identity in the industry. He said \"I used to be a couch potato.\"\nQuestion: Ajay Nagrath was known by his weight True, False, or Neither? True\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\".\nQuestion: The Brown Spectator has very few libertarian readers. True, False, or Neither? Neither\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician.\nQuestion: amor a la mexicana is the spanish way of mexican-syle love True, False, or Neither? Neither\n###\nEmmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\".\nQuestion: Frechette won the Canadian Screen Award for Best Art Direction or Production Design at the 3rd Canadian Screen Awards. True, False, or Neither?", "doc_id": 59, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3863, 4811, 33534, 23938], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hidden City Entertainment was a game publisher founded in 2004 (as Hidden City Games, Inc.) by Jesper Myrfors and Paul Peterson to develop and market the chip-throwing game, \"Clout Fantasy.\" After Clout was developed the company recruited Peter Adkison as CEO.\nQuestion: Hidden City Entertainment was founded in 2001 True, False, or Neither? 
False\n###\nCarlo's Bake Shop, commonly known as Carlo's Bakery and also known as Carlo's City Hall Bake Shop, is a bakery in Hoboken, New Jersey, owned by Buddy Valastro. The bakery has gained public attention as the setting of the TLC television series, \"Cake Boss\".\nQuestion: The TLC television series Cake Boss takes place in Missouri. True, False, or Neither? False\n###\nMy Super D is a 2016 Philippine superhero fantasy drama television series directed by Frasco Mortiz and Lino Cayetano, starring Dominic Ochoa in his first leading role, together with Marco Masa and Bianca Manalo. The series was aired on ABS-CBN's \"Primetime Bida\" evening block and worldwide on The Filipino Channel from April 18, 2016 to July 15, 2016, replacing \"Game ng Bayan\".\nQuestion: Marco Masa has worked with Frasco Mortiz. True, False, or Neither? True\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005.\nQuestion: Samuel Eto'o Fils was born in nineteen hundred eighty seven. True, False, or Neither? False\n###\nThe Achilles Club is a track and field club formed in 1920 by and for past and present representatives of Oxford and Cambridge Universities. Members have won 19 Olympic Gold Medals (most recently Steph Cook in the pentathlon), and held 38 World Records. One of its founding members was Evelyn Aubrey Montague, who is immortalized in the 1981 film \"Chariots of Fire\".\nQuestion: The Steph Cook is an Olympic Gold Medal in the pentathlon. True, False, or Neither?", "doc_id": 548, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13844, 34597, 33805, 43778], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Van Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor.\nQuestion: Van Cleef & Arpels make more flower themed objects than they do animal themed objects. True, False, or Neither? Neither\n###\nCastle Rock Estate is an Australian winery based at Porongurup, in the Great Southern wine region of Western Australia. According to prominent Australian wine writer James Halliday, it has an exceptionally beautifully sited and immaculately maintained vineyard, winery and cellar door sales area with sweeping vistas from the Porongurups.\nQuestion: Castle Rock Estate grows grapes True, False, or Neither? True\n###\nRuth Gentry (February 22, 1862 \u2013 October 18, 1917) was a pioneering American woman mathematician during the late 19th century and the beginning of the 20th century. 
She was the first native-born Indiana woman to acquire a PhD degree in mathematics, and most likely the first woman born in Indiana to receive a doctoral degree in any scientific discipline.\nQuestion: More women went to college for degrees in math because of Ruth. True, False, or Neither? Neither\n###\nAdenanthos terminalis, commonly known as gland flower, yellow gland flower or adenanthos, is a one metre tall shrub in the family Proteaceae. It is found in south eastern regions of Australia, in the states of South Australia and Victoria, and is the most widespread of the two \"Adenanthos\" species occurring outside of Western Australia.\nQuestion: Adenanthos terminalis is in the family protaceae True, False, or Neither? True\n###\nThe Bowling Green Falcons men's basketball team is the basketball team that represent Bowling Green State University in Bowling Green, Ohio. The school's team currently competes in the Mid-American Conference. The team last played in the NCAA Division I Men's Basketball Tournament in 1968. The Falcons are now coached by Michael Huger, their 17th head coach.\nQuestion: The Falcons appeared in the NCAA Division I Men's Basketball Tournament after 1968. True, False, or Neither?", "doc_id": 272, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34433, 19605, 7841, 44995], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ethan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films.\nQuestion: Ethan Suplee was born in 1976. True, False, or Neither? True\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Petasites is a type of dog True, False, or Neither? False\n###\nWe Joined the Navy is a 1962 British comedy film produced by Daniel M. Angel and directed by Wendy Toye which stars Kenneth More, Lloyd Nolan, Joan O'Brien, Derek Fowlds, Graham Crowden, Esma Cannon and John Le Mesurier. It was based on the novel of the same name by John Winton.\nQuestion: We Joined the Navy has 69 actors in it. True, False, or Neither? Neither\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc.\nQuestion: there are 6 people per household on average True, False, or Neither? 
False\n###\nOld Carthusians Football Club is an association football club whose players are former pupils of Charterhouse School in Godalming, Surrey, England. The club was established in 1876 and won the FA Cup in 1881, as well as the FA Amateur Cup in 1894 and 1897. The club currently plays in the Arthurian League and won league and Arthur Dunn Cup doubles in 2006, 2008, 2009, 2011, 2013 and 2014.\nQuestion: The Club did not play in the Arthurian League in 2005. True, False, or Neither?", "doc_id": 415, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16046, 1162, 35009, 31717], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern.\nQuestion: McKinnon started Little League football as a running back. True, False, or Neither? Neither\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall.\nQuestion: There are a lot of artifacts in the museum. True, False, or Neither? Neither\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: Juan Cruz Aragone was ranked #5 in the world. True, False, or Neither? Neither\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staunton Mall shoppers are mostly poor people True, False, or Neither? Neither\n###\nOn July 16, 2009, Harvard University professor Henry Louis Gates Jr. was arrested at his Cambridge, Massachusetts home by local police officer Sgt. James Crowley, who was responding to a 9-1-1 caller's report of men breaking and entering the residence. 
The arrest initiated a series of events that unfolded under the spotlight of the international news media.\nQuestion: Henry Louis Gates Jr is a teacher True, False, or Neither?", "doc_id": 326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42493, 22894, 9412, 15484], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Utamaro and His Five Women or Five Women Around Utamaro (Japanese: \u6b4c\u9ebf\u3092\u3081\u3050\u308b\u4e94\u4eba\u306e\u5973 , Hepburn: Utamaro o meguru gonin no onna ) is a 1946 Japanese film directed by Kenji Mizoguchi. It is based on the novel of the same title by Kanji Kunieda, itself a fictionalized account of the life of printmaker Kitagawa Utamaro. It was Mizoguchi's first film made under the American occupation.\nQuestion: This film is a biography. True, False, or Neither? False\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: One of the counties that got the album starts with an A True, False, or Neither? True\n###\nMy Famous Family is a British television programme on genealogy, co-hosted by Bill Oddie and Guy de la B\u00e9doy\u00e8re. Each episode shows an ordinary member of the public with a famous ancestor: Queen Victoria, Florence Nightingale, George Stephenson, Lawrence of Arabia, or the Duke of Wellington.\nQuestion: Florence Nightgale was covered by Guy de la B\u00e9doy\u00e8re in My Famous Family. True, False, or Neither? True\n###\n\"Se Telefonando\" is a song performed by the Italian singer Mina, released in May 1966. The music was composed, orchestrated and conducted by Ennio Morricone to Italian lyrics by Di Chiara and Costanzo. (Reportedly Costanzo only contributed one word, in editing a previous version of a verse, to avoid censorship). The song was written for a radio broadcast, called \u201cAria condizionata\u201d.\nQuestion: The Song was only sang in italian (never had a cover in a different language) True, False, or Neither? Neither\n###\nNana Kwame Abrokwa (born 5 October 1968) is a Ghanaian born German rapper and DJ, performing under the pseudonyms Nana or Darkman / Nana. Nana is not an actual first name, but a Ghanaian title of nobility. His most important achievement came in the second half of the 1990s, when his style was characterized as \"euro-rap\".\nQuestion: Ghana produced a notable euro-rapper in 1968. True, False, or Neither?", "doc_id": 225, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8149, 14500, 4257, 39359], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire.\nQuestion: This game is the most watched game in Australia True, False, or Neither? Neither\n###\nSoul Ballet is a musical project of actor, producer, arranger, programmer, and multi-instrumentalist Rick Kelly \"RK.\" Soul Ballet\u2019s music is smooth contemporary jazz/electronica, characterized as pulsating electronic beats entwined with a dark, moody atmosphere.\nQuestion: Rich Kelly plays the guitar. True, False, or Neither? Neither\n###\nPrincess Antoinette of Monaco, Baroness of Massy (Antoinette Louise Alberte Suzanne Grimaldi; 28 December 1920 \u2013 18 March 2011) was a member of the princely family of Monaco and the elder sister of Prince Rainier III and aunt of Albert II, Prince of Monaco. Her parents were Count Pierre de Polignac and Princess Charlotte, Duchess of Valentinois.\nQuestion: Count Pierre de Polignac died on December 28th, 1921. True, False, or Neither? Neither\n###\nDr. Jeckyll & Mr. Hyde was an American 1980s hip-hop group consisting of Andre \"Dr. Jeckyll\" Harrell and Alonzo \"Mr. Hyde\" Brown. The group was known for its corporate business image, wearing designer suits and ties while they rapped. The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records in 1980.\nQuestion: Dr. Jeckyll & Mr. Hyde consisted of more than 1 member. True, False, or Neither? True\n###\nSamuel Bronston (Samuel Bronshtein, March 26, 1908, Bessarabia \u2013 January 12, 1994, Sacramento, California) was a Bessarabian-born American film producer, film director, and a nephew of socialist revolutionary figure, Leon Trotsky. He was also the petitioner in a U.S. Supreme Court case that set a major precedent for perjury prosecutions when it overturned his conviction.\nQuestion: Samuel Bronston never met his uncle, Leon Trotsky. True, False, or Neither?", "doc_id": 377, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24904, 876, 42600, 9296], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Studies in Mutualist Political Economy is a book on political economy published on 2007 by American mutualist anarchist Kevin Carson. In its preface Carson describes this work as \"an attempt to revive individualist anarchist political economy, to incorporate the useful developments of the last hundred years, and to make it relevant to the problems of the twenty-first century.\"\nQuestion: Studies in Mutualist Political Economy was written by a anarchist True, False, or Neither? 
True\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan.\nQuestion: New York's hockey team was called the Islanders True, False, or Neither? True\n###\nDarren Horrigan (born 2 June 1983) is an English footballer who played in the Football League for Lincoln City. A goalkeeper born in Middlesbrough, Horrigan began his career with Birmingham City, and went on to play non-League football for clubs including Stamford Town, Cambridge City, Ilkeston Town, Spennymoor United, Scarborough, Gateshead, Bishop Auckland and Tow Law Town.\nQuestion: Darren Horrigan is from England. True, False, or Neither? True\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: The people of Keystone have a wonderful view of the river. True, False, or Neither? Neither\n###\nAlexandre \"Xande\" Ribeiro (born January 20, 1981 in Manaus-Amazonas, Brazil), is a Brazilian Jiu-Jitsu practitioner, mixed martial artist and submission wrestler. He is a two-time World (Mundial) Black Belt Absolute (open weight) World Jiu-Jitsu Champion, five-time World (Mundial) Black Belt Heavy Weight Champion, and three-time World Black Belt Pro Division Champion.\nQuestion: Alexandre \"Xande\" Ribeiro is 38 Years old True, False, or Neither?", "doc_id": 935, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39131, 40876, 35834, 8829], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: That Girl Lucy Moon was inspired by Amy's daughter. True, False, or Neither? Neither\n###\nThe Blackstone Chronicles is a serialized novel by American horror and suspense author John Saul. The series consists of six installments and takes place in a fictional New Hampshire town called Blackstone. The series has been adapted into both a computer game and graphic novel.\nQuestion: New Hampshire has a town called Blackstone. True, False, or Neither? False\n###\nGrant Taylor (Born October 30,1991) is an American professional skateboarder. He is the son of former professional skateboarder Thomas Taylor and won Thrasher Magazine's \"Skater of The Year\" in 2011. Grant\u2019s style of skateboarding is known to be fast and powerful. 
He is recognized for his unique versatile skateboarding.\nQuestion: Grant Thomas will teach his son to skateboard. True, False, or Neither? Neither\n###\nStanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh.\nQuestion: Woods was also an elite baseball pitcher. True, False, or Neither? Neither\n###\nThe MAV-1 (Maneuvering Air Vehicle) is a low observable Unmanned Air Vehicle prototype developed between ST Aerospace and Defence Science and Technology Agency for its swarming unmanned air vehicle research programme. The prototype was unveiled in Asian Aerospace 2004 and the first test flight was reported in 2005.\nQuestion: The MAV-1 is controlled by a person. True, False, or Neither?", "doc_id": 506, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17066, 7111, 42954, 42158], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa.\nQuestion: Use of Jara is increasing and is replacing other Nigerian languages True, False, or Neither? False\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts.\nQuestion: Borage is native to Arizona. True, False, or Neither? Neither\n###\nMaryborough Airport (IATA: MBH,\u00a0ICAO: YMYB) is located approximately 3 km north of the town centre. The airport serves as a small regional airport serving Maryborough and Rainbow Bay. However, increasing competition with Hervey Bay Airport has led to a decrease in commercial air traffic.\nQuestion: Maryborough Airport is in New Zealand. True, False, or Neither? Neither\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport.\nQuestion: Gulf Air services only 41 destinations True, False, or Neither? True\n###\n\"Pour Me\" is a debut song recorded by American country music group Trick Pony. It was released in October 2000 as the first single from their debut album \"Trick Pony\". 
The song was written by group members Keith Burns, Ira Dean and Heidi Newfield with Rory Waters Beighley and Sammy Harp Wedlock.\nQuestion: Trick Pony is the name of an American country music group, their debut album and one of their songs. True, False, or Neither?", "doc_id": 293, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19005, 16500, 10799, 8713], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Otard, also known as Chateau de Cognac, is a French cognac house founded in 1795 by Jean-Baptiste Antoine Otard. The company has remained in the hands of the same family since its establishment. The firm is based in the Ch\u00e2teau des Valois (Ch\u00e2teau de Cognac), Cognac, Charente, its home since 1796.\nQuestion: Otard was founded five years prior to 1800. True, False, or Neither? True\n###\nTo Drown A Rose is a single by Death in June. Additional music performers include: Christ 777, Douglas P., Gary Carey, Jan O', John Balance, Rose McDowall. The vinyl has the phrases \"Our time has been...\" and \"...and will be again\" scratched into it. The test pressing for this release was done on 12\" vinyl as opposed to the finalized 10\" format.\nQuestion: To Drown A Rose is known by millions. True, False, or Neither? Neither\n###\nRuth Pryor (1906-2001) was a Chicago ballet dancer and instructor, and the first American ballerina to dance the role of the Swan Queen in Swan Lake, in 1930. She was known for \"her feat of whirling thirty-six times a minute on her toes,\" according to the Purple Parrot of Northwestern University.\nQuestion: Ruth Pryor was married True, False, or Neither? Neither\n###\nRodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets.\nQuestion: Rodrequis La'Vant Stephens was the highest paid player when he was on the Seattle Seahawks True, False, or Neither? Neither\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas.\nQuestion: Lloyd Cole is also the name of more than one person True, False, or Neither?", "doc_id": 712, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19700, 35384, 10530, 30631], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ashcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: Ashcroft is in the USA. True, False, or Neither? True\n###\nBrown University is a private Ivy League research university in Providence, Rhode Island, United States. Founded in 1764 as the College in the English Colony of Rhode Island and Providence Plantations, Brown is the seventh-oldest institution of higher education in the United States and one of the nine colonial colleges chartered before the American Revolution.\nQuestion: Brown University has a debate team True, False, or Neither? Neither\n###\nThe Return of the Condor Heroes, also called The Giant Eagle and Its Companion, is a wuxia novel by Jin Yong (Louis Cha). It is the second part of the \"Condor Trilogy\" and was preceded by \"The Legend of the Condor Heroes\" and followed by \"The Heaven Sword and Dragon Saber\". It was first serialised between 20 May 1959 and 5 July 1961 in the Hong Kong newspaper \"Ming Pao\".\nQuestion: It was the second wuxia novel ever written True, False, or Neither? Neither\n###\nThe Sierra Leone Civil War (1991\u20132002) began on 23 March 1991 when the Revolutionary United Front (RUF), with support from the special forces of Charles Taylor\u2019s National Patriotic Front of Liberia (NPFL), intervened in Sierra Leone in an attempt to overthrow the Joseph Momoh government. The resulting civil war lasted 11 years, enveloped the country, and left over 50,000 dead.\nQuestion: The Sierra Leone Civil War didn't accomplish anything. True, False, or Neither? Neither\n###\nThe Corridor (Lithuanian: Koridorius ) is a 1995 Lithuanian drama film directed by \u0160ar\u016bnas Bartas. It has a fragmentary narrative without dialogue and depicts several people in Vilnius. According to the director, the title symbolizes \"the atmosphere of a corridor between yesterday and today, containing many doors\".\nQuestion: Vilnius symbolizes \"the atmosphere of a corridor between yesterday and today, containing many doors\". True, False, or Neither?", "doc_id": 235, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36299, 10585, 6055, 7772], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Exergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. 
Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid.\nQuestion: Exergonix Inc, is an energy storage company based in Kansas City, Missouri that develops and builds renewable energy solutions for a wide range of applications. True, False, or Neither? Neither\n###\nIrfan Khoosat (Urdu: \u0639\u0631\u0641\u0627\u0646 \u06a9\u06be\u0648\u0633\u0679\u200e ) is a Pakistani actor, producer and a well-known comedian. He is famous for his comic role as \"Hawaldar Karamdad\" in the TV series Andhera Ujala in which he portrayed simpleton and blabbermouth character of a low-ranked policeman. He is also known as stage comedian. He also won Nigar Award for his comic role in 1985 film \"Hum se hai zamana\".\nQuestion: Irfan will continue his career until 2020. True, False, or Neither? Neither\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound.\nQuestion: German dogs are good scent dogs. True, False, or Neither? Neither\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg.\nQuestion: The swan diagram is an outdated model. True, False, or Neither? Neither\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013.\nQuestion: Brandon Tyler McManus met with Amy. True, False, or Neither?", "doc_id": 110, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17236, 8438, 2513, 42031], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records.\nQuestion: Weezer is a Chinese rock band True, False, or Neither? False\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals.\nQuestion: Udinese Calcio finished first place in Serie A True, False, or Neither? 
False\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others.\nQuestion: The film was produced by Joseph Lovett. True, False, or Neither? Neither\n###\nFaer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting.\nQuestion: The inconsequential place of Faerun is talked about sometimes in the D&D lore True, False, or Neither? Neither\n###\nDonaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville.\nQuestion: Donaldson Center Airport is located in South Carolina, six kilometers south of the central business district of Greenville. True, False, or Neither?", "doc_id": 35, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3964, 3577, 6349, 7501], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Eric Black is an American journalist. He was a longtime reporter for the Minnesota Star Tribune newspaper, and has also been a Twin Cities blogger. He is a columnist for online newspaper MinnPost, primarily writing about politics and the historical background of current issues.\nQuestion: Eric Black has been employed by a company True, False, or Neither? True\n###\nSt. Ives Town F.C. is a football club based in St Ives, Cambridgeshire, England. They play in the Southern League Premier Division. This St Ives Town should not be confused with the Cornwall Combination team playing in St Ives, Cornwall, which is also called St Ives Town F.C.\nQuestion: St. Ives Town F.C. has never won a championship True, False, or Neither? Neither\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes.\nQuestion: Hook, Line and Sinker is an Australian fishing television program which runs for 60 minutes True, False, or Neither? False\n###\nThe Stranger Left No Card (1952) is a British short film directed by Wendy Toye. 
The film won the Best Fiction award at the 1953 Cannes Film Festival, where it was described as \"a masterpiece\" by Jean Cocteau. It marked the film debut of actor Alan Badel.\nQuestion: The short film aired in the very early 1950's True, False, or Neither? True\n###\nSergeant Alistair Slater, MM (25 July 1956 \u2013 2 December 1984), was a British Army soldier who served in B Squadron, Air (7) Troop, 22 Special Air Service (SAS), who was killed on 2 December 1984 while on operations against the Provisional Irish Republican Army in Kesh, a village in County Fermanagh in Northern Ireland.\nQuestion: Slater was killed by the Provisional Irish Republican Army True, False, or Neither?", "doc_id": 613, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39673, 38599, 24591, 12258], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: It is sunny outside True, False, or Neither? Neither\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Augusta County is in the middle of the United States. True, False, or Neither? Neither\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph Ernst Friedrich von Forcade de Biaix was a good speaker. True, False, or Neither? Neither\n###\nJoel Rueben Madden (born Joel Rueben Combs; March 11, 1979) is the lead vocalist for the American pop punk band Good Charlotte, as well as a record producer, actor, DJ, and UNICEF Goodwill Ambassador. He is also part of the pop rock collaboration The Madden Brothers with his twin brother Benji Madden.\nQuestion: Joel Madden has used the same last name his whole life True, False, or Neither? False\n###\nMax Carver (born Robert Maxwell Martensen Jr; August 1, 1988) is an American actor. He is known for his role as Preston Scavo in the ABC television series \"Desperate Housewives\", and as Aiden on the MTV teen-horror drama \"Teen Wolf\". He starred in the first season of the HBO series \"The Leftovers\". 
His twin brother Charlie Carver portrayed the twin of his characters in all three shows.\nQuestion: He was born in the century before the current one True, False, or Neither?", "doc_id": 881, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27888, 16220, 41666, 26102], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Burnaston is a village and civil parish in the South Derbyshire district of Derbyshire, England, just south-west of the city of Derby. The population of the civil parish at the 2011 Census was 1,531. The village has swift and direct road links with nearby cities Derby and Nottingham, as well as the city of Birmingham that is some forty miles south along the A38 dual carriageway.\nQuestion: Burnaston is just north-west of the city of Derby. True, False, or Neither? False\n###\nJames Carlos Agravante Yap Sr. (born February 15, 1982) is a Filipino professional basketball player for the Rain or Shine Elasto Painters of the Philippine Basketball Association (PBA). Known by his nickname Big Game James, he had played for the Star Hotshots for twelve seasons winning seven PBA championships before being traded on 2016. He is also a twelve-time PBA All-Star through 2004 to 2015.\nQuestion: Big Game James is a 12 time PBA All-star. True, False, or Neither? True\n###\nSkaneateles ( or ) is an affluent village in the town of Skaneateles, Onondaga County, New York, United States. The village is named from and located on the shores of Skaneateles Lake, one of the Finger Lakes. As of the 2010 census, the village had a population of 2,450 residents.\nQuestion: Skaneateles has a population of almost 2500 people True, False, or Neither? True\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys.\nQuestion: Scott Edward Morriss was born in 1980 True, False, or Neither? False\n###\nBullitt East High School is a high school located at 11450 Highway 44 East in the city of Mount Washington, Kentucky. It is part of the Bullitt County Public Schools district. Sports teams include: Archery, Swimming, Football, Soccer, Tennis, Track and Field, Baseball, Softball, Wrestling, Basketball, Volleyball and Cheerleading.\nQuestion: Bullitt East High School is not in Washington. True, False, or Neither?", "doc_id": 7, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41671, 37368, 118, 3161], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Glacier retreat or glacial retreat is type of glacial motion discussed in several articles, depending on the time frame of interest, and whether the climatological process or individual glaciers are being considered. Articles on these topics include:\nQuestion: Glacier retreat is not explained in this articles. True, False, or Neither? Neither\n###\nAirline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya.\nQuestion: Koki Mutungi was born in Kenya and is the fist female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. True, False, or Neither? Neither\n###\nKimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996).\nQuestion: Kimberly was the star actress in \"The Big Blue\". True, False, or Neither? Neither\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg.\nQuestion: Marie Hedwig Aususte was Archduchess of Saxe-Lauenburg. True, False, or Neither? False\n###\nNewtrament is a musician, MC and DJ known for releasing an early UK electro/hip hop record - \"London Bridge is Falling Down\" - on Jive Records. It was based on the nursery rhyme (previously adapted by the reggae group Culture) with a political message that electoral politics were a sham.\nQuestion: newtrament didn't write london bridge is falling down True, False, or Neither?", "doc_id": 849, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21566, 3444, 41416, 26779], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arturo Guzm\u00e1n Decena (a.k.a. Z-1) (13 January 1976 \u2013 21 November 2002) was a Mexican Army Special Forces operative who in 1997 defected to the Gulf Cartel and subsequently founded the criminal syndicate's enforcement wing at the behest of drug baron Osiel C\u00e1rdenas Guill\u00e9n. Known today as Los Zetas, the cartel's armed wing ultimately broke apart and formed its own drug trafficking organization.\nQuestion: arturo guzman decena is best known for Z-1 for his gun knowledge True, False, or Neither? 
Neither\n###\nDiablo is a 2015 Canadian-American psychological western film co-written and directed by Lawrence Roeck and starring Scott Eastwood, Walton Goggins, Camilla Belle and Danny Glover. It was the first Western starring Eastwood, the son of Western icon Clint Eastwood.\nQuestion: 2015 movie Diablo starred Clint Eastwood. True, False, or Neither? False\n###\nLegoland Discovery Center Dallas Fort Worth is an indoor family entertainment center located at Grapevine Mills mall in Grapevine, Texas, which is situated between the cities of Dallas and Fort Worth, Texas. The attraction includes Lego-theme rides, a soft play area, a 4D cinema and a gift shop. The center is owned and operated by British leisure group Merlin Entertainments.\nQuestion: The most prevalent injury at Legoland Discovery Center is sunburns. True, False, or Neither? Neither\n###\nThe 1919 PGA Championship was the second PGA Championship, which is now considered one of golf's major championships. It was held September 16\u201320 at the Engineers Country Club in Roslyn Harbor, New York, east of New York City on Long Island in Nassau County.\nQuestion: The second PGA Championship was held after the end of World War I. True, False, or Neither? True\n###\nThe 2007 Internazionali BNL d'Italia was the 2007 edition of the Rome Masters tennis tournament. The men's tournament was part of the 2007 ATP Masters Series and was held on May 5-13. The women's event was a 2007 WTA Tier I Series event and was held on May 13-20.\nQuestion: The Rome Masters tennis tournament was not held in 2008. True, False, or Neither?", "doc_id": 206, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19115, 11886, 39079, 21245], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Justin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017.\nQuestion: Justin Tinucci is an American actor, He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2015. he is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Sean. True, False, or Neither? False\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award.\nQuestion: Rachel Brosnahan has never moved her lips. True, False, or Neither? False\n###\nKew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. 
The station was named after the nearby Kew Bridge.\nQuestion: Kew Bridge railway station is very modern. True, False, or Neither? Neither\n###\nMaps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\".\nQuestion: The re-release was on vinyl. True, False, or Neither? True\n###\nThe Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\".\nQuestion: The Last of Us was the only game developed by Naughty Dog. True, False, or Neither?", "doc_id": 554, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32370, 15082, 17998, 25721], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming.\nQuestion: It is between 2 bays True, False, or Neither? True\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress.\nQuestion: He held the position of the Tibetan Youth Congress since September 2007, and on August 8, 2008 he was re-elected. True, False, or Neither? True\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style.\nQuestion: Real Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group in the AAA. It consists of young \"tecnicos\" who all use a high flying, high risk wrestling style. True, False, or Neither? True\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi.\nQuestion: NASA John H. Glenn Research Center is directed by Janet L. Kavandi True, False, or Neither? 
True\n###\nCriminal Mindscape is a television documentary series on MSNBC that profiles the minds of extreme criminals. Different interviewers interview subjects such as Ron Luff and Joseph Paul Franklin. Interviewers are often from various fields of criminal justice as opposed to journalism. Interviewers attempt to develop psychological profiles of individual criminals.\nQuestion: Criminal Mindscape's interviewers all come from different fields of criminal justice True, False, or Neither?", "doc_id": 560, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35177, 15917, 31232, 10866], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Murder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017.\nQuestion: The albums they release in 2017 will progressively become more psychedelic True, False, or Neither? Neither\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home.\nQuestion: Hero without a country did not win any awards True, False, or Neither? Neither\n###\nCity Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle.\nQuestion: City Mall is a large mall in Jordan that has been open for over a decade. There are many films shown in the cinema here. Some of the films are American. True, False, or Neither? True\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release.\nQuestion: The members of The Sloppy Meateaters are vegans. True, False, or Neither? Neither\n###\nThe Magic Roundabout in Swindon, England, was constructed in 1972 and consists of five mini-roundabouts arranged around a sixth central, anticlockwise roundabout. Located near the County Ground, home of Swindon Town F.C., its name comes from the popular children's television series \"The Magic Roundabout\". In 2009 it was voted the fourth scariest junction in Britain, in a poll by Britannia Rescue.\nQuestion: The Magic Roundabout was voted the fourth scariest junction in Britain in 2010 in a poll by Britannia Rescue. 
True, False, or Neither?", "doc_id": 196, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21767, 9773, 23392, 19722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy.\nQuestion: The National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. It is owned by people who love children. True, False, or Neither? Neither\n###\nA symphonic song cycle can either refer to a symphony composed of separate movements played consecutively or to a set of symphonic works linked by theme, common composer, or common conductor. A symphonic cycle should not be confused with the closely related song cycle.\nQuestion: Symphonic songs are very short. True, False, or Neither? Neither\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: They have other songs that have peaked higher True, False, or Neither? Neither\n###\nMurray, Utah was declared a city July 3, 1902, instituting a mayor-council form of government. The mayor of Murray was originally partisan, but switched to a non-partisan position. The term of mayor was originally two years, but amended to a four-year term in the 1940s in accordance with state law. The following is a list of Mayors of Murray, Utah.\nQuestion: Murray has been a city for less than 200 years True, False, or Neither? True\n###\n\"679\" is the second single by American rapper Fetty Wap from his self-titled debut album. The song features Remy Boyz member Monty and former Remy Boyz member P-Dice. \"679\" peaked at number 4 on the US \"Billboard\" Hot 100, becoming his second highest-charting single after \"Trap Queen\". The album version of the song omits P-Dice's verse, only featuring Monty.\nQuestion: Trap Queen is the highest-charting single of Fetty Wap. True, False, or Neither?", "doc_id": 89, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9362, 25009, 14265, 25258], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Simon Corbell (born 21 November 1970) is a former Australian politician and Deputy Chief Minister of the Australian Capital Territory. He was also Attorney-General, Minister for Health, Minister for the Environment and Minister for the Capital Metro.\nQuestion: Simon Corbell is currently 50 years old. True, False, or Neither? False\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98.\nQuestion: Most teens liked Death Race. True, False, or Neither? Neither\n###\nTom Clancy's Splinter Cell is a 2002 stealth video game developed by Ubi Soft Montreal and built on the Unreal Engine 2. It is the first \"Splinter Cell\" game in the series. Endorsed by author Tom Clancy, it follows the activities of NSA black ops agent Sam Fisher. The character of Fisher is voiced by actor Michael Ironside.\nQuestion: The NSA has a paramilitary group responsible for secret operations. True, False, or Neither? True\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons.\nQuestion: The Grand Prix des Fronti\u00e8res was a car race True, False, or Neither? True\n###\nRonald Francis Arias (born November 30, 1941) is a former senior writer and correspondent for \"People magazine\" and \"People en Espa\u00f1ol\". He is also a highly regarded author whose novel \"The Road to Tamazunchale\" has been recognized as a milestone in Chicano literature.\nQuestion: Ronald Francis Arias is a famous mexican author True, False, or Neither?", "doc_id": 811, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10063, 33777, 40422, 20679], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Europrop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas.\nQuestion: Europrop International GmbH (EPI) is a joint venture among eight thousand and four hundred European aircraft engine manufacturers. True, False, or Neither? False\n###\nUSS \"Christopher\" (DE-100) was a Cannon class destroyer escort built for the U.S. Navy during World War II. She served in the Atlantic Ocean and provided escort service against submarine and air attack for Navy vessels and convoys. 
She was named for a Navy Cross recipient, Harold Jensen Christopher, who was killed at Pearl Harbor aboard on 7 December 1941.\nQuestion: USS \"Christopher\" was named after a guy who died in 1941 True, False, or Neither? True\n###\nMary Pierce (born 15 January 1975) is a French retired tennis professional who played on the Women's Tennis Association (WTA) tour. Born in Canada, she is a citizen of Canada, and the United States. Pierce played for France in team competitions and in the Olympics.\nQuestion: Mary Pierce has a middle name. True, False, or Neither? False\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph de Biaix was a member of parliament in the German Reichstag. True, False, or Neither? True\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Chris McKendry was born more than 1968 years ago. True, False, or Neither?", "doc_id": 29, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43865, 43222, 11832, 10056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The NME Awards 2017 were held in London, England, on 15 February 2017, at the Brixton Academy and was hosted by English comedian Huw Stephens. Beyonc\u00e9 led the nominations with five, followed by The 1975, Bastille, Christine And The Queens and Skepta with four nominations each.\nQuestion: Huw Stephens attended the Brixton Academy that evening. True, False, or Neither? True\n###\nThe diminished seventh chord is commonly used in the harmony of both Western classical music and also in jazz and popular music of the twentieth and twenty-first centuries. Classical composers of the eighteenth and nineteenth centuries imaginatively exploited the chord's dramatic and expressive potential. (See below).\nQuestion: The diminished seventh chord is is associated with music from the 21st century. True, False, or Neither? True\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley.\nQuestion: Colorz of Rage hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley had blossoming careers independent of the urban drama film. True, False, or Neither? Neither\n###\nToolbox Murders is a 2004 horror film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch. 
It is a remake of the 1978 film of the same name and was produced by the same people behind the original. The film centralizes on the occupants of an apartment who are stalked and murdered by a masked killer.\nQuestion: Toolbox Murders is a 2004 comedy film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch. True, False, or Neither? False\n###\nRevolution Money is a financial services company based in St. Petersburg, Florida. The company's products include a PIN-based credit card, online person-to-person payments service with a linked stored value card, and gift card. Revolution Money is the only credit card that does not charge retailers interchange fees. The company partnered with Yahoo! Sports and Fifth Third Bank.\nQuestion: Revolution Money does charge some fees. True, False, or Neither?", "doc_id": 722, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41895, 4813, 8600, 16290], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "InterTV Grande Minas is a Brazilian television station affiliated with Rede Globo coverage in the Northern part of the Noroeste, Central and the Jequitinhonha and Mucuri of Minas Gerais. Operates on VHF channel 4, in its headquarters city, Montes Claros / MG. Belongs to the Rede InterTV.\nQuestion: Rede Globo owns InterTV Grande Minas True, False, or Neither? Neither\n###\n\"Paradise\" is the only single release from Styx's 1997 live double album \"Return to Paradise\". The song was originally written and recorded by Dennis DeYoung for his musical \"The Hunchback of Notre Dame\". The song was re-recorded by Styx for inclusion as one of three new studio tracks on the live album.\nQuestion: Paradise was released in 1990 True, False, or Neither? False\n###\nWBZW (1520 AM) is a radio station in Altamonte Springs, Florida. Owned by Pennsylvania Media Associates, Inc., the station operates at 1520 kHz with a daytime power of 5 kW & a nighttime power of 350 watts. Its transmitter is located in Apopka, Florida. The station currently programs a Business News/Talk format.\nQuestion: Th radio station has more listeners in the noon so it consumes more power in the day time. True, False, or Neither? Neither\n###\nCarol Ann Crawford (February 22, 1934 \u2013 August 10, 1982), also known as Carol Stolkin and Carol Ross, was an American backgammon and bridge player from Buffalo, New York who spent many years in Detroit, Michigan.. In 1973, she became the second woman to win the world backgammon championships.\nQuestion: Carol Ann Crawford always lost. True, False, or Neither? False\n###\nTwelve Days of OK Go is a compilation album by American rock band OK Go. It was released on December 31, 2012. OK Go started releasing the songs on December 10, with one song released each weekday. The last song, a cover of \"Any Time at All\", was released on Christmas. 
A bonus track, a cover of \"This Will Be Our Year,\" was released on New Year's Eve.\nQuestion: Ok Go is a punk band True, False, or Neither?", "doc_id": 143, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10340, 18889, 41030, 32455], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: The Auspicious Crane took part in some of the most important naval battles of World War 2. True, False, or Neither? True\n###\n54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute.\nQuestion: Fifty-Four Forty or Fight! was a term coined to express Polk's agenda. True, False, or Neither? True\n###\nThe Axe Giant: Original Motion Picture Soundtrack is the soundtrack to the 2013 film, \"Axe Giant: The Wrath of Paul Bunyan\". The album features the film score composed by Midnight Syndicate's Edward Douglas and \"The Ballad of Paul Bunyan\" performed by Hick'ry Hawkins.\nQuestion: The film score for The Axe Giant was composed in 2013 True, False, or Neither? Neither\n###\nIn theoretical physics, particularly in discussions of , Mach's principle (or Mach's conjecture) is the name given by Einstein to an imprecise hypothesis often credited to the physicist and philosopher Ernst Mach. The idea is that local inertial frames are determined by the large scale distribution of matter, as exemplified by this anecdote:\nQuestion: The idea regarding local inertial frames was widely supported by many, the first time Mach discussed his principle in the topic of theoretical physics. True, False, or Neither? Neither\n###\nGeorge Edward Foreman (born January 10, 1949) is an American former professional boxer who competed from 1969 to 1977, and from 1987 to 1997. Nicknamed \"Big George\", he is a two-time world heavyweight champion and an Olympic gold medalist. Outside the sport he is an ordained minister, author, and entrepreneur.\nQuestion: George Edward Foreman has never physically hurt another person True, False, or Neither?", "doc_id": 346, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39001, 2071, 8004, 11197], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: Nashville West's members were all good. True, False, or Neither? Neither\n###\nThe Winter Hill air disaster occurred on 27 February 1958 when the Silver City Airways Bristol 170 Freighter \"G-AICS\", traveling from the Isle of Man to Manchester, England, crashed into Winter Hill (also known as Rivington Moor) several hundred yards away from the Independent Television Authority's Winter Hill transmitting station.\nQuestion: he silver cty bristol crashed due to the engine True, False, or Neither? Neither\n###\nThe Internazionali Tennis Val Gardena S\u00fcdtirol \"(also known as the Sparkassen ATP Challenger on the ATP Challenger Tour)\" is a tennis tournament held in Ortisei, Italy since 2000. The event is part of the ATP Challenger Tour and the ITF Women's Circuit and is played on indoor hard courts. The event was previously a $100,000+H ITF Women's Circuit category from 2008 to 2009.\nQuestion: The Internazionali Tennis Val Gardena S\u00fcdtirol tennis tounament was held in Ortisei, Italy from 2008 to 2009. True, False, or Neither? True\n###\nMy Famous Family is a British television programme on genealogy, co-hosted by Bill Oddie and Guy de la B\u00e9doy\u00e8re. Each episode shows an ordinary member of the public with a famous ancestor: Queen Victoria, Florence Nightingale, George Stephenson, Lawrence of Arabia, or the Duke of Wellington.\nQuestion: Bill Oddie hosts My Famous Family by himself. True, False, or Neither? False\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. The animation is based on the award-winning children's picture book by Peter Collington.\nQuestion: The Angel and the Soldier Boy is Clannad's first movie soundtrack. True, False, or Neither?", "doc_id": 409, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [175, 42912, 29503, 23678], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Natasha Choufani is a Lebanese actress. Born and raised in the UAE, she grew up in a multi-cultural society. Her ability to act in different dialects and languages had helped open many doors to playing diverse characters in theater, film and TV at home and abroad.\nQuestion: Natasha did not grow up in a multi-cultural society. True, False, or Neither? 
False\n###\nThe Louvin Brothers were an American musical duo composed of brothers Ira Lonnie Loudermilk (1924\u20131965) and Charlie Elzer Loudermilk (1927\u20132011), better known as Ira and Charlie Louvin. The brothers are cousins to John D. Loudermilk, a Nashville Songwriters Hall of Fame member.\nQuestion: They weren't the first artists inducted in the hall of fame in their family True, False, or Neither? Neither\n###\nTroy University was a short-lived university established at Troy, New York in 1858 under the auspices of the Methodist Episcopal Church. The school closed in 1861. The building that housed the university remained a prominent Troy landmark until 1969. On the site now is Rensselaer Polytechnic Institute's Folsom Library.\nQuestion: Troy University has been seen by george. True, False, or Neither? Neither\n###\nPata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House.\nQuestion: Neeru Bajwa, Tarun Khanna, and Gurpreet Ghuggi were disparaged for their performances. True, False, or Neither? Neither\n###\nSomething Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date.\nQuestion: Fuel band I created something like human due to it being the hemorrhage of the hits and not popular True, False, or Neither?", "doc_id": 373, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24009, 33588, 26617, 14834], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards.\nQuestion: The Proteus Design Suite is used by the US army True, False, or Neither? Neither\n###\nThe Nutty Professor is a 1963 American science fiction-romantic comedy film produced, directed, co-written (with Bill Richmond) and starring Jerry Lewis. The score was composed by Walter Scharf. The film is a parody of Robert Louis Stevenson's \"Dr. Jekyll and Mr. Hyde\".\nQuestion: The runtime of the film is 52 minutes True, False, or Neither? Neither\n###\nCape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. 
The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming.\nQuestion: Cape Vakop was chartered over 35 years ago True, False, or Neither? True\n###\nUnited Spirits Limited, abbreviated to USL, is an Indian alcoholic beverages company, and the world's second-largest spirits company by volume. It is a subsidiary of Diageo, and headquartered at UB Tower in Bangalore, Karnataka. USL exports its products to over 37 countries.\nQuestion: Diageo does not own any liquor companies. True, False, or Neither? False\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games.\nQuestion: The Seattle SuperSonics don't have other nicknames True, False, or Neither?", "doc_id": 736, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15188, 22824, 12107, 36918], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Doomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent.\nQuestion: Doomsday Device is a popular term used in professional wrestling. True, False, or Neither? Neither\n###\nSwaay is the debut EP by American band DNCE. The EP was released worldwide on October 23, 2015, by Republic Records. The majority of the EP was co-produced and co-written by lead singer and frontman Joe Jonas. The EP debuted at number 39 on the US \"Billboard\" 200.\nQuestion: Swaay has been sung by Trump. True, False, or Neither? Neither\n###\nTony Rena Snell Jr. (born November 10, 1991) is an American professional basketball player for the Milwaukee Bucks of the National Basketball Association (NBA). Snell played college basketball for the New Mexico Lobos before declaring for the NBA draft after his junior year. He was drafted with the 20th overall pick in 2013 NBA draft by the Chicago Bulls.\nQuestion: Tony was inspired by a basketball player early in his life. True, False, or Neither? Neither\n###\nUniversity of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund.\nQuestion: University of Maryland Eastern Shore is a cheap university True, False, or Neither? 
Neither\n###\nBad Family () is a South Korean television series starring Kim Myung-min, Nam Sang-mi, Im Hyun-sik, Yeo Woon-kay, Kang Nam-gil, Geum Bo-ra, Kim Heechul and Lee Young-yoo. It aired on SBS from March 22 to May 11, 2006 on Wednesdays and Thursdays at 21:55 for 16 episodes.\nQuestion: New episodes of Bad Family are still being made. True, False, or Neither?", "doc_id": 770, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17362, 27079, 3669, 2990], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Semonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781.\nQuestion: Semonkong is a refuge for Basotho people True, False, or Neither? True\n###\nYahoo Serious (born 27 July 1953), born Greg Gomez Pead (name-change by deed poll in 1980), is an Australian film actor, director, and score composer. He is best known for his 1988 comedy film \"Young Einstein\". He also created \"Reckless Kelly\" in 1993 and \"Mr. Accident\" in 2000. Serious writes, directs, produces, stars in, and has composed the scores for his movies.\nQuestion: Yahoo Serious is a very fat man True, False, or Neither? Neither\n###\nElizabeth \"Long Liz\" Stride (n\u00e9e Gustafsdotter) (27 November 1843 \u2013 30 September 1888) is believed to be a victim of the notorious unidentified serial killer called Jack the Ripper, who killed and mutilated several women in the Whitechapel area of London from late August to early November 1888.\nQuestion: Elizabeth Stride was born in 1843 True, False, or Neither? True\n###\nThe University of Nebraska\u2013Lincoln, often referred to as Nebraska, UNL or NU, is a public research university in the city of Lincoln, in the state of Nebraska in the Midwestern United States. It is the state's oldest university, and the largest in the University of Nebraska system.\nQuestion: The University of Nebraska-Lincoln is often referred to as UNL True, False, or Neither? True\n###\nConcrete Sky was the second EP released by Beth Orton, with the lead track taken from her 2002 album \"Daybreaker\". It contains four songs, and was released on CD. \"Concrete Sky\" features vocals and guitar from Ryan Adams, and was written by Beth Orton and Johnny Marr.\nQuestion: Ryan Adams plays the flute. True, False, or Neither?", "doc_id": 836, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14195, 33543, 31210, 29587], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Duel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg.\nQuestion: Steven Spielberg is the well known director of many movies and short plays. True, False, or Neither? Neither\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961.\nQuestion: Cuban imports to the United States were stopped in 1961, but have since resumed. True, False, or Neither? Neither\n###\nAlways (; lit. Only You) is a South Korean film directed by Song Il-gon. Starring So Ji-sub and Han Hyo-joo in the lead roles, it is about a romance between an ex-boxer who has closed his heart to the world and a telemarketer who remains spirited despite slowly going blind.\nQuestion: Always is a South Korean movie True, False, or Neither? True\n###\nPerformance Car, commonly abbreviated to PC, was an automobile magazine from the United Kingdom published by EMAP between October 1983 and July 1998. As suggested by the title, the magazine focussed on the high performance sector of the car market, from hot hatches through to supercars.\nQuestion: Performance Car has a X. True, False, or Neither? False\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists.\nQuestion: Contra Conspiracy was originally known as Contra Control. True, False, or Neither?", "doc_id": 557, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28915, 39985, 45236, 10035], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Takahashi River (\u9ad8\u6881\u5ddd , Takahashi-gawa ) is a Class A major river in the western part of Okayama Prefecture. It acts as the main drainage for the Takahashi River Drainage System, and is one of the three main drainage rivers in Okayama Prefecture (the others being the Yoshii River and the Asahi River).\nQuestion: The Takahashi River serves as a major drainage river. True, False, or Neither? True\n###\nWake Up, Ron Burgundy: The Lost Movie (also known as Anchorman: The Adventure Continues) is the 2004 counterpart film to the film \"\", which was also released in the same year. 
Directed by Adam McKay and written by McKay and Will Ferrell, it stars Ferrell, Christina Applegate, David Koechner, Steve Carell, and Paul Rudd.\nQuestion: The film stars 7 people True, False, or Neither? Neither\n###\nPrince Louis Ferdinand Oskar Christian of Prussia (German: \"Louis Ferdinand Oskar Christian Prinz von Preu\u00dfen\"; 25 August 1944 \u2013 11 July 1977), also called Louis Ferdinand II or Louis Ferdinand Jr., nicknamed \"Lulu\", was a member of the House of Hohenzollern and the fifth of seven children of Prince Louis Ferdinand of Prussia and his wife, Grand Duchess Kira of Russia.\nQuestion: Louis Ferdinand II had seven siblings. True, False, or Neither? False\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home.\nQuestion: Coriolanus had a large army. True, False, or Neither? Neither\n###\nLamme Goedzak is a character in Charles De Coster's novel \"The Legend of Thyl Ulenspiegel and Lamme Goedzak\" (1867). He is the best friend of Thyl Ulenspiegel. While Ulenspiegel himself is derived from Dutch-German-Flemish folklore Lamme Goedzak is entirely created by De Coster. Despite this he has become one of the most recognizable Flemish folklore characters since.\nQuestion: Charles De Coster published a novel in 1865 True, False, or Neither?", "doc_id": 812, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30909, 18033, 622, 7791], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Volcano, I'm Still Excited!! was an American indie rock band from Brooklyn, New York (originally from Austin, Texas). The band's name (which has been described as \"ludicrous\") was reportedly inspired by the Tom Hanks film \"Joe Versus the Volcano\", though the band has never revealed the inspiration for the name.\nQuestion: Volcano, I'm Still Excited!! has been said to be a crazy sounding name for a group. True, False, or Neither? True\n###\nPostal codes in Brunei are alphanumeric, consisting of two letters followed by four digits in the format of YZ0000, where Y denotes the district code, Z denotes the mukim code, the first two digits denote the area or village code, and the last two digits denote the nearest post office code (e.g. the postal code for Pantai Mentiri Golf Club is BU2529).\nQuestion: Postal codes in Brunei are alphanumeric but never start with a letter. True, False, or Neither? False\n###\nThe International University of Rabat or IUR is a semi-public university founded in 2010 in Morocco. It delivers double-degrees, in collaboration with foreign universities, in law, engineering, aeronautics, energy engineering, architecture, business management and political sciences.\nQuestion: IUR is a public university. True, False, or Neither? 
False\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals.\nQuestion: Antonio Di Natale was not an important person on the Udinese Calcio . True, False, or Neither? False\n###\nUSS \"Fletcher\" (DD/DDE-445), named for Admiral Frank Friday Fletcher, was the lead \"Fletcher\"-class destroyer , and served in the Pacific during World War II. She received fifteen battle stars for World War II service, and five for Korean War service.\nQuestion: Frank Friday Fletcher was the Admiral of the USS Fletcher True, False, or Neither?", "doc_id": 302, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38564, 15761, 21820, 44573], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The position of South African ambassador to the United States is the most prestigious and top diplomatic post in South Africa. The position was first held in March 1949, following the upgrade of South Africa's diplomatic mission to an embassy. The post has been held by many important politicians and is currently held by M. J. Mahlangu.\nQuestion: M. J. Mahlangu, who currently holds the position of South African ambassador to the United States, has held the position for many years. True, False, or Neither? Neither\n###\nThe Newcomers is a 2000 American family drama film directed by James Allen Bradley and starring Christopher McCoy, Kate Bosworth, Paul Dano and Chris Evans. Christopher McCoy plays Sam Docherty, a boy who moves to Vermont with his family, hoping to make a fresh start away from the city. It was filmed in Vermont, and released by Artist View Entertainment and MTI Home Video.\nQuestion: The Newcomers was filmed in Vermont and caused a boost of tourism to the state. True, False, or Neither? Neither\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc.\nQuestion: the village had 2 million people in 2011 True, False, or Neither? False\n###\nMartin H\u00f6hener (born June 23, 1980) is a Swiss professional ice hockey defenceman. He is currently playing for the SC Bern of Switzerland's National League A. He was selected by the Nashville Predators in the 9th round (284th overall) of the 2000 NHL Entry Draft.\nQuestion: Martin H\u00f6hener currently resides in Switzerland. True, False, or Neither? 
Neither\n###\nJames King of William (January 28, 1822 \u2013 May 20, 1856) was a crusading San Francisco, California, newspaper editor whose assassination by a criminal in 1856 resulted in the establishment of the second San Francisco Vigilance Committee and changed the politics of the city. King was among the first newspapermen to be honored by the California Journalism Hall of Fame.\nQuestion: James King of William was born in San Francisco, California. True, False, or Neither?", "doc_id": 632, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10003, 2635, 34562, 7463], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A surf break at Point Leo, on the Mornington Peninsula, one of the closest surf beaches to Melbourne in Victoria, Australia known as First Reef or more colloquially just \"The Reef\". Until the 1970s there was little or no resident surfing population in Point Leo, so the Reef was mainly surfed by the few transient waveriders who were exploring the many breaks to be found in Westernport Bay.\nQuestion: Before the 1970s there were a large number of people surfing at First Reef, which is a surf break at Point Leo. True, False, or Neither? False\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire.\nQuestion: Peter John Reynolds was knighted in OBE for his research in experimental archaeology True, False, or Neither? Neither\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012.\nQuestion: The Melodi Grand Prix Junior 2012 was won by a 16 year old. True, False, or Neither? False\n###\nMission: Impossible III \u2013 Music from the Original Motion Picture Soundtrack is a soundtrack album for the 2006 film \"\", composed by Michael Giacchino. Unlike the previous two films in the series, there was no album released containing the film's contemporary music.\nQuestion: Mission: Impossible III was the most popular movie of the series True, False, or Neither? Neither\n###\nThe 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. 
The nominees were announced in November, 2012.\nQuestion: The nominees for the 39th People's Choice Awards were announced during the same year that the ceremony was held True, False, or Neither?", "doc_id": 220, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15555, 1678, 24474, 37263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Corrina, Corrina is a 1994 American feature film set in 1959 about a widower (Ray Liotta) who hires a housekeeper/nanny (Whoopi Goldberg) to care for his daughter (Tina Majorino). It was written and directed by Jessie Nelson, in her feature film directing debut. It was the final film in which Don Ameche starred; he died shortly after filming was completed.\nQuestion: Corrina, Corrina is set 35 years before it was released. True, False, or Neither? True\n###\n\"Have You Ever Met That Funny Reefer Man\", often known simply as \"The Reefer Man\", is a 1932 American jazz song composed by J. Russel Robinson, with lyrics by Andy Razaf. It was first recorded by Cab Calloway and his orchestra, with versions by others over the years, including by Harlan Lattimore, Murphy's Law and Big Bad Voodoo Daddy.\nQuestion: The song was released in 1931 True, False, or Neither? False\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant.\nQuestion: The film View from the Top was not the first time Bruno Barreto and Gwyneth Paltrow worked together True, False, or Neither? Neither\n###\n\"They\" is a short story written by American science fiction author Robert A. Heinlein. It was first published in the April 1941 issue of \"Unknown\", and can be found in Heinlein's short story collection \"The Unpleasant Profession of Jonathan Hoag\". It also appears in a number of multi-author anthologies.\nQuestion: Robert A. Heinlein wrote more than one short story. True, False, or Neither? True\n###\nJefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time.\nQuestion: The population count of Jefferson County is over 1,000,000 as of today. True, False, or Neither?", "doc_id": 673, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14075, 35319, 11499, 21428], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Balaji K. Kumar is a Film Director who entered Tamil cinema as a director with the 2013 thriller film \"Vidiyum Munn\" which released on 29 November 2013 and received positive reviews from critics. Then started his career as story board artist for advertising firms like Ogilvy & Mather, JWT, Saatchi & Saatchi.\nQuestion: Balaji K. Kumar is world famous. True, False, or Neither? Neither\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\".\nQuestion: \"Paint It Black\" is a song as a British rock band. True, False, or Neither? True\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide.\nQuestion: Mutual Friends was written in 2008. True, False, or Neither? Neither\n###\nRufus Lackland Taylor (January 6, 1910 \u2013 September 14, 1978) was an officer in the United States Navy. There he became Director of the Office of Naval Intelligence and a Vice Admiral. In 1966 he was appointed as Deputy Director of the Defense Intelligence Agency (DIA), then shortly thereafter as Deputy Director of the CIA, where he served from 1966 to 1969.\nQuestion: Rufus Lackland Taylor (January 6, 1910 \u2013 September 13, 1978) was an officer in the United States Navy. He was appointed as Deputy Director of the Defense Intelligence Agency (DAI). True, False, or Neither? False\n###\nThe Rock \u2018n\u2019 Roll Mardi Gras Marathon is an annual international marathon race which takes place in New Orleans, Louisiana, in the United States. It is part of the Rock 'n' Roll Marathon Series of road running competitions and it also features the Rock \u2018n\u2019 Roll Mardi Gras Half Marathon.\nQuestion: The Rock \u2018n\u2019 Roll Mardi Gras Marathon is a marathon in the u.s with half relay marathon True, False, or Neither?", "doc_id": 838, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19880, 18586, 35994, 24038], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ann Rae Rule (n\u00e9e Stackhouse; October 22, 1931 \u2013 July 26, 2015) was an American true crime author of \"The Stranger Beside Me\", about serial killer, and Rule's co-worker, Ted Bundy. Rule was also known for her book \"Small Sacrifices\", about Oregon child murderer Diane Downs. 
Many of Rule's books center on murder cases that occurred in the Pacific Northwest and her adopted home state of Washington.\nQuestion: Ann Rule was married True, False, or Neither? Neither\n###\nFernande Olivier (born Am\u00e9lie Lang; 6 June 1881\u201326 January 1966) was a French artist and model known primarily for having been the model of painter Pablo Picasso, and for her written accounts of her relationship with him. Picasso painted over 60 portraits of Olivier.\nQuestion: Fernande Olivier spoke to Picasso in French. True, False, or Neither? Neither\n###\nRegent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day.\nQuestion: The CEO of Habib Group is British. True, False, or Neither? Neither\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: The district was names after Clement True, False, or Neither? Neither\n###\nThe Probert-Price Collection is a collection of items from the Probert-Price estate, primarily hundreds of vintage dresses which belonged to Renee Probert-Price, original It girl and well-known London socialite of her time (1917-2013). Renee died in 2013 aged 96, and left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great niece and goddaughter.\nQuestion: Renee Probert-Price left at least 50 hats to her great niece and goddaughter. True, False, or Neither?", "doc_id": 447, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10402, 28762, 36843, 18582], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Donald Clark \"Donny\" Osmond (born December 9, 1957) is an American singer, actor, radio personality, and former teen idol. Osmond has also been a talk and game show host, record producer and author. In the mid-1960s, he and four of his elder brothers gained fame as the Osmonds. Osmond went solo in the early 1970s, covering such hits as \"Go Away Little Girl\" and \"Puppy Love\".\nQuestion: Donny Osmond is an only child. True, False, or Neither? False\n###\nLove Island is a 1952 American film directed by Bud Pollard starring Paul Valentine and Eva Gabor. Originally released in Cinecolor, the film uses extensive footage taken in Bali used from the film \"\" (1935). It was the final directorial effort of Bud Pollard who had previously directed several race films and exploitation films.\nQuestion: Love Island was released in nineteen hundred fifty three. True, False, or Neither? False\n###\nShannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. 
After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University.\nQuestion: Shannon Kelley set records as a quarterback. True, False, or Neither? Neither\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s.\nQuestion: The New Ulm Oil Company Service Station had gasoline True, False, or Neither? True\n###\nPrincess Amalie \"Auguste\" of Anhalt-Dessau (German: \"Prinzessin Amalie Auguste von Anhalt-Dessau\" ; 18 August 1793 \u2013 12 June 1854) was a German princess of Anhalt-Dessau who was Princess consort of Schwarzburg-Rudolstadt from 1816 to 1854 as the wife of Friedrich G\u00fcnther, Prince of Schwarzburg-Rudolstadt.\nQuestion: Princess Amalie \"Auguste\" of Anhalt-Dessau was Princess consort in the early 1860's. True, False, or Neither?", "doc_id": 867, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23408, 8053, 25441, 3226], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: Imagine dragons wrote their first song in 2002 True, False, or Neither? Neither\n###\nJohnson College Prep is a public four-year charter high school located in the Englewood neighborhood on the south side of Chicago, Illinois, United States. It is a part of the Noble Network of Charter Schools. The school is named for African-American businessman and founder of the Chicago-based Johnson Publishing Company John H. Johnson and his wife Eunice Johnson.\nQuestion: Johnson College Prep is named after John H. Johnson and Eunice Johnson. True, False, or Neither? True\n###\nAktar-Ul Islam (Bengali: \u0986\u09b0\u09cb \u0989\u09b2 \u0987\u09b8\u09b2\u09be\u09ae ; born 1980) is an award-winning English chef, restaurateur and entrepreneur. In 2009, his restaurant Lasan became the first Indian restaurant in the United Kingdom to be selected as the \"Best Local Restaurant\" by Gordon Ramsay on Channel 4's \"The F Word\". In June 2011, he won the fish course in the final of the BBC Two series \"Great British Menu\".\nQuestion: Lasan learned to cook in India. True, False, or Neither? 
Neither\n###\nElizabeth \"Long Liz\" Stride (n\u00e9e Gustafsdotter) (27 November 1843 \u2013 30 September 1888) is believed to be a victim of the notorious unidentified serial killer called Jack the Ripper, who killed and mutilated several women in the Whitechapel area of London from late August to early November 1888.\nQuestion: Elizabeth \"Long Liz\" Stride was born more than 150 years ago. True, False, or Neither? True\n###\nVarun Sharma is an Indian actor who made his debut in Farhan Akhtar's 2013 film production \"Fukrey\", which was a surprise hit in Bollywood. Since his appearance in \"Fukrey\", he has appeared in other comedy films, such as \"Kis Kisko Pyaar Karoon\" and \"Dilwale\" etc\nQuestion: Varun Sharma is not currently acting True, False, or Neither?", "doc_id": 647, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28326, 31283, 26267, 32276], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Guns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin).\nQuestion: Guns of Diablo was a popular Western. True, False, or Neither? Neither\n###\nKdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity.\nQuestion: In 2009, mobile software was a booming business. True, False, or Neither? Neither\n###\nSigmoid colon volvulus, also known as sigmoid volvulus, is a common cause of bowel obstruction and constipation. It is common in Asia, India (7% of intestinal obstruction) and especially South India because of the high fibre diet. It is very common cause of large bowel obstruction in Peru and Bolivia due to high altitude.\nQuestion: Chewing food more thoroughly could help Bolivians avoid bowel problems. True, False, or Neither? Neither\n###\nThe Tiki Bar is Open was singer-songwriter John Hiatt's sixteenth album, released in 2001. It was his last album with Vanguard Records. Although they are uncredited, the album features backing band The Goners, the same cadre of friends who backed Hiatt in his 1988 release Slow Turning. It was coincidentally released on September 11, 2001.\nQuestion: The Goners didn't reform after September 11, 2001. True, False, or Neither? Neither\n###\nLucjan Karasiewicz (born July 10, 1979 in Tarnowskie G\u00f3ry) is a Polish politician. He was elected to Sejm on September 25, 2005 getting 6844 votes in 28 Cz\u0119stochowa, standing for Law and Justice. 
He joined Poland Comes First when that party split from Law and Justice in 2010.\nQuestion: Lucjan Karasiewicz is currently living True, False, or Neither?", "doc_id": 751, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19669, 43806, 39291, 19492], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Greatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: Greatest Hits Volume 1 was not released in 1969 True, False, or Neither? True\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse.\nQuestion: Once Upon a Time premiered less than 100 years ago True, False, or Neither? True\n###\nThe Cameroon Airlines Corporation, trading as Camair-Co, is an airline from Cameroon, serving as flag carrier of the country, a role which was previously filled by the now-defunct Cameroon Airlines. Camair-Co has its headquarters in the Immeuble La Rotonde in Douala, and operates out of Douala International Airport.\nQuestion: Cameroon has an airline. True, False, or Neither? True\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race.\nQuestion: Tom\u00e1s Nistal Fern\u00e1ndez is a former road cyclist. True, False, or Neither? True\n###\nLincoln is a town in Providence County, Rhode Island, United States. The population was 21,105 at the 2010 census. Lincoln is located in northeastern Rhode Island, north of Providence. Lincoln is part of the Providence metropoliton statistical area and the Greater Boston combined statistical area.\nQuestion: The population of Lincoln is over 21,105, True, False, or Neither?", "doc_id": 14, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34492, 8843, 26909, 2918], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lost Souls is a 1992 horror novel by American writer Poppy Z. Brite, his first one. 
It is the only novel-length adventure of Brite's 'Steve and Ghost' characters, popularized in numerous short stories. The novel is an extended version of the short story \"The Seed of Lost Souls\".\nQuestion: \"The Seed of Lost Souls\" is the follow up novel to Lost Souls True, False, or Neither? False\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Jenner is south of Stillwater Cove Regional Park. True, False, or Neither? True\n###\nMiss Peregrine's Home for Peculiar Children is a contemporary fantasy debut novel by American author Ransom Riggs. The story is told through a combination of narrative and vernacular photographs from the personal archives of collectors listed by the author.\nQuestion: Miss Peregrine's Home for Peculiar Children is a fantasy story. True, False, or Neither? True\n###\nPaolo Romano, also known as Paolo Tuccone and as Paolo di Mariano di Tuccio Taccone was an Italian early Renaissance sculptor and goldsmith. Giorgio Vasari in his \"Lives of the Most Excellent Painters, Sculptors, and Architects\" recounts that Paolo Romano was a modest man whose sculpture was far superior to that of his boastful contemporary Mino del Reame.\nQuestion: Paolo Romano was an Italian early Renaissance sculptor and goldsmith and painter. True, False, or Neither? Neither\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird.\nQuestion: The North African ostrich can also be found in West Africa True, False, or Neither?", "doc_id": 911, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17672, 24478, 9067, 9989], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas.\nQuestion: A bombing had occurred in the area. True, False, or Neither? True\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: Imagine Dragons are currently working on a new album. True, False, or Neither? 
Neither\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central.\nQuestion: The Daily Show is hosted by Trevor Noah. True, False, or Neither? Neither\n###\nKazuhiro Wada (\u548c\u7530 \u4e00\u6d69 , \"Wada Kazuhiro\" , born June 19, 1972 in Gifu, Gifu, Japan) is a retired Japanese professional baseball player. He played mostly as an outfielder for the Chunichi Dragons and the Seibu Lions of the Nippon Professional Baseball league in a career spanning 18 years. Following retirement in 2015, he has become a color commentator for Dragons broadcasts for the NHK.\nQuestion: Kazuhiro Wada played in the Nippon Professional Basball league True, False, or Neither? True\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song.\nQuestion: Elliot easton sang backup vocals True, False, or Neither?", "doc_id": 818, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1048, 9726, 14655, 3091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Giovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg.\nQuestion: Giovanni Ferrero currently lives in Italy. True, False, or Neither? Neither\n###\nThe Great Dictator is a 1940 American political satire comedy-drama film written, directed, produced, scored by and starring British comedian Charlie Chaplin, following the tradition of many of his other films. Having been the only Hollywood filmmaker to continue to make silent films well into the period of sound films, this was Chaplin's first true sound film.\nQuestion: Chaplin himself has dialogue in The Great Dictator. True, False, or Neither? Neither\n###\n\"Superman's Dead\" is a song by Canadian alternative rock group Our Lady Peace. It was released in December 1996 as the lead single from their second album \"Clumsy\". This has become one of Our Lady Peace's most popular songs in both Canada and the U.S., as well as many other parts of the world.\nQuestion: Superman's Dead was the third single. True, False, or Neither? False\n###\nDuke is a fictional character from the \"\" toyline, comic books, and cartoon series. He is the G.I. Joe Team's First Sergeant, and debuted in 1983. The character is also featured in both the \"\" animated series and comic books. Channing Tatum portrays Duke in the 2009 live-action film, \"\", and the 2013 sequel \"\".\nQuestion: Channing Tatum plays a fictional character in the 2009 G.I Joe live-action film. True, False, or Neither? 
True\n###\nLichfield Cathedral is situated in Lichfield, Staffordshire, England. It is the only medieval English cathedral with three spires. The Diocese of Lichfield covers all of Staffordshire, much of Shropshire and part of the Black Country and West Midlands. The 99th and current Bishop of Lichfield is Michael Ipgrave who was appointed on 10 June 2016.\nQuestion: Michael Ipgrave was appointed in the sixth month of 2016 True, False, or Neither?", "doc_id": 725, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17976, 22421, 42701, 36938], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Swift Rivers is a children's historical novel by Cornelia Meigs. Set initially in 1835 in Minnesota, it is a story of the early days of the logging industry, when logs were floated down the Mississippi to St. Louis. The novel, illustrated by Forrest W. Orr, was first published in 1931 and was a Newbery Honor recipient in 1933.\nQuestion: Swift Rivers didn't exist before 1930 True, False, or Neither? True\n###\nHundreds of ancient stone religious monuments lie on the island of Java. Known as \"candi\" in Indonesian, they date from the early classical period of Javanese civilisation, beginning in the first part of the 8th century CE and ending after 900 CE. The majority were built between 780 CE and 860 CE, even though the civilisation that created them existed for many centuries.\nQuestion: Hundreds of ancient stone religious monuments lie on the island of Java. Too many were built. True, False, or Neither? Neither\n###\nThe Chingford branch line is a railway line between Clapton Junction (just west of Clapton station) and Chingford station. Services currently operate between Liverpool Street station and Chingford. The branch is currently part of the Lea Valley Lines network.\nQuestion: The Chingford branch line is for women only True, False, or Neither? Neither\n###\nMelinda Heather \"Mindy\" Cohn (born May 20, 1966) is an American actress, voice actress, comedian and singer. She is known for her role as Natalie Green, the student of Edna Garrett (played by Charlotte Rae) in the long-running sitcom \"The Facts of Life\", and for being the voice of Velma Dinkley in the \"Scooby-Doo\" franchise from 2002 to 2015.\nQuestion: Melinda Heather \"Mindy\" Cohn is a silly actress True, False, or Neither? Neither\n###\nLaudelino Jos\u00e9 \"Lino\" de Barros (born June 29, 1975 in Bonito, Mato Grosso do Sul) is a Brazilian boxer, who represented his native country in the light heavyweight division at the 2000 Summer Olympics. There he was eliminated in the first round by Australia's Danny Green. 
A year earlier, at the 1999 Pan American Games, Barros won the silver medal in his weight division.\nQuestion: Laudelino Jos\u00e9 \"Lino\" de Barros was born in the 7th decade of the 20th century True, False, or Neither?", "doc_id": 19, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6422, 27295, 43796, 44957], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Prom Night IV: Deliver Us from Evil is a 1992 Canadian slasher horror film directed by Clay Borris and starring Nicole de Boer and J.H. Wyman. The film follows a deranged Catholic priest who begins murdering teenagers on their prom night. It is the fourth and final film in the \"Prom Night\" franchise. Like the previous , it was released briefly in theaters before later being released to video.\nQuestion: 1992 was the last year in which a movie from the \"Prom Night\" film franchise was released True, False, or Neither? True\n###\nDana Berliner is Litigation Director at the Institute for Justice, a public interest law firm in Arlington, Virginia founded in 1991 by Chip Mellor and Clint Bolick. She was co-lead counsel for Susette Kelo in the landmark United States Supreme Court case \"Kelo v. City of New London\".\nQuestion: Dana Berliner was co-lead counsel only one time. True, False, or Neither? Neither\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey.\nQuestion: Ballymena United Football Club only plays semi-professionally True, False, or Neither? True\n###\nCynthia Mort (born June 18, 1956) is an American director, screenwriter, and producer. Mort has worked primarily in television since beginning her career in 1994, writing for the sitcom \"Roseanne\". Her notable works include the HBO series \"Tell Me You Love Me\" as a creator and executive producer, the revenge film \"The Brave One\" (2007) as a screenwriter, and the biopic \"Nina\" (2016) as a director.\nQuestion: Mort worked on on films just as much as she did television shows. True, False, or Neither? False\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists.\nQuestion: There were 2 film crews in this movie. One doing the filming and one doing the acting. True, False, or Neither?", "doc_id": 71, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21026, 6725, 10784, 17611], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Samuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005.\nQuestion: Samuel Eto'o Fils won the African Player of the Year award a record three times. True, False, or Neither? False\n###\nThe Real Howard Spitz is a 1998 family comedy film directed by Vadim Jean, produced by Paul Brooks and written by Jurgen Wolff. Starring Kelsey Grammer, Amanda Donohoe and Genevieve Tessier, it is a Canadian and U.K co-production. A failed detective writer, Howard Spitz has hit rock bottom until an 8-year-old girl helps him write children's books.\nQuestion: The Real Howard Spitz is a horror film. True, False, or Neither? False\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area.\nQuestion: El Paso County is home to the Pikes Peak Center. True, False, or Neither? True\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th.\nQuestion: Vasili Vyacheslavovich Blagov currently lives in London. True, False, or Neither? Neither\n###\nAleksandr Danilovich Aleksandrov (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0414\u0430\u043d\u0438\u0301\u043b\u043e\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 , alternative transliterations: \"Alexandr\" or \"Alexander\" (first name), and \"Alexandrov\" (last name)) (August 4, 1912 \u2013 July 27, 1999), was a Soviet/Russian mathematician, physicist, philosopher and mountaineer.\nQuestion: Aleksandr Danilovich Aleksandrov had more than 4 jobs. True, False, or Neither?", "doc_id": 176, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31428, 30175, 41163, 25478], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. 
It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: The mall has a store that starts with a B, and a J True, False, or Neither? True\n###\nThe Bigger Picture is a 2014 British animated short film directed by Daisy Jacobs. It has been nominated for the Academy Award for Best Animated Short Film at the 87th Academy Awards. It won the BAFTA Award for Best Short Animation at the 68th British Academy Film Awards.\nQuestion: The Bigger Picture has the voice of Carter. True, False, or Neither? Neither\n###\nUS Organization, or Organization Us, is a Black nationalist group in the United States founded in 1965. It was established as a community organization by Maulana Karenga. It was a complementary organization of the Black Panther Party in California. One of the early slogans was, \"Wherever US is, We are.\" US stands for us Black people vs 'them' the oppressors.\nQuestion: Organization Us is founded in California True, False, or Neither? Neither\n###\nGreivis Josu\u00e9 V\u00e1squez Rodr\u00edguez (born January 16, 1987) is a Venezuelan professional basketball player who last played for the Brooklyn Nets of the National Basketball Association (NBA). He was drafted in 2010 after a U.S. college career with the University of Maryland men's basketball team. V\u00e1squez finished second on the Terrapins' all-time scoring list, with 2,171 career points.\nQuestion: Greivis Josu\u00e9 V\u00e1squez Rodr\u00edguez was born over 10 years ago True, False, or Neither? True\n###\nThe University of Florida Board of Trustees is the governing body of the University of Florida, the Flagship University for the State University System of Florida. The University is located in Gainesville, Florida, United States. As of September 1, 2011, the Board includes thirteen members. The current Chair of the Board is Carlos Alfonso.\nQuestion: The University of Florida Board of Trustees currently has 13 members. True, False, or Neither?", "doc_id": 459, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24431, 22466, 3725, 28705], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Boon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle.\nQuestion: Boon Brewery produces Faro Beer True, False, or Neither? True\n###\nFS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. 
Kozani FC presently plays at the Kozani Stadium in Kozani.\nQuestion: FS Kozani is a highly paid football team in Greece True, False, or Neither? Neither\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan.\nQuestion: Game Plan, Inc. is no longer in business. True, False, or Neither? Neither\n###\nEnglandsfarere (English: We Leave for England ) is a 1946 Norwegian war film directed by Toralf Sand\u00f8, starring Knut Wigert and J\u00f8rn Ording. The film follows the Norwegian resistance fighters Harald (Wigert) and Arild (Ording) in their flight from the Gestapo.\nQuestion: Englandsfarere is a film from the 20th century. True, False, or Neither? True\n###\nThe Malloreon is a five-part fantasy book series written by David Eddings, which follows \"The Belgariad\". The Malloreon is set in the same world as The Belgariad, but expands on several aspects of the setting, especially the eastern continent of Mallorea.\nQuestion: The Belgariad, although published before the Malloreon series, was in fact written after it. True, False, or Neither?", "doc_id": 516, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1081, 20348, 14206, 13907], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\".\nQuestion: The Concrete Blonde was written by Michael Connelly. True, False, or Neither? True\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: Rudbeckia hirta is a very smelly plant True, False, or Neither? Neither\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region.\nQuestion: ABC has a number of local stations. True, False, or Neither? Neither\n###\nThe Green Goblin's Last Stand is a 1992 fan film by Dan Poole, based on the comic book story \"The Night Gwen Stacy Died\", published by Marvel Comics in \"The Amazing Spider-Man\" #121\u2013122. Poole is the director, producer, creative editor, screenwriter, and star of the film. 
The film and its attendant documentary received showings and accolades at several small film festivals.\nQuestion: Dan Poole hates comic books. True, False, or Neither? False\n###\nSt. Mark's Coptic Orthodox Cathedral is a Coptic church located in the Abbassia District in Cairo, Egypt. The cathedral is the Seat of the Coptic Orthodox Pope. It was built during the time when Pope Cyril VI of Alexandria was Pope of the Coptic Orthodox Church, and was inaugurated by him in 1969.\nQuestion: St. Mark's Coptic Orthodox Cathedral was created by Pope Cyril. True, False, or Neither?", "doc_id": 586, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3333, 37417, 23823, 27068], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Don Sinclair Davis, PhD (August 4, 1942 \u2013 June 29, 2008) was an American character actor best-known for playing General Hammond in the television series \"Stargate SG-1\" (1997\u20132007), and earlier for playing Major Garland Briggs on the television series \"Twin Peaks\" (1990\u20131991). He was also a theater professor, painter, and United States Army captain.\nQuestion: Don Sinclair Davis was on Twin Peaks before he turned 50. True, False, or Neither? True\n###\nThe Legendary Majik Mijits is an album that was recorded by Steve Marriott and Ronnie Lane when they reformed under the name of \"Majik Mijits\" in 1981 and gave a one-off concert at the Bridgehouse pub in East London. The lineup included Jim Leverton, Mick Green, Mick Weaver, Dave Hynes and Sam Brown.\nQuestion: The band had three members. True, False, or Neither? False\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s.\nQuestion: The private, commercial structure was placed on the National Register of Historic Places more than 1970 days ago. True, False, or Neither? True\n###\nBarry Redden (born July 21, 1960) is a former American football running back who played for the Los Angeles Rams, the San Diego Chargers, and the Cleveland Browns of the National Football League (NFL). He spent much of his career playing in the shadow of Pro Football Hall of Fame running back Eric Dickerson.\nQuestion: Barry Redden is a very clever man True, False, or Neither? 
Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: the Six-Day war took place near Christmas True, False, or Neither?", "doc_id": 344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21322, 29566, 7680, 34539], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League.\nQuestion: Marques Ackerman donated to charity in 2017. True, False, or Neither? Neither\n###\n\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals.\nQuestion: We're an American Band has no As. True, False, or Neither? False\n###\nThe European Democrat Union (EDU) is one of the three European wings of the International Democrat Union, along with the European People's Party (EPP) and the Alliance of European Conservatives and Reformists (AECR). Its members include Christian democratic, liberal conservative, and conservative political parties. It is only a nominal sub-entity of the IDU, since it ceased its activities in 2002.\nQuestion: The EDU, the EPP, and the AECR all have acronyms that contain the letter E. True, False, or Neither? True\n###\nLiberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD).\nQuestion: The UCD was the acronym for the Democratic Center Union. True, False, or Neither? True\n###\nBabar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. 
Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\".\nQuestion: The film released in a country that starts with a C True, False, or Neither?", "doc_id": 144, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30238, 26546, 32292, 14399], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Something Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date.\nQuestion: I created Something Like Human that made the Hemorrhage of the songs because they were popular on the record label True, False, or Neither? Neither\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Smithereens was released om Tilbrook's own Quixotic label. True, False, or Neither? True\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98.\nQuestion: Death Race is pie True, False, or Neither? False\n###\nBen Barzman (October 12, 1910 \u2013 December 15, 1989) was a Canadian journalist, screenwriter, and novelist, blacklisted during the McCarthy Era and best known for his screenplays for the films \"Back to Bataan\" (1945), \"El Cid\" (1961), and \"The Blue Max\" (1966).\nQuestion: Ben Barzman was born less than 10000 days ago. True, False, or Neither? False\n###\nLouis S. Peterson (June 17, 1922 \u2013 April 27, 1998) was a playwright, actor, screenwriter, and professor. He was an American playwright and the first African-American playwright to have a dramatic play produced on Broadway. He was also one of the first African-American writers to be nominated for an Emmy Award.\nQuestion: Louis S. Peterson was an adult when he wrote his first play. True, False, or Neither?", "doc_id": 3, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21423, 25794, 44823, 2626], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern.\nQuestion: McKinnon has only played ball for Georgia Southern and the Minnesota Vikings. True, False, or Neither? Neither\n###\nFrank Vincent Ferrante (born April 26, 1963) is an American stage actor, comedian and director known for his stage portrayals of legendary American comedian Groucho Marx in the Arthur Marx/Robert Fisher play \"\" and in \"An Evening With Groucho\", which tours internationally.\nQuestion: Sometimes Frank feels haunted by his past. True, False, or Neither? Neither\n###\nThe Appalachian IMG Sports Network was founded in 2007 as Appalachian ISP Sports Network. It is a group of 17 radio stations that carry Appalachian State University sports. The flagship station is WKBC-FM 97.3 in North Wilkesboro, North Carolina. When ISP Sports was bought by IMG Worldwide subsidiary, IMG College, in 2010, the network switched to its current name.\nQuestion: The Appalachian ISP Sports Network had its name changed 5 years after it was founded. True, False, or Neither? False\n###\nSamson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation.\nQuestion: The first performance was on 4 October 1877 True, False, or Neither? False\n###\nLittle Casterton is a small village and civil parish in Rutland, England. The population of the civil parish at the 2001 census was 148, increasing to 218 at the 2011 census. It is about two miles (3 km) north of Stamford on a minor road that runs to the south of the River Gwash between Great Casterton and Ryhall.\nQuestion: Little Casterton is near Rutland, England True, False, or Neither?", "doc_id": 831, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39409, 31651, 20690, 42227], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "102 Squadron \"\"Panchos\"\" (\"Esquadra 102\") was an elementary flight training squadron of the Portuguese Air Force disbanded in 1992. Formed in 1962, the squadron administered air force training and performed at air shows throughout Portugal. Between 1963 and its disbandment in 1992, the squadron lost nine pilots.\nQuestion: Panchos was a flight training squadron in the Portuguese Air Force. True, False, or Neither? 
True\n###\nJohn Gilbert (born John Cecil Pringle; July 10, 1899 \u2013 January 9, 1936) was an American actor, screenwriter and director. He rose to fame during the silent film era and became a popular leading man known as \"The Great Lover\". At the height of his career, Gilbert rivaled Rudolph Valentino, another silent film era leading man, as a box office draw.\nQuestion: He acted in mostly scifi movies True, False, or Neither? Neither\n###\n\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel.\nQuestion: This song charted on the US billboard at 30 True, False, or Neither? False\n###\nRepublic New York Corporation was the holding company for the Republic National Bank of New York and the Safra Republic Bank. The company was controlled by billionaire Edmond Safra, who was killed in a fire in his Monte Carlo penthouse apartment by his nurse Ted Maher. Republic New York Corporation was sold shortly after its chairman's death to HSBC Bank USA, the US subsidiary of HSBC of the UK.\nQuestion: Republic New Yorke was sold to HSBC True, False, or Neither? True\n###\nThomas Tull (born 1970) is an American businessman and film producer. He is the former chairman of the Board and chief executive officer (CEO) of Legendary Entertainment. His firm has produced and/or financed several major motion pictures, including \"The Dark Knight Trilogy\", \"The Hangover\" and its sequels, \"300\", \"Man of Steel\" and others.\nQuestion: Thomas Tull has directed 78 films. True, False, or Neither?", "doc_id": 821, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41673, 16007, 26584, 4815], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brennan Hesser (born 1980) is an American television actress, best known for co-starring in Tori Spelling's VH1 sitcom, \"So NoTORIous\". She also starred in Fox's drama, \"Jonny Zero\". She also guest starred in an episode of the CBS television show, \"The Guardian\". As a youngster, she attended the prestigious Interlochen Arts Camp in Northern Michigan.\nQuestion: She has starred in shows on Fox, CBS, VH1 and ABC. True, False, or Neither? False\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey.\nQuestion: Ballymena United Football Club pays its players more than the average salary for football players True, False, or Neither? Neither\n###\nGirilal Jain (1924 \u2013 19 July 1993), was an Indian journalist. He served as the editor of The Times of India from 1978 till 1988. 
He was sympathetic to Hindu nationalism and authored books on the subject, the best known of which, \"The Hindu Phenomenon\", was published posthumously. The Government of India awarded him the civilian honour of the Padma Bhushan in 1989.\nQuestion: Girilal Jain is Indian. True, False, or Neither? True\n###\n\"Paradise\" is the only single release from Styx's 1997 live double album \"Return to Paradise\". The song was originally written and recorded by Dennis DeYoung for his musical \"The Hunchback of Notre Dame\". The song was re-recorded by Styx for inclusion as one of three new studio tracks on the live album.\nQuestion: Paradise was released in 1991 True, False, or Neither? False\n###\nFrank Vincent Ferrante (born April 26, 1963) is an American stage actor, comedian and director known for his stage portrayals of legendary American comedian Groucho Marx in the Arthur Marx/Robert Fisher play \"\" and in \"An Evening With Groucho\", which tours internationally.\nQuestion: Frank Vincent Ferrante died young True, False, or Neither?", "doc_id": 407, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29947, 42129, 20248, 19573], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Murder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017.\nQuestion: Murder of the Universe has been covered by slayer True, False, or Neither? Neither\n###\nThe March of Ancona (Italian: \"Marca Anconitana\" ) (also Anconetana) was a frontier march centred on the city of Ancona and, then, Macerata in the Middle Ages. Its name is preserved as an Italian region today, the Marches, and it corresponds to almost the entire modern region and not just the Province of Ancona.\nQuestion: Italy has many cities with names as ancient as the first world war. True, False, or Neither? Neither\n###\nMarques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League.\nQuestion: Marques Ackerman is a world class footballer. True, False, or Neither? False\n###\nThe National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy.\nQuestion: The National Rehabilitation Hospital is a very bad hospital True, False, or Neither? Neither\n###\nThe Sandlot is a 1993 American coming-of-age baseball film co-written and directed by David M. Evans, which tells the story of a group of young baseball players during the summer of 1962. 
It stars Tom Guiry, Mike Vitar, Karen Allen, Denis Leary and James Earl Jones. The filming locations were in Glendale, Midvale, Salt Lake City, and Ogden, Utah.\nQuestion: The Sandlot takes place in Odgen, Utah. True, False, or Neither?", "doc_id": 922, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40402, 174, 12404, 27904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cherry, Harry & Raquel! is a 1970 softcore exploitation film produced and directed by American film director Russ Meyer. Following the success of \"Vixen!\" (1968), the film is notable for the first appearance of actor (and Meyer regular) Charles Napier playing Harry Thompson, a California border sheriff and marijuana smuggler who makes a reappearance in 1975's \"Supervixens\".\nQuestion: The film \"Vixen!\" was the first film that Charles Napir appeared in. True, False, or Neither? False\n###\nHis Excellency: George Washington is a 2004 biography of the first President of the United States, General George Washington. It was written by Joseph Ellis, a professor of History at Mount Holyoke College, who specializes in the founding fathers and the revolutionary and federalist periods.\nQuestion: Mount Holyoke college has a well respected history department. True, False, or Neither? Neither\n###\nSophie Lang Goes West is a 1937 American crime film directed by Charles Reisner and written by Frederick Irving Anderson, Doris Anderson, Brian Marlow and Robert Wyler. The film stars Gertrude Michael, Lee Bowman, Sandra Storme, Buster Crabbe, Barlowe Borland, C. Henry Gordon and Jed Prouty. The film was released on September 10, 1937, by Paramount Pictures.\nQuestion: Charles Reisner directed the movie Sophie Lang Goes West. True, False, or Neither? True\n###\nBela George Lugosi (born January 5, 1938 in Los Angeles, California), also known as Bela Lugosi Jr., is an American attorney and the son of actor B\u00e9la Lugosi. His legal actions in \"Lugosi v. Universal Pictures\" led to the creation of the California Celebrities Rights Act.\nQuestion: The child of B\u00e9la Lugosi participated in the legal actions that led to California Celebrities Rights Act. True, False, or Neither? True\n###\nLee Scott Wolosky (born July 17, 1968) is the former U.S. Special Envoy for Guantanamo Closure. He served under the last three U.S. Presidents in significant national security positions, and was on leave as a Partner at Boies, Schiller & Flexner LLP. On July 14 2016, President Obama accorded Wolosky the personal rank of Ambassador.\nQuestion: Lee Scott Wolosky was born the year Martin Luther King Jr. was assassinated. True, False, or Neither?", "doc_id": 233, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34292, 40212, 26541, 6722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Drivin' Around Song\" is a song recorded by American country rap singer Colt Ford and country music singer Jason Aldean. It is the third single from his fourth studio album, \"Declaration of Independence\". The song was written by Chris Tompkins and Craig Wiseman.\nQuestion: He did at least 4 studio albums True, False, or Neither? True\n###\nCleethorpes Town Football Club is a football club based in Grimsby in North East Lincolnshire, England. The club are currently members of the Northern Premier League Division One South and play at Grimsby Borough's Bradley Football Development Centre.\nQuestion: The Cleethorpes play in England which is near Scotland. True, False, or Neither? Neither\n###\nHard Landing is a 2004 thriller novel by British author Stephen Leather. Published in 2004 by Hodder & Stoughton, it is the first book in the Dan \u2018Spider\u2019 Shepherd series. \"Hard Landing\" is an international bestseller and is available in ebook and paperback.\nQuestion: Hard Landing is the first book in the Dan 'Spider' Shepherd series that was released before the second decade of the twenty-first century. True, False, or Neither? True\n###\nThe Hanover Hound is a breed of dog sometimes referred to as a Hanoverian Hound. It is a hunting and tracking dog descended from bloodhounds of medieval times. It was first introduced into France in the 1980s and is still a very rare breed. It was cross-bred with the Bavarian Hound which gave rise to the Bavarian Mountain Hound.\nQuestion: The Bavarian Mountain Hound will decrease in popularity in 2019 because the Hanover Mountain Hound and Bavarian Hound are purer breeds. True, False, or Neither? Neither\n###\nMads Wiel Nygaard's Endowment is an annually awarded literary prize from the publishing house Aschehoug. The prize is a recognition of superior literary work. The publisher's editorial management makes the award based on their collective judgement of merit. Applications are not accepted.\nQuestion: There is a chance that applications are accepted. True, False, or Neither?", "doc_id": 652, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44102, 23383, 14277, 10767], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places.\nQuestion: The William Martin Armistead House wasn't considered Historic until 2009. True, False, or Neither? Neither\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. 
It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000.\nQuestion: The 2013 Sun Life Financial Players' Championship had a purse of CAD$120,000 for both the men's and women's tournaments.\n True, False, or Neither? Neither\n###\nPeter Billingsley (born April 16, 1971), also known as Peter Michaelsen and Peter Billingsley-Michaelsen, is an American actor, director, and producer, known for his role as Ralphie in the 1983 movie \"A Christmas Story\" and as \"Messy Marvin\" in the Hershey's Chocolate Syrup commercials during the 1970s. He began his career as an infant in television commercials.\nQuestion: Peter Billingsley never acted in a film True, False, or Neither? False\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg.\nQuestion: Steven Spielberg's first film to direct was Duel. True, False, or Neither? True\n###\nThe Wire is an American crime drama television series set and produced in Baltimore, Maryland. Created and primarily written by author and former police reporter David Simon, the series was broadcast by the cable network HBO in the United States. \"The Wire\" premiered on June 2, 2002, and ended on March 9, 2008, comprising 60 episodes over five seasons.\nQuestion: There was a long lapse of time during that range when episodes were not released. True, False, or Neither?", "doc_id": 453, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29080, 6263, 41870, 23943], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Once Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse.\nQuestion: Once Upon a Time is a drama set in New England. True, False, or Neither? True\n###\nEugene Gearty is an American sound engineer. He was nominated for an Academy Award in the category Best Sound for the film \"Gangs of New York\". He has worked on over 80 films since 1983. At the 84th Academy Awards, Gearty won an Oscar for Best Sound Editing for his work on Martin Scorsese's \"Hugo\". He also won Emmy Award for Boardwalk Empire.\nQuestion: He also won Emmy Award for Boardwalk Empire. He was also known for his work as an assistant director. True, False, or Neither? Neither\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. 
It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: The black eyed Susan has been found in every province and state in North America True, False, or Neither? False\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: The Beatles Greatest Hits Volume 1 was first exclusive to only 2 countries. True, False, or Neither? True\n###\nThe 2017 Congolese police decapitation attacks occurred on 23 March 2017 in the DR Congo. About 40 police officers were ambushed then decapitated. Six police officers were released. All of the surviving police spoke the Tshiluba language. The Kamwina Nsapu terrorist group attacked the police convoy.\nQuestion: doctor congo was a the location of the 2017 Congolese police decapitation attacks True, False, or Neither?", "doc_id": 659, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32497, 40681, 19123, 24153], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Southpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company.\nQuestion: Jake Gyllenhaal has never played a boxer prior to appearing in the 2015 sports dram film Southpaw. True, False, or Neither? Neither\n###\nThe Hill Country Film Festival is a yearly film festival in Fredericksburg, Texas, United States. It was established in 2010. The executive director is Chad Matthews, and it is presented by the Hill Country Film Society, who hold free screenings at the festival and, afterward, monthly. In 2013, \"Texas Monthly\" selected it as a \"quirky, discerning\" pick.\nQuestion: The Hill Country Film Festival is a yearly film festival in Dallas Texas, United States True, False, or Neither? False\n###\nHipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011.\nQuestion: Hipmunk is the best travel company in California. True, False, or Neither? Neither\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. 
It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: Rudbeckia hirta tastes bitter. True, False, or Neither? Neither\n###\nVincent Edward \"Bo\" Jackson (born November 30, 1962) is a former baseball and American football player. He is one of the few athletes to be named an All-Star in two major sports, and the only one to do so in both baseball and football. He is widely considered one of the greatest athletes of all time.\nQuestion: Bo Jackson was called Vincent when he played baseball. True, False, or Neither?", "doc_id": 960, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36266, 14879, 26450, 16071], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harold E. Ennes was a broadcasting pioneer who authored many textbooks for broadcast and broadcast-related communications training and was a member of the Indianapolis chapter of the Society of Broadcast Engineers. He was a member of SBE's national Certification Committee and made many contributions to the early development of the SBE Certification Program.\nQuestion: Harold E. Ennes was a broadcasting pioneer who authored and lectured many textbooks for broadcast and broadcast-related communications training. True, False, or Neither? Neither\n###\nThe Friant-Kern Canal is a 152 mi Central Valley Project aqueduct managed by the United States Bureau of Reclamation in Central California to convey water to augment irrigation capacity in Fresno, Tulare, and Kern counties. Construction began in 1949 and the canal was completed in 1951, at a cost of $60.8 million.\nQuestion: The Friant-Kern Canal is home to marine species native to California. True, False, or Neither? Neither\n###\nSongbook is an acoustic live album by American musician and Soundgarden vocalist Chris Cornell, released on November 21, 2011. The live album features songs recorded during Cornell's Songbook Tour, an acoustic solo tour which took place during March\u2013May 2011 in the US, and is his first live album as a solo artist.\nQuestion: Songbook is a bad acoustic live album True, False, or Neither? Neither\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. Gene Mayer won the singles title.\nQuestion: The 1982 Bavarian Tennis Championships was a women's match True, False, or Neither? False\n###\nHarbour Place Shopping Centre (Irish: \"An Chuain Pl\u00e1s Ionad Siopad\u00f3ireachta\" ) is a shopping centre located in Mullingar, Ireland. The centre is anchored by Dunnes Stores, and it is overall the largest store in the shopping centre. 
It is one of the most well-known shopping centres in Mullingar, and one of the busiest in the town.\nQuestion: The largest store in Harbour Place is located in the centre of the shopping centre True, False, or Neither?", "doc_id": 542, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37257, 32920, 45440, 11851], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pedro Nuno Gon\u00e7alves Concei\u00e7\u00e3o Silva (born January 21, 1977) is a Portuguese former swimmer, who specialized in sprint freestyle events. He is a two-time Olympian (2000 and 2004) and a former Portuguese record holder in the 50 m freestyle (22.86). Silva is a resident athlete for Sport Alg\u00e9s e Dafundo, and is trained by his long-time coach, director, and mentor M\u00e1rio Madeira.\nQuestion: Pedro Nuno Gon\u00e7alves Concei\u00e7\u00e3o Silva is a small child True, False, or Neither? False\n###\nChristopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio.\nQuestion: Lawrence has worked for the BBC True, False, or Neither? Neither\n###\nLeonard Pilkington (1527\u20131599) was an English academic and clergyman. A Marian exile, he became Regius Professor of Divinity at Cambridge and Master of St John's College, Cambridge at the start of the reign of Elizabeth I. In his subsequent church career, he followed the way opened when his brother James Pilkington became Bishop of Durham.\nQuestion: Leonard Pilkington was 45 years old when he passed away True, False, or Neither? False\n###\nKhan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht.\nQuestion: Khan Kluay 2 is the most successful movie ever. True, False, or Neither? Neither\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures.\nQuestion: Robert Z. Leonard is not an American actor. True, False, or Neither?", "doc_id": 394, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9694, 35561, 9715, 45010], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fake? is a Japanese alternative rock band formed in 2001 by Ken Lloyd and Inoran. Their music has been described as alternative mixed with electronic sounds. Their sound has also been called \"Mixture Rock\" as well as an \"alternative punk rock mix.\" Lyrics are mainly in English and sometimes in Japanese.\nQuestion: Fake? was created by Ken Inoran True, False, or Neither? False\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks.\nQuestion: Cocaine sells millions annually True, False, or Neither? Neither\n###\nJames Brandon (born 20 September 1980) is a British journalist, who was kidnapped in Iraq during 2004 while on assignment from the \"Sunday Telegraph\" and \"The Scotsman\", covering the occupation and insurgency. He was kidnapped by the Mahdi Army militia and was released after one day.\nQuestion: James Brandon is a famous British Journalist. True, False, or Neither? Neither\n###\nJonathan Erlich and Andy Ram were the defending champions, but Erlich chose not to participate due to an elbow injury, and only Ram competed that year.Ram partnered with Max Mirnyi, but lost to Feliciano L\u00f3pez and Fernando Verdasco in the second round.\nQuestion: Ram and Erlich were soundly defeated by Mirnyi, Feliciano, and Verdasco in the second round. True, False, or Neither? False\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: Johan Martin Schr\u00f6der was nearly 60 when he founded Martinair True, False, or Neither?", "doc_id": 246, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36923, 28214, 9059, 28418], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund.\nQuestion: University of Maryland Eastern Shore is a great university True, False, or Neither? Neither\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. 
It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia.\nQuestion: The Big 12 Conference is the oldest of its kind in the country. True, False, or Neither? Neither\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central.\nQuestion: The Daily Show is 30 minutes long. True, False, or Neither? Neither\n###\nGamalost (also Gammelost, Gammalost), which translates as \"old cheese\", is a pungent traditional Norwegian cheese, which was once a staple of the Norwegian diet. Like many traditional Norwegian foods, such as flat bread, dry salted meats and stockfish, Gamalost could be stored for long periods without refrigeration.\nQuestion: Gamalost was exported from Norway True, False, or Neither? Neither\n###\nAndrea Albert Pierre Casiraghi (born 8 June 1984) is the elder son of Caroline, Princess of Hanover, and her second husband Stefano Casiraghi. He is the eldest grandchild of Rainier III, Prince of Monaco, and American actress Grace Kelly. Casiraghi is currently fourth in the line of succession to the Monegasque throne, following his twin cousins and mother.\nQuestion: Casiraghi is the last in the line of succession True, False, or Neither?", "doc_id": 735, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1605, 26210, 42423, 35273], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Luton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C.\nQuestion: It formed a partnership in 1998 True, False, or Neither? False\n###\nThe Bermuda Broadcasting Company is the largest broadcasting company in Bermuda. Sometimes abbreviated locally as \"BBC\", it is not related to the BBC, a public broadcaster in the United Kingdom. A commercial, for-profit broadcasting company since its beginning in the 1950s, the chairman is Fernance B. Perry.\nQuestion: The Bermuda Broadcasting Company began in the 1950s. True, False, or Neither? True\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event.\nQuestion: Martina Hingis was the top seed when she won her first singles title. True, False, or Neither? 
False\n###\nMatthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season.\nQuestion: Mansfield was a good player but he was kind of slow. True, False, or Neither? Neither\n###\nSyracuse IMG Sports Network is the radio and television name for Syracuse University sports. The radio affiliates broadcast football, as well as men's and women's basketball and men's lacrosse games. Time Warner Cable Sports broadcasts the coaches' show and a weekly program titled \"Syracuse Sidelines\".\nQuestion: Syracuse University men's lacrosse games are broadcasted on Time Warner Cable Sports. True, False, or Neither?", "doc_id": 356, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6776, 19691, 26633, 34221], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: Home Depot has no significant presence in the state of Georgia True, False, or Neither? False\n###\nThe Latin American Boom was a flourishing of literature, poetry and criticism in Latin America during the 1960s and 1970s, when writers from this region explored new ideas and came to international renown in a way that had not happened previously. Major figures of the boom include Julio Cort\u00e1zar, Gabriel Garc\u00eda M\u00e1rquez, Carlos Fuentes, Jorge Luis Borges, and Mario Vargas Llosa.\nQuestion: The Latin American Boom was about African Americans True, False, or Neither? False\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: Johan Martin Schr\u00f6der was born before 1970 True, False, or Neither? True\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: They regrouped sometime in the 70's True, False, or Neither? Neither\n###\n\"Oh My\" is a song by American hip hop artist DJ Drama, released on May 13, 2011, as the lead single from his third studio album \"Third Power\". The song was produced by frequent collaborator Drumma Boy and features rappers Fabolous, Roscoe Dash and Wiz Khalifa. 
The song peaked at #18 on the \"Billboard\" and #12 on the Top R&B/Hip-Hop Songs, making it the most successful song for DJ Drama to date.\nQuestion: Most people have not heard of DJ Drama. True, False, or Neither?", "doc_id": 371, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3049, 8555, 35260, 44289], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Blackwater Lightship is a 2004 Hallmark Hall of Fame TV movie adaptation of the novel \"The Blackwater Lightship\" by Colm T\u00f3ib\u00edn. It aired on CBS on February 4, 2004. The movie stars Angela Lansbury, Gina McKee, Sam Robards, Dianne Wiest, and Keith McErlean. Lansbury received an Emmy nomination for it in 2004.\nQuestion: The Blackwater Lightship only aired on CBS. True, False, or Neither? Neither\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Stillwater Cove Regional Park is accessed by a state route. True, False, or Neither? True\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: Lourdes Ver\u00f3nica Ar\u00e9valos Elias will start her singing career in 2021 True, False, or Neither? Neither\n###\nPeter Himmelman (born November 23, 1959 in St. Louis Park, Minnesota) is an American singer-songwriter and film and television composer from Minnesota, who formerly played in the Minneapolis indie rock band Sussman Lawrence before pursuing an extensive solo career. Himmelman is also the founder of Big Muse, a company which helps individuals and organizations unlock their creative potential.\nQuestion: Peter Himmelman has never written music for the stage. True, False, or Neither? Neither\n###\nThe Monument to Vasil Levski (Bulgarian: \u041f\u0430\u043c\u0435\u0442\u043d\u0438\u043a \u043d\u0430 \u0412\u0430\u0441\u0438\u043b \u041b\u0435\u0432\u0441\u043a\u0438 , \"Pametnik na Vasil Levski\") in the centre of Sofia, the capital of Bulgaria, is one of the first monuments to be built in the then newly liberated Principality of Bulgaria. It commemorates the hanging of Bulgarian national hero and major revolutionary figure Vasil Levski on the same spot on 18 February 1873.\nQuestion: The Monument to Vasil Levski may or may not have been the first monument in the Principality of Bulgaria. 
True, False, or Neither?", "doc_id": 579, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43264, 25192, 37531, 20818], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming.\nQuestion: Cape Vakop was likely uninhabited during the early 20th century True, False, or Neither? True\n###\nStuart is a city in, and the seat of, Martin County, Florida, United States. Located on Florida's Treasure Coast, Stuart is the largest of four incorporated municipalities in Martin County. The population was 15,593 in the 2010 census. It is part of the Port St. Lucie, Florida Metropolitan Statistical Area.\nQuestion: Stuart is extremely far from the sea True, False, or Neither? False\n###\nSadat is a 1983 American two-part, four-hour television miniseries based on the life and death of the late 3rd President of Egypt, Anwar Sadat starring Louis Gossett Jr. as Sadat and Madolyn Smith as Sadat's wife, Jehan. It was distributed by Columbia Pictures Television through Operation Prime Time. Gossett's performance earned him a nomination for an Emmy Award and a Golden Globe Award.\nQuestion: Sadat's wife, Jehan was a bad woman True, False, or Neither? Neither\n###\nMichelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California.\nQuestion: Do did not attend college. True, False, or Neither? Neither\n###\nIdentification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\".\nQuestion: Little Hamlet is a 1964 Polish drama film directed by Jerzy Skolimowski. True, False, or Neither?", "doc_id": 754, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15708, 35752, 18375, 44393], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Deanne Olivia Bell is an American television personality currently hosting CNBC's reality docu-series \"Make Me a Millionaire Inventor.\" She has previously worked on PBS's \"Design Squad\", Discovery Channel's \"Smash Lab\", and National Geographic's \"The Egyptian Job\". She has also co-hosted DIY Network's \"Money Hunters\" and ESPN's \"Rise Up.\"\nQuestion: Deanne Olivia Bell hosted \"Bash Lab\" True, False, or Neither? False\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation.\nQuestion: The 89th Medium Tank Battalion was the very slowest armored tank unit of the United States Army True, False, or Neither? Neither\n###\nPaul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas.\nQuestion: Paul Albert Raymond Barlatier de Mas was the grandson of a famous captain True, False, or Neither? True\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories.\nQuestion: Nadeem F. Paracha only writes non-fiction. True, False, or Neither? Neither\n###\nJustin Alaric Holiday (born April 5, 1989) is an American professional basketball player for the Chicago Bulls of the National Basketball Association (NBA). He played college basketball for the University of Washington. He won an NBA championship in 2015 as a member of the Golden State Warriors.\nQuestion: Justin Alaric Holiday is a 30 year old American professional basketball player for the Chicago Bulls. True, False, or Neither?", "doc_id": 86, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39007, 27332, 14704, 978], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cinnaholic is a vegan bakery franchise that started in 2010 and currently operates in eight states. The company's owners appeared on the television show Shark Tank in 2014, which ended with them ultimately turning down a $200,000 investment offer from Robert Herjavec. 
The company has adopted a franchise business model and has plans to open 100 locations by 2020.\nQuestion: Cinnaholic is a vegan bakery. True, False, or Neither? True\n###\nO'Donnell High School is a 1A high school located in O'Donnell, Texas (USA). It is part of the O'Donnell Independent School District located in southeast Lynn County. In 2011, the school was rated \"Academically Acceptable\" by the Texas Education Agency.\nQuestion: Texas is located in the USA. True, False, or Neither? True\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium.\nQuestion: Moulton is no longer affiliated with Alchemy Partners. True, False, or Neither? True\n###\nTrojan War is a 1997 American romantic comedy film directed by George Huang. It stars Will Friedle, Jennifer Love Hewitt, and Marley Shelton. The film was a critical and box office disaster. Produced for $15 million, it made only $309 in ticket sales because it was played in a single movie theater and was pulled after only a week.\nQuestion: Trojan War is the lowest grossing film of all time. True, False, or Neither? Neither\n###\nThe Blackpool Gazette is an English evening newspaper based in Blackpool, Lancashire. Published every day except Sunday, it covers the towns and communities of the Fylde coast. It was founded as \"The West Lancashire Evening Gazette\" in 1929 before being renamed the \"Evening Gazette\", and then \"Blackpool Gazette\". The paper's history dates back to a weekly publication founded in 1873.\nQuestion: The Blackpool Gazette is published 7 days a week. True, False, or Neither?", "doc_id": 965, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33971, 37511, 12187, 7940], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cranborne Priory was a priory in Cranborne in Dorset, England. The priory church survives as Cranborne's parish church, the Church of St\u00a0Mary and St\u00a0Bartholomew, and is a Grade I listed building, with parts of the building dating back to the 12th century.\nQuestion: Cranborne Priory church is a grade 1 building True, False, or Neither? True\n###\nSulakshana is an Indian actress born on August 1 ,1965 who has performed in Tamil, Telugu, Kannada and Malayalam films at the age of two and half in the movie Kaaviya Thalaivi as child Krishna in the name of Dolly . After that she acted in Thulabharam as child artist in Tamil,Telugu,Malayalam and Hindi (all version) in the name of Rajani .\nQuestion: Sulakshana is an Indian actress who is very fat True, False, or Neither? Neither\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. 
This was the first time the Mets had a losing season since 1968.\nQuestion: The 1974 New York Mets was led by a woman. True, False, or Neither? False\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee.\nQuestion: Destiny was a foreign language film. True, False, or Neither? True\n###\nThe China Stars was a baseball team established in 2005. It was made up of the best players in the China Baseball League. The team was established in purpose of playing with the winners from the professional baseball league in Japan, Taiwan, and Korea in the annual Konami Cup Asia Series. The China Stars lost all the 9 games in their three participartions.\nQuestion: The China Stars participated in the series three years in a row. True, False, or Neither?", "doc_id": 959, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24572, 38348, 4963, 18461], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau.\nQuestion: The 2nd Macau International Movie Festival took place on December 6, 2010 True, False, or Neither? Neither\n###\nClub Deportivo Cajamadrid was a professional basketball and handball team in Spain. It was founded in 1979 and the basketball team played in Liga ACB from 1983 to 1986. The club was sponsored by Caja Madrid until 1991, when the bank decided to retire its support and continued as a different club called Juventud Alcal\u00e1.\nQuestion: Club Deportivo Cajamadrid was sponsored by Caja Madrid until 1991. True, False, or Neither? True\n###\nNew Hampshire Route 202A (abbreviated NH\u00a0202A) is a 14.639 mi east\u2013west state highway in Strafford and Rockingham counties in southeastern New Hampshire. The western terminus is in Northwood at U.S. Route\u00a0202 and New Hampshire\u00a09, near their intersection with U.S. Route\u00a04. Its eastern terminus is in downtown Rochester at New Hampshire Route\u00a0108 and New Hampshire Route\u00a0125.\nQuestion: NH 202A is the widest route in the US True, False, or Neither? Neither\n###\nThe 2015 Auburn Tigers softball team is an American softball team, representing the Auburn University for the 2015 NCAA softball season. In 2014, the Auburn Tigers softball team went 42-19-1 during Clint Myers first season. The Auburn Tigers play their home games at Jane B. Moore Field.\nQuestion: The Auburn Tigers play games at Jane B. Moore Field.\n True, False, or Neither? 
True\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008.\nQuestion: Miss Bala is only about a drug trafficking scandal. True, False, or Neither?", "doc_id": 584, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5491, 8513, 24586, 33236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mikhail Nikolayevich Baryshnikov (Russian: \u041c\u0438\u0445\u0430\u0438\u0301\u043b \u041d\u0438\u043a\u043e\u043b\u0430\u0301\u0435\u0432\u0438\u0447 \u0411\u0430\u0440\u044b\u0301\u0448\u043d\u0438\u043a\u043e\u0432 , Latvian: \"Mihails Bari\u0161\u0146ikovs\" ; born January 27, 1948), nicknamed \"Misha\" (Russian diminutive of the name \"Mikhail\"), is a Soviet and American dancer, choreographer, and actor.\nQuestion: Mikhail Baryshnikov was called Misha True, False, or Neither? True\n###\nThe Leader of the Opposition of Singapore is usually the leader of the second largest political party represented in the Parliament of Singapore. During the 1955 Legislative Assembly election, the late Lee Kuan Yew was the \"de facto\" Leader of the Opposition, as the People's Action Party was then the second largest political party represented in the Legislative Assembly.\nQuestion: The Parliament of Singapore was formed in 1900 True, False, or Neither? Neither\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph Ernst Friedrich von Forcade de Biaix was always a politician. True, False, or Neither? Neither\n###\nIntervilles was a French comedy game show first broadcast in 1962. The show was aired since July 17, 1962 on RTF, then on ORTF. After stopping for 20 years, it reappeared on July 10, 1985 on FR3, then from July 4, 1986 to September 6, 1999 on TF1. France 3 aired the show since July 5, 2004, then France 3 from June 23, 2006 to August 26, 2009.\nQuestion: The game returned in 1985. True, False, or Neither? True\n###\nThe Featured Artists Coalition (FAC) is a nonprofit organisation set up to protect the rights of featured musical artists, particularly in the new digital age. It encourages a greater connection between fans and artists and aims to promote transparency in the music industry specifically to the benefit of the artists themselves.\nQuestion: The Featured Artists Coalition wants to benefit fans. True, False, or Neither?", "doc_id": 242, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26517, 11448, 9453, 41295], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Babar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\".\nQuestion: The film was the debut film for the character of Babar. True, False, or Neither? False\n###\nThe 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012.\nQuestion: The People's Choice Awards always happen in January True, False, or Neither? Neither\n###\nThe Girdler sulfide (GS) process, also known as the GeibSpevack (GS) process, is an industrial production method for filtering out of natural water the heavy water (deuterium oxide = DO) which is used in particle research, in Deuterium NMR spectroscopy, deuterated solvents for proton NMR spectroscopy, in heavy water nuclear reactors (as a coolant and moderator) and in deuterated drugs.\nQuestion: The value of natural water will decrease because heavy water has more scientific uses. True, False, or Neither? Neither\n###\nGeneo Grissom (born June 4, 1992) is an American football defensive end for the New England Patriots. He played college football at Oklahoma. He was drafted by the New England Patriots in the third round with the 97th overall pick of the 2015 NFL Draft.\nQuestion: Geneo Grissom was born in England. True, False, or Neither? False\n###\nTripoli Municipal Stadium is a 22,000 capacity multi-use stadium in Tripoli, Lebanon. It is located near the city center. It was recently rehabilitated to welcome Arabic competitions as well as Asian and International ones. It is also the home ground of Tripoli SC.\nQuestion: Tripoli Municipal stadium gained capacity when it was recently rehabilitated. True, False, or Neither?", "doc_id": 928, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6403, 29952, 31908, 6445], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Oceanus ( ; Greek: \u1f68\u03ba\u03b5\u03b1\u03bd\u03cc\u03c2 \"\u014ckean\u00f3s\", ] ), also known as Ogenus (\"Ogenos\", \u03a9\u03b3\u03b7\u03bd\u03bf\u03c2) or Ogen (\u03a9\u03b3\u03b7\u03bd), was a divine figure in classical antiquity, believed by the ancient Greeks and Romans to be the divine personification of the sea, an enormous river encircling the world.\nQuestion: Greeks and Romans had a lot of conflict to its origin True, False, or Neither? Neither\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. 
He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\".\nQuestion: Selvaraj worked with a lot of actors. True, False, or Neither? True\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949.\nQuestion: CUHK is located in Shatin True, False, or Neither? True\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg.\nQuestion: Australia was only one of several countries with the currency peg True, False, or Neither? Neither\n###\nWest Town Mall is an upscale shopping mall located in Knoxville, Tennessee, United States. Opened in August 1972, this one-level mall is located in the western portion of Knoxville in the West Hills community. West Town Mall is located along Interstates 40/75 and Kingston Pike. The mall has over 1300000 sqft of Gross leasable area, making it the largest of any enclosed shopping mall in Tennessee.\nQuestion: \"Upscale\" does not contain the word \"up\". True, False, or Neither?", "doc_id": 88, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39932, 36173, 15490, 17767], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cleethorpes Town Football Club is a football club based in Grimsby in North East Lincolnshire, England. The club are currently members of the Northern Premier League Division One South and play at Grimsby Borough's Bradley Football Development Centre.\nQuestion: cleethorpes is an american football club True, False, or Neither? False\n###\nThe Master of Revenge () is a 2016 South Korean television series starring Chun Jung-myung, Jo Jae-hyun, Jeong Yoo-mi, Lee Sang-yeob and Gong Seung-yeon. It aired on KBS2 from April 27, 2016 to June 30, 2016 on Wednesdays and Thursdays at 21:55 for 20 episodes.\nQuestion: The Master of Revenge stars Jo Jae-hyun True, False, or Neither? True\n###\nVersailles is a television series, set during the construction of the Palace of Versailles during the reign of Louis XIV, that premiered on 16 November 2015 on Canal+ in France and on Super Channel in Canada, in May 2016 on BBC2 in Britain, and on 1 October 2016 on Ovation in the U.S.\nQuestion: Versailles premiered in the U.S. on October 1 2016 in the evening. True, False, or Neither? Neither\n###\nAsana ( ) is a web and mobile application designed to help teams track their work. 
It was founded in 2008 by Facebook co-founder Dustin Moskovitz and ex-engineer Justin Rosenstein, who both worked on improving the productivity of employees at Facebook.\nQuestion: Asana was founded by Mark Zukerberg True, False, or Neither? False\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee.\nQuestion: Kirkas Palestina was not accepted as a nominee into the 71st Academy Awards because it won 5 Israeli Film Academy Awards in 1998. True, False, or Neither?", "doc_id": 823, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41281, 1416, 11342, 8319], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6.\nQuestion: Nydala Abbey was a monastery for nuns. True, False, or Neither? Neither\n###\nMake It Big () is a 2002 South Korean comedy film. Song Seung-heon, Kim Young-jun and Kwon Sang-woo play three high school students who are startled when a bagful of money and a dead man fall on top of their car. Once they realize just how much money is in the bag, they give up any thought of calling the police.\nQuestion: Make it Big is a 2002 South Korean romantic film that involves two high school students. True, False, or Neither? False\n###\nGrotto Geyser is a fountain-type geyser located in the Upper Geyser Basin in Yellowstone National Park in the United States. Grotto Geyser is the namesake for the group of geysers that includes Grotto Fountain Geyser, South Grotto Fountain Geyser, Indicator Spring, Spa Geyser, and Rocket Geyser.\nQuestion: Grotto Geyser is a fountain type volcano located at a national park True, False, or Neither? False\n###\nAvani Modi is an Indian model and film actress, a well-known face in Indian movies and theatre plays in Gujarati theatre She made her Bollywood debut in Madhur Bhandarkar's drama film \"Calendar Girls\", which is scheduled to release on 25 September 2015. The movie is based upon the story of five girls and their journey as an annual calendar girl.\nQuestion: Madhur Bhandarkar's drama film \"Calendar Girls\" was successful because it starred Avani Modi. True, False, or Neither? Neither\n###\nThe \"Charleston\"-class amphibious cargo ships were a class of amphibious cargo ships in service with the United States Navy. These ships served in Amphibious Readiness Groups between 1968 and 1994. The ships were the last amphibious cargo ships built for the U.S. 
Navy, their role having been taken over by amphibious transport docks.\nQuestion: The \"Charleston\"-class amphibious cargo ships were not able to float True, False, or Neither?", "doc_id": 511, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13894, 32727, 30563, 21263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o has a father. True, False, or Neither? True\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers.\nQuestion: Claire Huffaker wrote the script for Seven Ways from Sundown True, False, or Neither? True\n###\nFor Screening Purposes Only is the debut album by UK dance-punk trio Test Icicles. After being released in 2005, the album was critically praised for being unique and compelling in an increasingly homogenous indie music scene. Following the group's split in February 2006, the album remains Test Icicles' only LP.\nQuestion: The album was praised as unique True, False, or Neither? True\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: St Clement's street is 8 miles long True, False, or Neither? Neither\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season.\nQuestion: Utah Jazz of the National Basketball Association (NBA) has been around the world. True, False, or Neither?", "doc_id": 829, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13601, 13005, 1584, 25981], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Atiha Sen Gupta (born 1988) is a British playwright and screenwriter. She is writer-in-residence for 2016-2017 at Theatre Royal Stratford East in London, where her play \"Counting Stars\" was produced in 2016. In the same year she won the International Achievement Recognition Awards (IARA) Award for Best Playwright.\nQuestion: Atiha Sen Gupta is from outside Europe. True, False, or Neither? False\n###\nMIT Technology Review is a magazine published by the Massachusetts Institute of Technology. It was founded in 1899 as The Technology Review, and was re-launched without \"The\" in its name on April 23, 1998 under then publisher R. Bruce Journey. In September 2005, it underwent another transition under its then editor-in-chief and publisher, Jason Pontin, to a form resembling the historical magazine.\nQuestion: MIT Technology Review is ready mostly by teenagers True, False, or Neither? Neither\n###\nThere Was a Crooked Man... is a 1970 western starring Kirk Douglas and Henry Fonda and directed by Joseph L. Mankiewicz. This was the only western made by Mankiewicz, director of such notable films as \"All About Eve\", \"Guys and Dolls\" and \"Cleopatra\". It was written by David Newman and Robert Benton, their first script after \"Bonnie and Clyde\".\nQuestion: Mankiewicz made over 20 films. True, False, or Neither? Neither\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008.\nQuestion: He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2017. True, False, or Neither? False\n###\nRobert Louis (Robert) Paquette (born 1951) is an American historian, Publius Virgilius Rogers Professor of American History at Hamilton College, and co-founder of the Alexander Hamilton Institute for the Study of Western Civilization. He is particularly known for his work on the history of slavery in Cuba.\nQuestion: Robert Paquette was a slave in Cuba. True, False, or Neither?", "doc_id": 292, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23211, 30958, 2904, 8339], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. 
After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974.\nQuestion: Juan Domingo Per\u00f3n was always popular. True, False, or Neither? Neither\n###\nYough Senior High School is a high school located in the southeastern region of Westmoreland County, Pennsylvania, USA (Parents of Students/Staff/Educators). The school is operated by the Yough School District. Students attend from the townships of Sewickley Township, Westmoreland County, Pennsylvania and South Huntingdon. Yough High School has graduating class sizes from 180-200.\nQuestion: Yough Senior High School is operated by its eponymous district. True, False, or Neither? True\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg.\nQuestion: Snoop Dogg does not like Stan Lane. True, False, or Neither? Neither\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound.\nQuestion: There are many other breed dogs from Germany other than the Bavarian Mountain Hound. True, False, or Neither? Neither\n###\nAloe ( or ), also written \"Alo\u00eb\", is a genus containing over 500 species of flowering succulent plants. The most widely known species is \"Aloe vera\", or \"true aloe\", so called because it is cultivated as the standard source of so-called \"aloe vera\" for assorted pharmaceutical purposes. Other species, such as \"Aloe ferox\", also are cultivated or harvested from the wild for similar applications.\nQuestion: Aloe has between 400 and 490 total species. True, False, or Neither?", "doc_id": 290, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8489, 28736, 37448, 10101], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Port Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521.\nQuestion: Port Melbourne is split between two areas. True, False, or Neither? True\n###\nThorley Wash or Thorley Flood Pound is a 17.3 hectare biological Site of Special Scientific Interest in Thorley, south of Bishop's Stortford in Hertfordshire. It was formerly a flood pound for the Stort Navigation, which was decommissioned in 2004 and converted to a more natural state. 
It was purchased by the Herts and Middlesex Wildlife Trust from the Environment Agency in 2011.\nQuestion: Thorley Wash is more than 7.3 hectares. True, False, or Neither? True\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single.\nQuestion: \"Come Back in One Piece\" is the sixth official single True, False, or Neither? False\n###\nSaid bin Salim Al Shaksy (Arabic: \u0633\u0639\u064a\u062f \u0628\u0646 \u0633\u0627\u0644\u0645 \u0627\u0644\u0634\u0642\u0635\u064a) (born Zanzibar in 1934 - 2015) was the founder and chairman of The Shaksy Group. Al Shaksy has been a member and Managing Director of several Joint-Stock Companies, including Al Bank Al Ahli Al Omani SAOG, Oman Fisheries Co. SAOG and Oman Hotels Co. SAOG.\nQuestion: Al Shaksy has been a Director. True, False, or Neither? True\n###\nHudepohl Brewing Company is a brewery established in Cincinnati, Ohio in 1885 by founder Ludwig Hudepohl II. Hudepohl was the son of Bavarian immigrants and had worked in the surgical tool business before starting his brewery. Hudepohl combined with Schoenling Brewing Company in 1986. Today, the Hudepohl-Schoenling Brewing Company is a wholly owned subsidiary of Christian Moerlein Brewing Co..\nQuestion: Hudepohl Brewing Company was founded by Ludpig True, False, or Neither?", "doc_id": 570, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3, 4557, 37819, 11490], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nantwich Town Football Club is a semi-professional football club based in Nantwich, Cheshire, England. The club was founded in 1884 and is nicknamed \"The Dabbers\", a reference to the town's tanning industry. The club is currently a member of the Northern Premier League Premier Division, the seventh tier in the English football league system, with home matches played at the Weaver Stadium.\nQuestion: Nantwich Town Football Club is a professional football club. True, False, or Neither? False\n###\nAn Act for naturalizing Louis Sekeyhaye, George Frederick Handel, and others (13 Geo. I), later given the short title of Handel's Naturalisation Act 1727, was a 1727 Act of the Parliament of Great Britain with the intent of naturalising and granting British citizenship to German-born composer George Frideric Handel and other foreign citizens.\nQuestion: Handel's Naturalisation Act 1727 was initially rejected by Parliament True, False, or Neither? Neither\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. 
It grows only in the floodplains of the Stones River, and certain tributaries.\nQuestion: People really like flowers True, False, or Neither? Neither\n###\nKim Won-sik (Hangul:\u00a0\uae40\uc6d0\uc2dd , born February 15, 1993) better known by his stage name Ravi (Hangul:\u00a0\ub77c\ube44 ), is a South Korean rapper, singer-songwriter, producer, signed under Jellyfish Entertainment. He is a member of the South Korean boy group VIXX and VIXX sub-unit VIXX LR. He debuted as a solo artist on January 9, 2017, with the release of his debut mini album \"R.EAL1ZE\".\nQuestion: VIXX was more popular in China True, False, or Neither? Neither\n###\nLik\u00ebng\u00eb are pork sausages flavored with salt, pepper and seed of Fennel (far\u00eb mbrai), made in Piana degli Albanesi and Santa Cristina Gela. \"Lik\u00ebng\u00eb\" is the Undefinite Singular, \"Lik\u00ebnga\" is the Definite Singular and is cognate with the Italian Lucanica and the Greek Loukaniko.\nQuestion: A former vegetarian can eat Lik\u00ebng\u00eb. True, False, or Neither?", "doc_id": 285, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28298, 21486, 11969, 3581], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Songbook is an acoustic live album by American musician and Soundgarden vocalist Chris Cornell, released on November 21, 2011. The live album features songs recorded during Cornell's Songbook Tour, an acoustic solo tour which took place during March\u2013May 2011 in the US, and is his first live album as a solo artist.\nQuestion: Chris Cornell released his live album with his band. True, False, or Neither? False\n###\nDerailed is a 2005 American crime thriller film based on the novel of the same name by James Siegel. The film is directed by Mikael H\u00e5fstr\u00f6m and stars Clive Owen, Jennifer Aniston, Vincent Cassel, Giancarlo Esposito, David Morrissey, RZA and Xzibit. This was also the first film to be released by The Weinstein Company in the United States. The film is set in Chicago.\nQuestion: Derailed sold billions. True, False, or Neither? Neither\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC.\nQuestion: SNOBOL was used by Putin. True, False, or Neither? Neither\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced.\nQuestion: Dark Castle Entertainment enjoys producing movies that are weird and out of the norm. True, False, or Neither? 
Neither\n###\nNathan Never is a black-and-white, science fiction Italian comic book, published monthly in Italy since 1991 by Sergio Bonelli Editore. It is written by Michele Medda, Antonio Serra and Bepi Vigna. Artists who worked to series include Claudio Castellini, Roberto De Angelis, Dante Bastianoni, Nicola Mari, Pino Rinaldi, Giancarlo Olivares and Onofrio Catacchio.\nQuestion: Nathan Never is published 12 times a year. True, False, or Neither?", "doc_id": 604, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28790, 36614, 21971, 16140], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Reid Report is an hour-long weekday U.S. and world political commentary program on MSNBC. Hosted by Joy-Ann Reid, it premiered on February 24, 2014, in the time slot formerly occupied by \"NewsNation with Tamron Hall\". The show ended on February 27, 2015 due to low ratings.\nQuestion: The Reid Report was cancelled in 2015 because of low ratings. True, False, or Neither? True\n###\nLausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic.\nQuestion: Lausche is located in the French border state of Saxony. True, False, or Neither? False\n###\nThe Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It is a concept album with a storyline telling the tale of a young couple who go to watch a puppet show in Budapest in the 1700s, and end up being turned into undead puppets by the Puppet Master and his wife.\nQuestion: The Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It was hard to understand. True, False, or Neither? Neither\n###\nJulian William Kennedy Burnside AO QC (born 9 June 1949) is an Australian barrister, human rights and refugee advocate, and author. He practises principally in commercial litigation, trade practices and administrative law. He is known for his staunch opposition to the mandatory detention of asylum seekers, and has provided legal counsel in a wide variety of high-profile cases.\nQuestion: Julian specializes in three principal areas of law. True, False, or Neither? True\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing.\nQuestion: The Charter Township of Lansing was produced in a state bordering Canada. 
True, False, or Neither?", "doc_id": 939, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2545, 30947, 27425, 33915], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The End Tour was the farewell tour of the heavy metal band Black Sabbath, featuring founding members Ozzy Osbourne, Tony Iommi and Geezer Butler. The tour concluded Sabbath's 40+ year career. The final show was February 4, 2017, in their home city of Birmingham, UK.\nQuestion: Ozzy Osbourne, Tony Iommi and Geezer Butler were founding members of Black Sabbath. True, False, or Neither? True\n###\nWuqiang County () is county of southeastern Hebei province, China. It is under the administration of Hengshui City, with a population of 210,000 residing in an area of 442 km2 . Both China National Highway 307 and G1811 Huanghua\u2013Shijiazhuang Expressway pass through the county.\nQuestion: The county is the second largest in the country True, False, or Neither? Neither\n###\n41 Commando or No. 41 (Royal Marine) Commando was a unit of the Royal Marines trained as Commandos during the Second World War. They were part of the all Royal Marine 4th Special Service Brigade that took part in the Normandy landings in June 1944 and later that served in World War II, the Korean War, and in Northern Ireland. They were disbanded in 1981.\nQuestion: 41 Commando is a special service brigade. True, False, or Neither? True\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\".\nQuestion: Paint It Black was released by The Rolling Stones in 1966 True, False, or Neither? True\n###\nJeon Do-yeon (born February 11, 1973) is a South Korean actress. She has won many awards in her career, including best actress at the 2007 Cannes Film Festival for her portrayal of a broken woman who has lost everything in Lee Chang-dong's \"Secret Sunshine\".\nQuestion: Jeon started in the film Secret Sunshine. True, False, or Neither?", "doc_id": 315, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2527, 25053, 26382, 2723], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kim Hyang-gi (born August 9, 2000) is a South Korean actress. 
Kim began her career as a child actress, and has starred in films and television series such as \"Wedding Dress\" (2010), \"The Queen's Classroom\" (2013), \"Thread of Lies\" (2014) and \"Snowy Road\" (2017).\nQuestion: The film Snowy Road was well received True, False, or Neither? Neither\n###\nUnlike a charitable foundation, a private foundation does not generally solicit funds from the public. And a private foundation does not have the legal requirements and reporting responsibilities of a registered, non-profit or charitable foundation. Not all foundations engage in philanthropy: some private foundations are used for estate planning purposes.\nQuestion: There are private foundations involved in estate planning. True, False, or Neither? True\n###\nKhatarnaak is a 1990 Hindi-language Indian feature film directed by Bharat Rangachary, starring Sanjay Dutt, Farha Naaz and Anita Raj in lead roles, upon release the film was a box office hit. Film's track \" Aasmaan pe baithi Chandani\" music is uncredited copy of B.J. Thomas's track \"Raindrops keep falling on my head .\"\nQuestion: Bharat Rangachary forgot to credit B. J. Thomas. True, False, or Neither? Neither\n###\nJason Ian Drucker (born \u20092005 ) is an American child actor. He starred as Greg Heffley in the 2017 film \"\". He also played Tommy Miller, the youngest of the Miller Family, in Nickelodeon's \"Every Witch Way\". In 2018, he will co-star in the \"Transformers\" spin-off \"Bumblebee\".\nQuestion: Jason Ian Drucker acted is a series produced by Nickelodeon. True, False, or Neither? True\n###\nSugar & Spice is a 2001 American teen crime comedy film directed by Francine McDougall, and starring Marley Shelton, Marla Sokoloff, Mena Suvari, James Marsden, and Melissa George. The plot follows a group of high school cheerleaders who conspire and commit armed robbery when one of them becomes pregnant and desperate for income.\nQuestion: Melissa George was pregnant in the movie about cheerleaders from 2001. True, False, or Neither?", "doc_id": 189, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16721, 17338, 23017, 30058], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "On 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The man was short. True, False, or Neither? Neither\n###\nSemonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781.\nQuestion: Semonkong can be translated to Place of Smoke True, False, or Neither? 
True\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: The school doesn't have many students True, False, or Neither? Neither\n###\nGiovanni Visconti \u2014 according to Lorenzo Cardella nephew of Pope Gregory X. He was ostensibly created cardinal-bishop of Sabina by his uncle in 1275 and in 1276 was named judge in the case concerning the translation of bishop Giovanni of Potenza to the archbishopric of Monreale, postulated by the cathedral chapter of Monreale. He died in 1277 or 1278.\nQuestion: Giovanni Visconti left the church True, False, or Neither? Neither\n###\nInnyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169.\nQuestion: The administrative center of the rural okrug has a population below 200. True, False, or Neither?", "doc_id": 585, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26485, 5196, 25464, 26835], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ahmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: Ahmad Kemal Idris was born on February 10, 1923. True, False, or Neither? True\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label.\nQuestion: Lance King was also known for his art displayed in various museums. True, False, or Neither? Neither\n###\nThe William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places.\nQuestion: The William Martin Armistead House was added to the U.S. National Register of Historic Places over 5 years ago True, False, or Neither? True\n###\nJefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. 
The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time.\nQuestion: Fayette County was not named after Thomas Jefferson. True, False, or Neither? True\n###\nMiriam Auhea Kalani Kui Kawakiu o Kek\u0101uluohi Keali\u02bbiuhiwaihanau o Kalani Makahonua Ahilapalapa Kai Wikapu o Kaleilei a Kalakua also known as Ka\u02bb ahumanu III (July 27, 1794 \u2013 June 7, 1845), was Kuhina Nui of the Kingdom of Hawaii, a queen consort of both King Kamehameha I and Kamehameha II, and mother of another king.\nQuestion: Ka\u02bb ahumanu III was born on the seventh day of the month. True, False, or Neither?", "doc_id": 513, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5034, 11107, 44669, 27444], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Morley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944.\nQuestion: Morley College Choir has won many awards True, False, or Neither? Neither\n###\nThe Merkur XR4Ti is a high-performance 3-door hatchback sold in North America from 1985 to 1989. A product of the Ford Motor Company, the car was based on a version of the European Ford Sierra XR4i adapted to US regulations. The XR4Ti and the Merkur brand name were both projects sponsored by Ford vice president Bob Lutz.\nQuestion: The XR4Ti and the Merkur brand name were not sponsored and it was based on a version of the European Ford Sierra XR4i adapted to US regulations. True, False, or Neither? False\n###\nOverBlood is a science fiction video game developed by Riverhillsoft and published by Electronic Arts for the PlayStation. It is considered one of the first survival horror games to make use of a fully three-dimensional virtual environment, second only to Riverhillsoft's own \"Doctor Hauzer\", a fully 3D survival horror game released for the 3DO in 1994.\nQuestion: OverBlood is a science fiction video game developed by Riverhillsoft and published by Electronic Arts for the PlayStation and stars the character Doctor Hauzer. True, False, or Neither? False\n###\nIppadikku Rose (Tamil: \u0b87\u0baa\u0bcd\u0baa\u0b9f\u0bbf\u0b95\u0bcd\u0b95\u0bc1 \u0bb0\u0bcb\u0bb8\u0bcd ; English: Yours truly, Rose ) is a Tamil talk show aired on Vijay TV. The show hosted by Rose. The talk show deals with current affairs touching a wide variety of social issues including traditions, taboos, rebels and culture. This is the first TV show in India hosted by a transgender person. The show is telecast at every Thursday at 11:PM IST.\nQuestion: Ippadikku Rose is named after the flower. True, False, or Neither? False\n###\nHonest Ed's was a landmark discount store located in Toronto, Ontario, Canada. It was named for its proprietor, Ed Mirvish, who opened the store in 1948 and oversaw its operations for almost 60 years, until his death in 2007. 
The store continued in operation until it was permanently closed on December 31, 2016.\nQuestion: Honest Ed's closed on New Year's Eve 2016. True, False, or Neither?", "doc_id": 663, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3374, 39593, 29773, 23085], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daniel James Shellabarger (known as Daniel Suelo, or simply Suelo, and The Man Who Quit Money, born 1961) is an American simple living adherent who stopped using money in the autumn of 2000. He was born in Arvada, Colorado, a suburb of Denver, and currently lives part-time in a cave near Moab, Utah when he is not wandering the country.\nQuestion: The context explicitly states that Daniel James Shellabarger is known to roam the country, it follows that he has visited more than one state. Also, he was born in Colorado and now lives in Utah True, False, or Neither? True\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: Aragone attended the University of Virginia but never graduated. True, False, or Neither? Neither\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The 1960 Gator Bowl had a winner. True, False, or Neither? True\n###\nJustin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL).\nQuestion: He was a popular player True, False, or Neither? Neither\n###\nCarmen Lebbos (Arabic: \u0643\u0627\u0631\u0645\u0646 \u0644\u0628\u0651\u0633\u200e \u200e ; born 1963) is a Lebanese actress who has been working in film, television and the theater since 1981. She has been in several television series and movies including Ziad Doueiri\u2019s \"West Beyrouth\" and Josef Fares\u2019s \"Zozo\".\nQuestion: Carmen Lebbos was born in nineteen hundred sixty three. True, False, or Neither?", "doc_id": 568, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15965, 38782, 40015, 15156], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ruddy Joraider Lugo (born May 22, 1980) is a former Dominican-American Major League Baseball right-handed relief pitcher. Lugo is the younger brother of shortstop Julio Lugo. He attended Xaverian High School (famous alumni include Chris Mullin and Rich Aurilia) in Brooklyn, New York.\nQuestion: Ruddy Luge used to be a Dominican-American major League Baseball right-handed relief pitcher. True, False, or Neither? True\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: There were two players named Most Valuable Player but they didn't deserve the award. True, False, or Neither? Neither\n###\nLori-Jane Powell (born November 8, 1971) is a retired Canadian racquetball player from Prince Albert, Saskatchewan. Powell was Canadian Champion five times: thrice in singles and twice in doubles. She was forced to retire from competition in 2006 due to a right knee injury.\nQuestion: Lori-Jane Powell retired from racquetball thrice. True, False, or Neither? False\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent.\nQuestion: Doomsday Device was a popular term. True, False, or Neither? Neither\n###\nA sugar-baker was the owner of a sugar house, a factory for the refining of raw sugar from the Barbados. Sugar refining would normally be combined with sugar trading, which was a lucrative business. The architectural historian Kerry Downes gives an example of one sugar baker's house in Liverpool being estimated to bring in \u00a340,000 a year in trade from the Barbados.\nQuestion: Sugar refining is a lucrative business True, False, or Neither?", "doc_id": 882, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41007, 956, 23903, 33545], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Antonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\".\nQuestion: Antonio Lewis is an American rapper from Brooklyn who has directed a music video and also produces all of his music. True, False, or Neither? 
Neither\n###\nMarvin Ivan \"Buck\" Barrow (March 14, 1903 \u2013 July 29, 1933) was a member of the Barrow Gang. He was the older brother of the gang's leader, Clyde Barrow. He and his wife Blanche were wounded in a gun battle with police four months after they joined up with Bonnie and Clyde. Marvin died of his wounds.\nQuestion: marvin was very loyal to the gang True, False, or Neither? Neither\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television.\nQuestion: Sebastian Philip Bierk was born before April 3, 1969. True, False, or Neither? True\n###\nInnyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169.\nQuestion: Innyaly is located 336 kms from Lensk. True, False, or Neither? True\n###\nYngwie Johan Malmsteen ( ; born Lars Johan Yngve Lannerb\u00e4ck; 30 June 1963) is a Swedish guitarist and bandleader. Malmsteen first became known in the 1980s for his neoclassical metal playing style in heavy metal. In 2009, \"Time\" magazine rated Malmsteen as among the 10 greatest electric guitar players of all time.\nQuestion: Yngwie Johan Malmsteen is currently living True, False, or Neither?", "doc_id": 382, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17934, 17314, 33725, 19583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "FS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani.\nQuestion: FS Kozani players live in Greece all year round True, False, or Neither? Neither\n###\nHabib (Habibollah) Elghanian (Persian: \u062d\u0628\u06cc\u0628 (\u062d\u0628\u06cc\u0628\u200c\u0627\u0644\u0644\u0647) \u0627\u0644\u0642\u0627\u0646\u06cc\u0627\u0646\u200e \u200e , 5 April 1912 \u2013 9 May 1979) was a prominent Iranian Jewish businessman and philanthropist who served as the president of the Tehran Jewish Society and acted as the symbolic head of the Iranian Jewish community in the 1970s.\nQuestion: Israel will put up a stature of Habib in 2020 for his achievements True, False, or Neither? Neither\n###\nWenham Parva is a village and a civil parish in Suffolk, England. It covers the village of Little Wenham (whose ancient name it takes) and the hamlet of Wenham Grange. 
Located in Babergh district, it had a population of 20 in 2005, making it the joint-least populated parish in Suffolk alongside South Cove, Wangford and Wordwell. At the 2011 Census the population had increased to 185.\nQuestion: In 2005, Suffolk had more than one least populated parish, True, False, or Neither? True\n###\nLate Spring () is a 2014 South Korean romance melodrama starring Park Yong-woo, Kim Seo-hyung and Lee Yoo-young. It portrays the true beauty and the platonic love discovered between a genius sculptor and his final model. It made its world premiere at the Santa Barbara International Film Festival in January 2014.\nQuestion: The film is a drama/ True, False, or Neither? True\n###\nMikhail Nikolayevich Baryshnikov (Russian: \u041c\u0438\u0445\u0430\u0438\u0301\u043b \u041d\u0438\u043a\u043e\u043b\u0430\u0301\u0435\u0432\u0438\u0447 \u0411\u0430\u0440\u044b\u0301\u0448\u043d\u0438\u043a\u043e\u0432 , Latvian: \"Mihails Bari\u0161\u0146ikovs\" ; born January 27, 1948), nicknamed \"Misha\" (Russian diminutive of the name \"Mikhail\"), is a Soviet and American dancer, choreographer, and actor.\nQuestion: Mikhail was also nicknamed MickaGho True, False, or Neither?", "doc_id": 643, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6911, 9623, 20452, 43323], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Johnson College Prep is a public four-year charter high school located in the Englewood neighborhood on the south side of Chicago, Illinois, United States. It is a part of the Noble Network of Charter Schools. The school is named for African-American businessman and founder of the Chicago-based Johnson Publishing Company John H. Johnson and his wife Eunice Johnson.\nQuestion: Johnson College Prep was named after a African American publishing company owner. True, False, or Neither? True\n###\nTom Clancy's Splinter Cell is a 2002 stealth video game developed by Ubi Soft Montreal and built on the Unreal Engine 2. It is the first \"Splinter Cell\" game in the series. Endorsed by author Tom Clancy, it follows the activities of NSA black ops agent Sam Fisher. The character of Fisher is voiced by actor Michael Ironside.\nQuestion: The character of Fisher is voiced by Tom Clancy. True, False, or Neither? False\n###\nJeffrey Orlando Hunter (born April 12, 1966) is a former American football defensive lineman. In a career lasting almost a decade, he played five seasons for four different teams in the National Football League, as well as in the Canadian Football League and the World League of American Football. Hunter played college football at Albany State University in Albany, Georgia.\nQuestion: Jeffrey Orlando Hunter entered the Canadian Football League after leaving the National Football League True, False, or Neither? Neither\n###\nThe Linkou Power Plant () is a coal-fired power plant in Linkou District, New Taipei, Taiwan. With the previous total installed capacity of 600 MW, the power plant used to be the smallest coal-fired power plant in Taiwan. 
The power plant is currently undergoing retrofitting to increase its installed generation capacity to 2.4 GW.\nQuestion: The power plant's capacity is being quadrupled. True, False, or Neither? True\n###\nGeorge Edward Foreman (born January 10, 1949) is an American former professional boxer who competed from 1969 to 1977, and from 1987 to 1997. Nicknamed \"Big George\", he is a two-time world heavyweight champion and an Olympic gold medalist. Outside the sport he is an ordained minister, author, and entrepreneur.\nQuestion: George Foreman returned to boxing after a 10 year hiatus True, False, or Neither?", "doc_id": 495, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35963, 44894, 31098, 33145], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode.\nQuestion: David Anders no longer features in the series. True, False, or Neither? Neither\n###\nJohn M. W. Moorlach (born December 21, 1955 in the Netherlands) is a Republican California State Senator representing 37th Senate district, which includes portions of Orange County, since March 22, 2015. He previously served on the Orange County Board of Supervisors from December 5, 2006 \u2013 January 5, 2015 and as Orange County Treasurer-Tax Collector from March 17, 1995 \u2013 December 5, 2006.\nQuestion: John M. W. Moorlach lost every election. True, False, or Neither? False\n###\nRelient K is the debut studio album by American rock band Relient K. Many of the tracks are newer versions of those found on their 1998 demo \"All Work & No Play\". Typical of early Relient K albums, the lyrics use pop culture references for teaching and to illustrate Biblical principles. As of late 2006/early 2007, this album has sold around 400,000 copies.\nQuestion: It would be impossible to find pop culture references embedded in the lyrics of Relient K tracks. True, False, or Neither? False\n###\nHaliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015.\nQuestion: Dantoro was from africa True, False, or Neither? True\n###\nHenry Nelson Pope (April 23, 1859 - June 13, 1956) was president of the Texas Farmers Union and president of the Association of State Presidents of the Farmers' Education and Cooperative Union of America, and president of the American Federation of Organized Producers and Consumers.\nQuestion: Henry Nelson Pope is not currently living. 
True, False, or Neither?", "doc_id": 844, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19160, 2736, 41344, 33168], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011.\nQuestion: Goldstein was older than Huffman in 2010. True, False, or Neither? Neither\n###\nSarah Beth Noriega (born April 24, 1976) is a former indoor volleyball player. She played for Loyola Marymount University from 1994 to 1997 and was named the 1997 West Coast Conference Player of the Year. She also played for the United States national team at the 2000 Summer Olympics.\nQuestion: sarah win golden medal in the 2000 summer olympics True, False, or Neither? Neither\n###\nMaria Ho (born March 6, 1983 in Taipei, Taiwan) is a Taiwanese American poker player, television personality and host. She is known as one of the top ranked female poker players in the world; a 3-time Bluff Reader's Choice Awards nominee for Favorite Female Poker Player and a World Series of Poker record-breaker, and for competing on the 15th season of \"The Amazing Race\".\nQuestion: Maria Ho can read faces. True, False, or Neither? Neither\n###\nThe 2004 IIFA Awards, officially known as the 5th International Indian Film Academy Awards ceremony, presented by the International Indian Film Academy honoured the best films of 2003 and took place between May 20\u201322, 2004. This year, the city of Singapore played host to the Indian Film Industry. The tag line of this year's IIFA Awards was \"Uniquely IIFA, Uniquely Singapore ...\".\nQuestion: The ceremony took place for approximately 24 hours True, False, or Neither? True\n###\nThe Friant-Kern Canal is a 152 mi Central Valley Project aqueduct managed by the United States Bureau of Reclamation in Central California to convey water to augment irrigation capacity in Fresno, Tulare, and Kern counties. Construction began in 1949 and the canal was completed in 1951, at a cost of $60.8 million.\nQuestion: The Friant-Kern Canal is more than 200 kilometers long. True, False, or Neither?", "doc_id": 531, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11614, 28, 61, 29363], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Eglinton Castle estate was situated at Irvine, on the outskirts of Kilwinning, North Ayrshire, Scotland (map reference NS 3227 42200) in the former district of Cunninghame. Eglinton Castle, was once home to the Montgomeries, Earls of Eglinton and chiefs of the Clan Montgomery. Eglinton Country Park now occupies part of the site.\nQuestion: Eglinton Castle was once home to three Earls. True, False, or Neither? Neither\n###\nCherry Tomato () is a 2008 South Korean film starring Shin Goo and Kim Hyang-gi. The family drama, a directorial debut by Jung Young-bae, depicts the poverty-stricken life of an old man and his granddaughter that evokes a strong sense of sympathy and helplessness. It was screened at the Busan Children\u2019s Film Festival in 2008.\nQuestion: Jung Young-bae directed one film in 2008 True, False, or Neither? Neither\n###\nThirteen Ghosts (also known as 13 Ghosts and stylized as THIR13EN Ghosts) is a 2001 Canadian-American supernatural horror film directed by Steve Beck. It is a remake of the 1960 film \"13 Ghosts\" by William Castle. It follows the remake of another one of Castle's films, \"House on Haunted Hill\", and was shot entirely around Lower Mainland, British Columbia.\nQuestion: Thirteen Ghosts has had more than one film adaptation True, False, or Neither? True\n###\n\"It's the Little Things\" is a 1967 single by Sonny James. \"It's the Little Things\" was Sonny James' twenty-fifth release on the country chart, the song went to number one on the country chart for five weeks and spent a total of fourteen weeks on the charts.\nQuestion: Only women liked the song True, False, or Neither? Neither\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. The animation is based on the award-winning children's picture book by Peter Collington.\nQuestion: The Angel and the Soldier Boy was created within the past 30 years True, False, or Neither?", "doc_id": 56, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21707, 1070, 26433, 40731], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Misty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue.\nQuestion: Misty Knight sold billions. True, False, or Neither? Neither\n###\nOverBlood is a science fiction video game developed by Riverhillsoft and published by Electronic Arts for the PlayStation. 
It is considered one of the first survival horror games to make use of a fully three-dimensional virtual environment, second only to Riverhillsoft's own \"Doctor Hauzer\", a fully 3D survival horror game released for the 3DO in 1994.\nQuestion: OverBlood was a very popular game. True, False, or Neither? Neither\n###\nThe Master of Revenge () is a 2016 South Korean television series starring Chun Jung-myung, Jo Jae-hyun, Jeong Yoo-mi, Lee Sang-yeob and Gong Seung-yeon. It aired on KBS2 from April 27, 2016 to June 30, 2016 on Wednesdays and Thursdays at 21:55 for 20 episodes.\nQuestion: Master of Revenge first aired in April 2016 True, False, or Neither? True\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology.\nQuestion: Pavel Sergeyevich Alexandrox made significant contributions to set theory and topology by writing three hundred papers. True, False, or Neither? True\n###\nSoul Ballet is a musical project of actor, producer, arranger, programmer, and multi-instrumentalist Rick Kelly \"RK.\" Soul Ballet\u2019s music is smooth contemporary jazz/electronica, characterized as pulsating electronic beats entwined with a dark, moody atmosphere.\nQuestion: Soul Ballet works with Rick Kelly \"KR\" True, False, or Neither?", "doc_id": 598, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31133, 28500, 38534, 38414], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund.\nQuestion: University of Maryland Eastern Shore helps a lot of students with financial aid True, False, or Neither? Neither\n###\nWalcha Shire is a local government area located in the New England region of New South Wales, Australia. The Shire is situated adjacent to the junction of the Oxley Highway and Thunderbolts Way and is 20 km east of the Main North railway line passing through Walcha Road.\nQuestion: Oxley Highway and Thunderbolts Way is east of the Main North railway line. True, False, or Neither? True\n###\nChristopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio.\nQuestion: Christopher is a blues musician. True, False, or Neither? 
False\n###\nBouck's Island is a farm near Breakabeen, New York within the town of Fulton, Schoharie County, New York near Fultonham, New York. Bouck's Island was the home of former New York governor William C. Bouck. Congressman Joseph Bouck was born on Bouck's Island and Wisconsin Congressman Gabriel Bouck once lived there.\nQuestion: Congressman Joseph Bouck was born in Breakabeen, New York. True, False, or Neither? False\n###\n\"The DArkest Knight\" is tenth episode of the seventh season of the American mystery\u2013thriller television series \"Pretty Little Liars\". The installment was directed by Arlene Sanford and written by showrunner I. Marlene King and executive producer Maya Goldsmith. It premiered on August 23, 2016, on the cable network Freeform.\nQuestion: Season seven had ten episodes before The Darkest Knight. True, False, or Neither?", "doc_id": 148, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29722, 10927, 17997, 39526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bela George Lugosi (born January 5, 1938 in Los Angeles, California), also known as Bela Lugosi Jr., is an American attorney and the son of actor B\u00e9la Lugosi. His legal actions in \"Lugosi v. Universal Pictures\" led to the creation of the California Celebrities Rights Act.\nQuestion: Bela was involved in multiple lawsuits. True, False, or Neither? Neither\n###\nRise of the Dragon is a graphic adventure game released in 1990 for DOS and Macintosh, and later remade for the Sega CD (1993) as well as the Amiga. It was one of the few adventure game titles developed by Dynamix, a company that was better known as an action and flight simulator game developer.\nQuestion: The genre of Rise of the Dragon is not the most familiar type for Dynamix. True, False, or Neither? True\n###\nGun Bow (1960 \u2013 December 1979) was an American Thoroughbred racehorse. He was one of America's leading older male racehorses in 1964 and 1965 and was later inducted into the Hall of Fame. Gun Bow was noted for his rivalry with five-time American Horse of the Year Kelso.\nQuestion: Gun Bow was not a very fast dog. True, False, or Neither? False\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\nQuestion: All of the members of Norwegian black metal band Emperor came back for their reunion in 2005. True, False, or Neither? Neither\n###\n\"Hang On\" is a song by the band Weezer. The song impacted radio on February 15, 2011. It is the seventh track and second single from their eighth studio album, \"Hurley\". The album version of \"Hang On\" is co-written by Rick Nowels and features Canadian actor Michael Cera on backing vocals and pseudo-mandolin. 
The single version features no mandolin, and contains a harder sound.\nQuestion: If you listen carefully you can hear Michael Cera in on Weezers song True, False, or Neither?", "doc_id": 483, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40669, 7562, 33839, 31825], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Club Deportivo Aguilar is a football team based in Aguilar de Campoo in the autonomous community of Castile and Le\u00f3n. Founded in 1947, it plays in the Primera Provincial. Its stadium is \"Ciudad Deportiva Alberto Fern\u00e1ndez\" with a capacity of 6,000 seats.\nQuestion: Aguilar's is located in Spain (Leon) True, False, or Neither? Neither\n###\nDaraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\".\nQuestion: Daraar exceeded expectations at the box office and became a hit. True, False, or Neither? False\n###\nThe William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places.\nQuestion: Sarasota, Florida has many places on the National Registry True, False, or Neither? Neither\n###\nBianca Gascoigne (born 28 October 1986) is a British glamour model and television personality. She is the daughter of Sheryl Gascoigne, and adopted daughter of Paul Gascoigne, a former footballer. She has a brother Mason and a half-brother Regan Gascoigne. She came sixth in the nineteenth series of Channel 5 reality show \"Celebrity Big Brother\".\nQuestion: Bianca Gascoigne is 25 years old today True, False, or Neither? False\n###\nBaker College Preparatory High School (also known as Baker College Prep) is a public four-year charter high school located in the South Chicago neighborhood on the far south side of Chicago, Illinois. It is operated by the Noble Network of Charter Schools. It shares its campus with Bowen High School. Baker is named for civil and human rights activist Ella Baker.\nQuestion: The school is going to be built in 2028 True, False, or Neither?", "doc_id": 932, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10762, 38617, 16116, 41960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Introduction to Finality\" is the 22nd episode of the third season of the American television series \"Community\" and the third season finale. It originally aired on May 17, 2012 on NBC. This was the last episode to air with series creator Dan Harmon as showrunner before he was fired, though Harmon would later return as showrunner for the 5th season.\nQuestion: \"Introduction to Finality\" is the third episode of Community True, False, or Neither? False\n###\nHistory of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: Lucy Mack Smith was familiar with her son's life and activities True, False, or Neither? True\n###\nJulian William Kennedy Burnside AO QC (born 9 June 1949) is an Australian barrister, human rights and refugee advocate, and author. He practises principally in commercial litigation, trade practices and administrative law. He is known for his staunch opposition to the mandatory detention of asylum seekers, and has provided legal counsel in a wide variety of high-profile cases.\nQuestion: When added together, the numerals in the year Burnside was born equal 23. True, False, or Neither? True\n###\nPedro Nuno Gon\u00e7alves Concei\u00e7\u00e3o Silva (born January 21, 1977) is a Portuguese former swimmer, who specialized in sprint freestyle events. He is a two-time Olympian (2000 and 2004) and a former Portuguese record holder in the 50 m freestyle (22.86). Silva is a resident athlete for Sport Alg\u00e9s e Dafundo, and is trained by his long-time coach, director, and mentor M\u00e1rio Madeira.\nQuestion: Pedro was by his long time coach and mentor M\u00e1rio Madeira True, False, or Neither? True\n###\nThe Tasmanian Legislative Council is the upper house of the Parliament of Tasmania in Australia. It is one of the two chambers of the Parliament, the other being the House of Assembly. Both houses sit in Parliament House in the state capital, Hobart. Members of the Legislative Council are often referred to as MLCs.\nQuestion: Tasmania is the state capital. True, False, or Neither?", "doc_id": 432, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10677, 32441, 7627, 8114], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher David Mole (born 16 March 1958) is a British Labour Party politician, who was the Member of Parliament (MP) for Ipswich from a by-election in 2001, after the death of Jamie Cann, and was re-elected in 2005. 
He was Parliamentary Under Secretary of State at the Department for Transport, until his defeat in the 2010 general election by Ben Gummer, son of former MP John Gummer.\nQuestion: John Gummer has a kid True, False, or Neither? True\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi.\nQuestion: The center is named off a person who's name started with a J True, False, or Neither? True\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144.\nQuestion: Keystone is the largest unincorporated community in central Keith County, Nebraska with a post office. True, False, or Neither? Neither\n###\nThe Ghost and Mrs. Muir (1947) is a romantic-fantasy film starring Gene Tierney and Rex Harrison. It was directed by Joseph L. Mankiewicz, and is based on a 1945 novel written by Josephine Leslie under the pseudonym of R. A. Dick. In 1945, 20th Century Fox bought the film rights to the novel, which had been published only in the United Kingdom at that time. It was shot entirely in California.\nQuestion: Josephine Leslie wrote the novel that inspired the movie The Ghost and Mrs. Muir True, False, or Neither? True\n###\nThe Empire Icon Award is an honorary Empire Award presented by the British film magazine \"Empire\". The Empire Icon Award was first introduced at the 11th Empire Awards ceremony in 2006 with Brian Cox receiving the award. The award was absent from the 12th, 17th and 18th Empire Awards ceremonies. Hugh Jackman is the most recent winner in this category.\nQuestion: Brian Cox was honored at the 18th Empire Awards ceremony True, False, or Neither?", "doc_id": 732, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14930, 11801, 20148, 583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hubertus \"Huib\" Wilton (12 March 1921 in Rotterdam \u2013 29 October 1959) was a Dutch tennis player. He was on the 1953 Netherlands Davis Cup team which also included among others Hans van Swol (his partner in the men's doubles), Boebi van Meegeren and Ivo Rinkel. In 1950 Wilton reached the second round at Wimbledon, losing to Henry Billington of Great Britain 6\u20131 7\u20135 11\u20139.\nQuestion: Billington was also on the 1952 Davis Cup team. True, False, or Neither? Neither\n###\nLA1:TV (strictly the Lancaster University Student Television Station, often simply LA1) is a non-profit student television station at Lancaster University. It is a constitutional part of the Lancaster University Students' Union (LUSU) but is run as an independent student society. 
Some of LA1\u2019s current programmes include \"Good Morning Lancaster\" (GML), \"Sugar TV\", and \"Sound Booth\".\nQuestion: LA1:TV is for profit True, False, or Neither? False\n###\nBarbatodon is a mammal genus from the Upper Cretaceous period. It lived in Transylvania at the same time as some of the last dinosaurs and was a member of the extinct order of Multituberculata. It is within the suborder of Cimolodonta, and the family Kogaionidae. The genus \"Barbatodon\" was named by R\u00e3dulescu R. and Samson P. in 1986.\nQuestion: Barbatodon lived for millions of years. True, False, or Neither? Neither\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others.\nQuestion: Gay Sex in the 70s is one of the best documentaries ever. True, False, or Neither? Neither\n###\n\"(Baby) You Don't Have to Tell Me\" (often written \"You Don't Have to Tell Me\") is a song by New York songwriter Pete Antell (formerly of the American pop group The Chants) and first recorded bysinger Bobby Coleman. The obscure song was later recorded and released by the American pop group the Walker Brothers as their sixth single in 1966. The accompaniment was directed by Reg Guest.\nQuestion: The Walker Brothers released six singles before You Don't Have to Tell me. True, False, or Neither?", "doc_id": 173, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11194, 25432, 34668, 12376], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968.\nQuestion: New York Mets had a losing record because they lost their star players True, False, or Neither? Neither\n###\nThe Tsavo Man-Eaters were a pair of man-eating Tsavo lions responsible for the deaths of a number of construction workers on the Kenya-Uganda Railway from March through December 1898. The significance of this pair of lions was their unusual behavior, such as the number of men killed and the manner of the attacks.\nQuestion: The Tsavo Man-Eaters were killed multiple people. True, False, or Neither? True\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla.\nQuestion: The Feed icon is widely used by internet users True, False, or Neither? 
Neither\n###\nGhost Notes is the fifth full-length studio album by American rock band Veruca Salt, released on July 10, 2015, through El Camino Records. Produced by Brad Wood, who also produced the band's debut \"American Thighs\", it is the first to feature the band's original lineup since their second album, \"Eight Arms to Hold You\" (1997).\nQuestion: The members of Veruca Salt have remained the same since 1997. True, False, or Neither? False\n###\nK\u00e1roly P\u00e1ncz\u00e9l (born April 3, 1961) is a Hungarian teacher and politician, member of the National Assembly (MP) for R\u00e1ckeve (Pest County Constituency XIII) from 2002 to 2006 and from 2010 to 2014. He was also Member of Parliament from the Pest County Regional List of Fidesz between 1998\u20132002 and 2006\u20132010. He was elected MP for Dabas (Pest County Constituency XI) in 2014.\nQuestion: K\u00e1roly P\u00e1ncz\u00e9l was the first female member of the National Assembly (MP) for R\u00e1ckeve. True, False, or Neither?", "doc_id": 403, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23690, 23007, 40822, 16139], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India.\nQuestion: West Indies were mad at India. True, False, or Neither? Neither\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: The university has more men than women True, False, or Neither? Neither\n###\nThe 1821 Norfolk and Long Island Hurricane was one of four known tropical cyclones that have made landfall in New York City. Another, even more intense hurricane in pre-Columbian times (sometime between 1278 and 1438) left evidence that was detected in southern New Jersey by paleotempestological research. The third was the 1893 New York hurricane, and the fourth was Hurricane Irene in 2011.\nQuestion: There have been five other cyclones in New York City. True, False, or Neither? False\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Kant served as VP during spring of 2002. True, False, or Neither? True\n###\nA Daughter of the Wolf is a 1919 American silent drama film directed by Irvin Willat and written by Marion Fairfax and Hugh Pendexter. The film stars Lila Lee, Elliott Dexter, Clarence Geldart, Raymond Hatton, Richard Wayne, and Minnie Devereaux. 
The film was released on June 22, 1919, by Paramount Pictures.\nQuestion: A Daughter of the Wolf features acting True, False, or Neither?", "doc_id": 893, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40017, 28038, 44093, 29953], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Samson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation.\nQuestion: Samson and Delilah, Op. 47 is performed in a German translation. True, False, or Neither? True\n###\n\"We Really Shouldn't Be Doing This\" is a song written by Jim Lauderdale, and recorded by American country music artist George Strait. It was released in September 1998 as the third and final single from his album \"One Step at a Time\". It peaked at number 4 in the United States, and number 2 in Canada.\nQuestion: \"We Really Shouldn't Be Doing This\" was the last single from \"One Step at a Time\" True, False, or Neither? True\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg.\nQuestion: Snoop Dog appears on more than one song on the album Professional Rapper. True, False, or Neither? Neither\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017.\nQuestion: Murder of the Universe has been covered by black sabbath True, False, or Neither? Neither\n###\nDr. Jeckyll & Mr. Hyde was an American 1980s hip-hop group consisting of Andre \"Dr. Jeckyll\" Harrell and Alonzo \"Mr. Hyde\" Brown. The group was known for its corporate business image, wearing designer suits and ties while they rapped. The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records in 1980.\nQuestion: The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records more than 1980 years ago. True, False, or Neither?", "doc_id": 540, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18754, 22218, 37918, 41124], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Linyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: LYU has many students. True, False, or Neither? Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: The war created chaos in the surrounding communities. True, False, or Neither? Neither\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\".\nQuestion: \"When in Broad Daylight I Open my Eyes\" was a song written by Elton John. True, False, or Neither? False\n###\nThe Tesla Science Center at Wardenclyffe is a nonprofit organization established to develop a regional science and technology center at the site of Nikola Tesla's former Wardenclyffe laboratory on Long Island, New York. The center raised money through crowdfunding to purchase the property.\nQuestion: Nikola Tesla still resides at Wardenclyffe True, False, or Neither? False\n###\nUSNS \"Lone Jack\" (T-AO-161) was a type Type T2-SE-A1 tanker laid down under Maritime Commission contract (USMC number 1783) by the Sun Shipbuilding & Dry Dock Co. of Chester, Pennsylvania (hull number 450) on 11 July 1944. The ship was launched on 21 October 1944, sponsored by Mrs. Julia W. Bruwier, and delivered to Cities Service Oil Co. of New York City on 31 October 1944.\nQuestion: USNS \"Lone Jack\" was launched before the end of WW2 True, False, or Neither?", "doc_id": 934, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23266, 28007, 21330, 6233], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"I'd Be Lost\" and \"Only One\" are two songs recorded by Australian singer-songwriter Sarah Blasko for her fifth studio album \"Eternal Return\". 
Both songs premiered on 13 September 2015 during Richard Kingsmill's new music segment on Triple J and were released as a double A-side on 18 September 2015.\nQuestion: Sarah Blasko writes all of her own music. True, False, or Neither? Neither\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996.\nQuestion: Trainspotting is intended to by funny. True, False, or Neither? True\n###\nRecently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct.\nQuestion: There have been less than 80 recently extinct species. True, False, or Neither? False\n###\nHow Green Was My Valley is a BBC Television serial based on the novel by Richard Llewellyn, and features one of the last performances by Stanley Baker. It was first shown in the UK from 29 December 1975 in six weekly parts, while producer Martin Lisemore also cast Si\u00e2n Phillips in his next production, \"I Claudius\" (1976).\nQuestion: How Green Was My Valley was based on a BBC Television serial. True, False, or Neither? False\n###\nMan in a Hurry (French: \"L'Homme press\u00e9\" , Italian: \"L'ultimo giorno d'amore\" , released in UK as The Hurried Man) is a 1977 French-Italian drama film directed by \u00c9douard Molinaro and starring Alain Delon and Mireille Darc. It is based on the novel \"The Man in a Hurry\" by Paul Morand. It recorded admissions of 730,581 in France.\nQuestion: Man in a Hurry had over 1 million admissions in France. True, False, or Neither?", "doc_id": 112, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17764, 30147, 35248, 41959], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat.\nQuestion: Spanish Mastiff is a good guard dog True, False, or Neither? Neither\n###\nThe Reunion (also titled as The Reunion: Live At The Hyatt Regency 9.11.2010) is a live album released on January 11, 2015 by the Washington, D.C.-based go-go band Rare Essence. The album was recorded live at the Hyatt Regency in Crystal City, Virginia on September 11, 2010.\nQuestion: The Reunion is a four member band True, False, or Neither? Neither\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". 
In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\".\nQuestion: Lourdes Ver\u00f3nica Ar\u00e9valos Elias will play a role in the next Marvel film True, False, or Neither? Neither\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits.\nQuestion: This box set contains music by the band not commonly heard or released. True, False, or Neither? True\n###\nPeter L. N. Padfield (born 1932) is a British author, biographer, historian, and journalist who specializes in naval history and in the Second World War period. His early journalism appeared under the name P. L. N. Padfield. As well as his non-fiction work, he has also published four novels.\nQuestion: Peter Padfield is a European novelist. True, False, or Neither?", "doc_id": 120, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11222, 18738, 5948, 25654], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Peter Murray Kapetan was an American Broadway actor, singer and dancer notable for playing numerous roles during a thirty-year career. He was notable for performing in the musical \"The Wedding Singer\" as a Ronald Reagan impersonator. He appeared in \"Titanic\", \"Sunset Boulevard\", \"Joseph and the Amazing Technicolor Dreamcoat\", and \"Got Tu Go Disco\".\nQuestion: Peter Murray Kapetan appeared in a minimum of five different productions over his career. True, False, or Neither? True\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: LYU has affordable tuition. True, False, or Neither? Neither\n###\nThe Middlewich Folk and Boat Festival takes place in June in Middlewich, Cheshire, England. The festival builds on the town's industrial heritage in which canal boats were used to move coal and other raw materials in the town for the production of salt, and then move the salt out of town, either for use directly, or as a raw material in the manufacture of chemicals such as chlorine and soda ash.\nQuestion: The Middlewich Folk and Boat Festival is the only festival that Middlewich, Cheshire, England celebrates during the month of June. True, False, or Neither? Neither\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions.\nQuestion: Project Gasbuggy caused radiation poisoning. True, False, or Neither? Neither\n###\nSomething Like Human is the second album by the band Fuel released in 2000 on Epic Records. 
\"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date.\nQuestion: Something Like Human is an awesome album that was released in 2000, it reached nr. 2 on the U.S. Billboard Top 200 True, False, or Neither?", "doc_id": 198, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [132, 14924, 20056, 7833], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "I Love Hong Kong is a 2011 Hong Kong comedy film produced and directed by Eric Tsang. Film stars Tsang, Tony Leung Ka-fai, Sandra Ng and a star-studded cast of Hong Kong stars. It was released in Chinese New Year Day. The sequel movies are I Love Hong Kong 2012 and I Love Hong Kong 2013.\nQuestion: I Love Hong Kong has three sequels. True, False, or Neither? True\n###\nMIT Technology Review is a magazine published by the Massachusetts Institute of Technology. It was founded in 1899 as The Technology Review, and was re-launched without \"The\" in its name on April 23, 1998 under then publisher R. Bruce Journey. In September 2005, it underwent another transition under its then editor-in-chief and publisher, Jason Pontin, to a form resembling the historical magazine.\nQuestion: MIT Technology Review has been updated or re-branded twice. True, False, or Neither? True\n###\nStanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele.\nQuestion: Stanley Fredrick Steele scored 97 goals from 1955 until 1968. True, False, or Neither? True\n###\nThe Nariphon (Thai: \u0e19\u0e32\u0e23\u0e35\u0e1c\u0e25 ), also known as Makkaliphon (Thai: \u0e21\u0e31\u0e01\u0e01\u0e30\u0e25\u0e35\u0e1c\u0e25 , from Pali \"makkaliphala\"), is a tree in Buddhist mythology which bears fruit in the shape of young female creatures. The maidens grow attached by their head from the tree branches. This tree grows at the Himaphan, a mythical forest where the female fruits are enjoyed by the Gandharvas who cut the fruits and take them away.\nQuestion: The Nariphon is also called Makkaliphon True, False, or Neither? True\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation.\nQuestion: Weiner is not smart. True, False, or Neither?", "doc_id": 603, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34611, 1040, 14379, 8343], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015.\nQuestion: He is professional skateboarder who lived in his van on the west side of LA True, False, or Neither? True\n###\nThe Key is a 1958 British war film set in 1941 during the Battle of the Atlantic. It was based on the 1951 novel \"Stella\" by Jan de Hartog (later republished as \"The Distant Shore\" and \"The Key\") and was directed by Sir Carol Reed. William Holden, Sophia Loren and Trevor Howard starred in the production.\nQuestion: Stella was a novel that took place in 1941 about the Battle of the Atlantic. True, False, or Neither? True\n###\nRyman Auditorium (formerly Grand Ole Opry House and Union Gospel Tabernacle) is a 2,362-seat live performance venue, located at 116 5th Avenue North, in Nashville, Tennessee and is best known as the home of the \"Grand Ole Opry\" from 1943 to 1974. It is owned and operated by Ryman Hospitality Properties, Inc.\nQuestion: The Grand Ole Opry was moved to Memphis Tennessee after 1974 True, False, or Neither? Neither\n###\nZina Lynna Garrison (born November 16, 1963) is a former professional tennis player from the United States. During her career, she was a women's singles runner-up at Wimbledon in 1990, a three-time Grand Slam mixed doubles champion, and a women's doubles gold medalist and singles bronze medalist at the 1988 Olympic Games. She is currently coaching Taylor Townsend.\nQuestion: Garrison played doubles with Graf. True, False, or Neither? Neither\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017.\nQuestion: In 2017, the final tournament in the 2017 Overwatch World Cup will be held during the last week of November. True, False, or Neither?", "doc_id": 556, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17373, 20062, 15721, 34574], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012.\nQuestion: Anna Sun was the band's most profitable song True, False, or Neither? 
Neither\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant.\nQuestion: Gwyneth Paltrow and Kelly Preston know each other very well True, False, or Neither? Neither\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh.\nQuestion: TSS exports to America True, False, or Neither? Neither\n###\nFrederick Wiseman (born January 1, 1930) is an American filmmaker, documentarian, and theatre director. His work is \"devoted primarily to exploring American institutions\". He has been called \"one of the most important and original filmmakers working today\".\nQuestion: Frederick Wiseman was considered very easy to work with True, False, or Neither? Neither\n###\nSusan Lynch (born 5 June 1971) is a Northern Irish actress. A three-time IFTA Award winner, she also won the British Independent Film Award for Best Supporting Actress for the 2003 film, \"16 Years of Alcohol\". Her other film appearances include \"Waking Ned\" (1998), \"Nora\" (2000), \"Beautiful Creatures\" (2000), and \"From Hell\" (2001).\nQuestion: Susan Lynch acted in \"16 Years of Alcohol\" which was released in the 2010s. True, False, or Neither?", "doc_id": 809, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [43067, 37175, 44392, 16173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records.\nQuestion: The album was released in 2015 + 1 True, False, or Neither? True\n###\nDance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster.\nQuestion: Dance India Dance is shown on national television. True, False, or Neither? True\n###\nForest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier.\nQuestion: Timoon Animation was the sole company created by Philippe Mounier True, False, or Neither? Neither\n###\nDavid Thomas Bush (born November 9, 1979) is an American former professional baseball pitcher. 
He played in Major League Baseball (MLB) for the Toronto Blue Jays, Milwaukee Brewers, Texas Rangers, and Philadelphia Phillies. Bush also played for the SK Wyverns of the KBO League.\nQuestion: David Thomas Bush played in more than 3 different MLB teams True, False, or Neither? True\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910.\nQuestion: Yi Bangja died when she was 50 years old. True, False, or Neither?", "doc_id": 341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28714, 1437, 6384, 12146], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Englandsfarere (English: We Leave for England ) is a 1946 Norwegian war film directed by Toralf Sand\u00f8, starring Knut Wigert and J\u00f8rn Ording. The film follows the Norwegian resistance fighters Harald (Wigert) and Arild (Ording) in their flight from the Gestapo.\nQuestion: Knut Wigert was in Englandfarere film. True, False, or Neither? True\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006.\nQuestion: Tim Van Patten directed the entire sixth season of The Sopranos. True, False, or Neither? Neither\n###\nThe Love for Three Oranges, Op. 33, also known by its French language title L'amour des trois oranges (Russian: \u041b\u044e\u0431\u043e\u0432\u044c \u043a \u0442\u0440\u0451\u043c \u0430\u043f\u0435\u043b\u044c\u0441\u0438\u043d\u0430\u043c , \"Lyubov' k tryom apel'sinam\"), is a satirical opera by Sergei Prokofiev. Its French libretto was based on the Italian play \"L'amore delle tre melarance\" by Carlo Gozzi. The opera premiered at the Auditorium Theatre in Chicago, Illinois, on 30 December 1921.\nQuestion: The opera debuted two days before the new year. True, False, or Neither? True\n###\nJoseph Maurice Francis Connaughton (15 August 1918 \u2013 12 February 1944) was an English first-class cricketer active 1939 who played for Middlesex. He was born in Paddington. During World War II he was commissioned in the Royal Artillery. He was drowned off the Maldives after SS \"Khedive Ismail\" was torpedoed; and officially declared dead one year later.\nQuestion: Joseph Maurice Francis Connaughton is alive. True, False, or Neither? False\n###\nPLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. 
Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA).\nQuestion: PLU Crew never won a match True, False, or Neither?", "doc_id": 530, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4686, 22129, 23287, 8184], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brandon Hughes (born September 25, 1980), better known by his stage name 6 Tre G is an American hip hop recording artist, record producer, and CEO from Fayette, Alabama. He is also the founder and CEO of Mazerati Records. 6 Tre G has released many studio albums Don Mazerati, Boss Muzik, El Trapo and many more.\nQuestion: Brandon Hughes was born in 1999 True, False, or Neither? False\n###\nCarmen Lebbos (Arabic: \u0643\u0627\u0631\u0645\u0646 \u0644\u0628\u0651\u0633\u200e \u200e ; born 1963) is a Lebanese actress who has been working in film, television and the theater since 1981. She has been in several television series and movies including Ziad Doueiri\u2019s \"West Beyrouth\" and Josef Fares\u2019s \"Zozo\".\nQuestion: Carmen Lebbos is from a small town in Portugal. True, False, or Neither? False\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia.\nQuestion: He became a master of music at age 52 True, False, or Neither? Neither\n###\nThe Mayor of Youngstown is the chief executive of the government of the city of Youngstown, Ohio. The term of office for the mayor and members of Youngstown City Council is four years. Youngstown, Ohio has had a total of 50 recorded mayoral administrations, including the city's current mayor. Youngstown has traditionally been led by Democratic mayors.\nQuestion: There has been 50 recorded people that took the position of The Mayor of Youngstown True, False, or Neither? True\n###\nTango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center.\nQuestion: George Balanchine to Stravinsky's \"Tango\" made and premiered in 1982. True, False, or Neither?", "doc_id": 618, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37762, 194, 36545, 2051], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wuqiang County () is county of southeastern Hebei province, China. It is under the administration of Hengshui City, with a population of 210,000 residing in an area of 442 km2 . Both China National Highway 307 and G1811 Huanghua\u2013Shijiazhuang Expressway pass through the county.\nQuestion: Wuqiang County has a population of over 200 thousand True, False, or Neither? True\n###\n\"Paul Revere\" is a song by American hip hop group Beastie Boys, released as the third single from their debut album \"Licensed to Ill\" (1986). It was written by Adam Horovitz, Joseph Simmons, Darryl McDaniels, and Rick Rubin. It was produced by Rick Rubin and the Beastie Boys. The song tells a fictional story of how the Beastie Boys met.\nQuestion: \"Paul Revere\" took inspirations from Tupac and Biggie Smalls True, False, or Neither? Neither\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse.\nQuestion: Once Upon a Time premiered over 3 years ago True, False, or Neither? True\n###\nThe Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\".\nQuestion: Naughty Dog Developed the sequel to The Last of US True, False, or Neither? True\n###\nMegan Malia Leilani McClung (April 14, 1972\u2013December 6, 2006) was the first female United States Marine Corps officer killed in combat during the Iraq War. Major McClung was serving as a public affairs officer in Al Anbar Province, Iraq when she was killed.\nQuestion: McClung was born in the fourth month. True, False, or Neither?", "doc_id": 209, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21381, 33385, 22328, 35835], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue.\nQuestion: the drake hotel is a venue True, False, or Neither? True\n###\nAucuba chinensis is a shrub or small tree, native to southern China, Taiwan, Burma and northern Vietnam. Typically it grows to 6 meters tall, though it can be larger. 
The leaves are thick, dark green above and light green below, sometimes with teeth along the margins.\nQuestion: Aucuba chenensis has dark green leaves and are smooth along the edges. True, False, or Neither? False\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The man was not very well in his mental capabilities. True, False, or Neither? Neither\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997.\nQuestion: The song is more than 10 years old True, False, or Neither? True\n###\nA cardigan is a type of knitted garment that has an open front. Commonly cardigans have buttons: a garment that is tied is instead considered a robe. A more modern version of the garment has no buttons and hangs open by design. By contrast, a pullover does not open in front but must be \"pulled over\" the head to be worn. It may be machine- or hand-knitted.\nQuestion: A cardigan has buttons. True, False, or Neither?", "doc_id": 230, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11982, 1093, 9285, 44541], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Amboy Dukes were an American rock band formed in 1964 in Detroit, Michigan, best known for their one hit single \"Journey to the Center of the Mind\". The band's name comes from the title of a novel by Irving Shulman. In the UK the group's records were released under the name of The American Amboy Dukes because of the existence of a British group with the same name.\nQuestion: The Amboy Dukes has been to Russia. True, False, or Neither? Neither\n###\nDhanish Karthik (born 24 July 1989) is an Indian actor. He made his debut as Sanjeev Menon in the Malayalam film \"Ivide\" (2015) directed by Shyamaprasad. He recently finished filming for the Bollywood film Chef (2017 film) with Saif Ali Khan. The film, directed by Raja Krishna Menon, is slated to release in July 2017. This will be Karthik's debut in Bollywood.\nQuestion: Dhanish Karthik was born in the 70s. True, False, or Neither? False\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound.\nQuestion: The Bavarian Mountain Hound is going to be used for police work. True, False, or Neither? Neither\n###\nMasquerade (Russian: \u041c\u0430\u0441\u043a\u0430\u0440\u0430\u0434 ) is a verse play written in 1835 by the Russian Romantic writer Mikhail Lermontov. The four-act play, set in 1830's St. 
Petersburg aristocratic society, highlights the rebellious spirit and noble mind of the protagonist, Yevgeny Arbenin. It is often compared with Shakespeare's \"Othello\" in its essential story line.\nQuestion: Mikhail Lermontov was born in 1805. True, False, or Neither? Neither\n###\nTaki's Magazine, called \"Takimag\" for short, is an online magazine of politics and culture published by the Greek paleoconservative journalist and socialite Taki Theodoracopulos and edited by his daughter Mandolyna Theodoracopulos. Initially called Taki's Top Drawer, the site was redesigned and relaunched under its current title in March 2008 with a subsequent redesign in 2010.\nQuestion: Taki's Magazine is the long version of Takimag True, False, or Neither?", "doc_id": 33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7915, 36605, 23820, 11965], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Walkin' is the debut mini-album by South Korean singer Suran. It was released on June 2, 2017, by Million Market and distribuited by LOEN Entertainment. It consists of five songs, including \"Wine\" featuring rapper Changmo, previously released as a digital single, and the title track \"1+1=0\" featuring singer Dean.\nQuestion: Walkin' was released within the first six months of 2017 True, False, or Neither? True\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\".\nQuestion: Sheree Victoria Murphy is a not a very popular actress in the UK True, False, or Neither? Neither\n###\nGeoffrey Zakarian (born July 25, 1959) is an American Iron Chef, restaurateur, television personality and author. He is the executive chef of several restaurants in New York City, Atlantic City and Miami. He is featured on several television programs on the Food Network, including \"Chopped\" and in 2011, \"The Next Iron Chef\", where he won the right to join \"Iron Chef America\".\nQuestion: The executive chef of several restaurants in New York City, Zakarian won the right to join \"Iron Chef America\" in 2011 after performing well in \"Chopped.\" True, False, or Neither? Neither\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC.\nQuestion: SNOBOL was used by Bush. True, False, or Neither? Neither\n###\nLate Spring () is a 2014 South Korean romance melodrama starring Park Yong-woo, Kim Seo-hyung and Lee Yoo-young. It portrays the true beauty and the platonic love discovered between a genius sculptor and his final model. 
It made its world premiere at the Santa Barbara International Film Festival in January 2014.\nQuestion: The film is exactly about 5 years old True, False, or Neither?", "doc_id": 752, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10164, 449, 3479, 13048], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "First World problem is a slang term used to refer to issues in First World nations that are complained about only because of the absence of more pressing concerns. The term was added to the \"Oxford Dictionary Online\" in November 2012, and to the \"Macquarie Dictionary Online\" in December 2012.\nQuestion: Jerzy Skolimowski debuted his shorts at the Cannes Film Festival. True, False, or Neither? Neither\n###\n\"Pop That\" is a song by American rapper French Montana. Released as the first single from his debut studio album \"Excuse My French\" (2013), it features guest appearances from fellow rappers Rick Ross, Drake and Lil Wayne. The song's backing track was composed by Lee On the Beats, who have also helped to write the song along with the four rappers.\nQuestion: Pop That was released 16 years ago. True, False, or Neither? False\n###\nRonald Mayorga S\u00e1nchez (born June 26, 1984, Yumbo, Valle del Cauca, Colombia) is an award-winning Colombian journalist and TV anchor of \"Red\" in Caracol Television in Colombia. As a radio journalist who works with \"Blue Radio\" one of the radio station's imported from Latin America as a host in \"Vox Populi\".\nQuestion: Ronald Mayorga S\u00e1nchez won an award for journalism before 1980 True, False, or Neither? False\n###\nWireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues.\nQuestion: Ethereal ran into legal issues with its trademark so it was renamed. True, False, or Neither? True\n###\nMahalakshmi (Tamil: \u0bae\u0b95\u0bbe\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf ) is an 2017 Indian-Tamil Language Family soap opera starring Kavya Shastry, Vallab, Anjali Rav and Lokesh. It replaced Nijangal and it broadcast on Sun TV on Monday to Saturday from 6 March 2017 at 12:30PM (IST). It was produced by Vision Time India Pvt Ltd and directed by Shan Karthik and M.K.Arunthavaraja.\nQuestion: in 2017 Mahalakshmi was broadcast for the first time when it replaced Nijangal True, False, or Neither?", "doc_id": 64, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19942, 32593, 914, 8430], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Eli ben Yehudah ha Nazir ben Zechariah, Arabic Abu Kathir Yahya al Katib (\"Father of Kathir, Yahya the Scribe\" fl.Tiberias, 910s) was a grammarian and philologist of the Hebrew, Arabic and Aramaic languages. He may have been among the teachers in Tiberias of Saadia.\nQuestion: Eli ben Yehudah ha Nazir ben Zechariah was a philologist True, False, or Neither? True\n###\nGrenzschutzgruppe 9 (GSG 9) (English: Border Protection Group 9 ) is the elite Police Tactical Unit of the German Federal Police (German: \"Bundespolizei\" ). GSG 9 counterparts on the state level are the Special Deployment Commandos (German: \"Spezialeinsatzkommandos (SEK)\" ).\nQuestion: GSG 9 consists of an ethnically diverse team of police members True, False, or Neither? Neither\n###\nRevelation of the Last Hero is a 1992 Hong Kong \"wuxia\" romance television series produced by TVB and starring Aaron Kwok, Noel Leung, Ada Choi, , Frankie Lam and Bryan Leung. The theme song of the series, titled \"Breeze in the Frost\" (\u9727\u88e1\u6e05\u98a8) by was sung by Kwok.\nQuestion: Revelation of the Last Hero was shown outside of Hong Kong. True, False, or Neither? Neither\n###\nJohn Howe (October 14, 1754 \u2013 December 27, 1835) was a loyalist printer during the American Revolution, a printer and Postmaster in Halifax, the father of the famous Joseph Howe, a spy prior to the War of 1812, and eventually a Magistrate of the Colony of Nova Scotia. He was born in Boston, Massachusetts Bay colony, the son of Joseph Howe, a tin plate worker of Puritan ancestry, and Rebeccah Hart.\nQuestion: Joseph Howe was of Native American ancestry. True, False, or Neither? False\n###\nNabokov's Congeries was a collection of work by Vladimir Nabokov published in 1968 and reprinted in 1971 as \"The Portable Nabokov\". Because Nabokov supervised its production less than a decade before he died, it is useful in attempting to identify which works Nabokov considered to be his best, especially among his short stories.\nQuestion: Vladimir Nabokov published his \"best-of\" collection in the late 60s. True, False, or Neither?", "doc_id": 740, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38891, 24524, 10268, 5192], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017.\nQuestion: The Overwatch World Cup 2017 is an \"Overwatch\" eCook tournament True, False, or Neither? False\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. 
They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Due to its large rhubarb-like leaves during the growing season it is mistaken for Rhubarb plant True, False, or Neither? Neither\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf.\nQuestion: The Auspicious Crane was part of the reason that the United States joined the Pacific War in World War 2. True, False, or Neither? True\n###\nWayne Coles-Janess is an Australian producer, writer and director of drama and documentary film and TV programs. Based in Melbourne, Australia, he has produced documentaries about frontier places in the country. He has also made some documentaries in several international locations, including during times of war.\nQuestion: wayne cole-janess is an american banker. True, False, or Neither? False\n###\nNiels Bagge Hansen better known by the stage name Vinnie Who (born on 1 November 1987) is a Danish indie pop and disco singer and songwriter who released two albums, whose the debut \"Then I Met You\" in 2010 and \"Midnight Special\" and is signed to EMI Denmark. An androgynous male singer, he sings in a distinctive high-pitched feminine voice.\nQuestion: Vinnie Who is from Denmark and sings in a feminine voice because he is androgynous. True, False, or Neither?", "doc_id": 77, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13688, 3390, 560, 706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A startup company (startup or start-up) is an entrepreneurial venture which is typically a newly emerged, fast-growing business that aims to meet a marketplace need by developing a viable business model around an innovative product, service, process or a platform. A startup is usually a company designed to effectively develop and validate a scalable business model.\nQuestion: Startup companies do not develop business models. True, False, or Neither? False\n###\nGame Plan was a pinball manufacturer that produced pinball tables from 1978 to 1985. Game Plan was a subsidiary of AES Technology Systems and was located in Elk Grove Village, Illinois. Game Plan's president was former Chicago Coin table designer Wendell McAdams.\nQuestion: Game Plan had a lot of employees True, False, or Neither? Neither\n###\nSlam Creepers were a rock band from Vansbro, Sweden formed by Bj\u00f6rn Skifs in 1962 and broke up in 1969. Bj\u00f6rn Skifs then went on a solo career. They had some moderate hits including a cover of a Foundations song \"We Are Happy People\". Skifs would later find success with a number 1 hit \"Hooked on a Feeling as a member of Blue Swede.\nQuestion: Slam Creepers broke up in 1968. True, False, or Neither? 
False\n###\nCynthia Mort (born June 18, 1956) is an American director, screenwriter, and producer. Mort has worked primarily in television since beginning her career in 1994, writing for the sitcom \"Roseanne\". Her notable works include the HBO series \"Tell Me You Love Me\" as a creator and executive producer, the revenge film \"The Brave One\" (2007) as a screenwriter, and the biopic \"Nina\" (2016) as a director.\nQuestion: Mort dabbled a bit in television True, False, or Neither? False\n###\nRegent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day.\nQuestion: The government organizes independent power producers for Reagent Power company. True, False, or Neither?", "doc_id": 390, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35052, 41406, 44696, 42822], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Budapest Gypsy Symphony Orchestra is a Hungarian symphony orchestra of Romani (Gypsy) musicians. It emphasizes works by composers inspired by Hungarian folk music including Johannes Brahms, Vittorio Monti, Piotr Tcha\u00efkovski, Johann Strauss and Johann Strauss II. The orchestra has been performing for\nQuestion: Budapest Gypsy Symphony Orchestra is a popular orchestra worldwide True, False, or Neither? Neither\n###\nThe 2009 British Speedway Championship was the 2009 edition of the British Speedway Championship. The Final took place on 20 May at Wimborne Road in Poole, England. The Championship was won by Chris Harris, who beat Edward Kennett, Tai Woffinden and Lee Richardson in the final heat. It was the second time Harris had won the title.\nQuestion: The tournament took place in Poole. True, False, or Neither? True\n###\nListennn... the Album is the debut studio album by American hip hop disc jockey DJ Khaled. It was released on June 6, 2006. by Terror Squad Entertainment and Koch Records. The album features guest appearances from Young Jeezy, Bun B, Birdman, Juelz Santana, Slim Thug, Krayzie Bone, Chamillionaire, Trina, Twista, Freeway, Jadakiss, Beanie Sigel, Styles P and Lil Scrappy, among others.\nQuestion: The album was released the year after 2004. True, False, or Neither? False\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Few players are ambidextrous and Adriano plays as a defender or midfielder with them. True, False, or Neither? True\n###\nThe 2013 MBC Drama Awards () is a ceremony honoring the outstanding achievement in television on the Munhwa Broadcasting Corporation (MBC) network for the year of 2013. 
It was held on December 30, 2013 and hosted by actor Lee Seung-gi and actress Han Ji-hye.\nQuestion: Munhwa Broadcasting Corporation produces movies exclusively in Asia. True, False, or Neither?", "doc_id": 984, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38765, 18243, 15317, 12736], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Johnny Kidd (born Frederick Albert Heath, 23 December 1935 \u2013 7 October 1966) was an English singer and songwriter, best remembered as the lead vocalist for the rock and roll band Johnny Kidd & the Pirates. He was one of the few pre-Beatles British rockers to achieve worldwide fame, mainly for his 1960 hit, \"Shakin' All Over\".\nQuestion: Johnny Kidd made millions of dollars during his career. True, False, or Neither? Neither\n###\nThe Girl from Jones Beach is a 1949 American comedy film directed by Peter Godfrey and written by I. A. L. Diamond. The film stars Ronald Reagan, Virginia Mayo, Eddie Bracken, Dona Drake, Henry Travers and Lois Wilson. The film was released by Warner Bros. on July 16, 1949.\nQuestion: The Girl from Jones Beach is a film from the 20th century True, False, or Neither? True\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\".\nQuestion: Vishwasrao's was the most popular actor in 2011. True, False, or Neither? Neither\n###\nThe 2015 J&T Banka Prague Open was a professional tennis tournaments played on outdoor clay courts. It was the 6th edition of the tournament which was an International tournament on the 2015 WTA Tour. It took place at the Sparta Prague Tennis Club in Prague, Czech Republic, from 27 April to 2 May 2015. This was the event's first edition as a WTA International tournament.\nQuestion: The tournament concluded in May True, False, or Neither? True\n###\nABS is the Australian Broadcasting Corporation's television station in Adelaide, South Australia. It began broadcasting on 11 March 1960 from studios in the suburb of Collinswood. The station's transmitter is located at Mount Lofty, and is one of a series of relay transmitters located throughout the state.\nQuestion: The station's transmitter has been used for over 50 years True, False, or Neither?", "doc_id": 616, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26502, 10382, 22163, 10619], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single.\nQuestion: \"Come Back in One Piece\" was a major urban contemporary hit. True, False, or Neither? False\n###\nA semi-automatic pistol is a type of pistol that is semiautomatic, meaning it uses the energy of the fired cartridge to cycle the action of the firearm and advance the next available cartridge into position for firing. One cartridge is fired each time the trigger of a semi-automatic pistol is pulled; the pistol's \"disconnector\" ensures this behavior.\nQuestion: A semi-automatic pistol is becoming more popular lately because of its use in movies True, False, or Neither? Neither\n###\nThe Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships.\nQuestion: The Cashman Center only operates in Las Vegas. True, False, or Neither? Neither\n###\nThe 35th Annual GMA Music Awards (the show had a name change in 2004-05) were held on April 28, 2004 recognizing accomplishments of musicians for the year 2003. The show was held at the Municipal Auditorium in Nashville, Tennessee, and was hosted by Deion Sanders and Yolanda Adams.\nQuestion: In Nashville, Tennessee the GMA Music Awards were held for the thirty-fifth time in 2004. True, False, or Neither? True\n###\nHillcrest School District is a school district based in Strawberry, Arkansas, United States. The district encompasses 266.08 mi2 of land in Lawrence, Independence, and Sharp counties and serves portions of Strawberry, Evening Shade, Poughkeepsie, Smithville, Williford, Cave City, Ravenden, Black Rock, Imboden, Saffell, Lynn, and Powhatan.\nQuestion: Hillcrest School District has a small geographical area True, False, or Neither?", "doc_id": 527, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21654, 13128, 42267, 7804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Goodbye on a Bad Day\" is a debut song co-written and recorded by American country music artist Shannon Lawson. It was released in February 2002 as the first single from the album \"Chase the Sun\". The song reached #28 on the \"Billboard\" Hot Country Singles & Tracks chart. 
The song was written by Lawson and Mark A Peters.\nQuestion: Goodbye on a Bad Day was covered by Bill Joe. True, False, or Neither? Neither\n###\nThe New York Blade was a free weekly newspaper focusing on lesbian, gay, bisexual and transgender (LGBT) issues in New York City, New York. The \"Blade\" was a member of the National Gay Newspaper Guild, and contained news, entertainment, classified ads, and free personals for men and women.\nQuestion: The New York Blade was published weekly True, False, or Neither? True\n###\nClearance Giddens is an African American Elvis impersonator from Melfa, Virginia, who has been billed as the \"Black Elvis\". He has appeared on the \"The Arsenio Hall Show\" and the \"Geraldo Show\", and in the film \"Honeymoon in Vegas\". In the early 1990s, he also sang on stage in a duet with Jimmy Buffett singing \"Jailhouse Rock\". He is listed in the book \"I Am Elvis: A Guide to Elvis Impersonators\".\nQuestion: I just learned who Clearance Giddens is. True, False, or Neither? Neither\n###\nHudson Valley Community College, a SUNY associated two-year college, is located in Troy in Rensselaer County, New York. Although about eighty percent of the students are from the local area, the remainder are from other parts of New York, other states and from some 30 countries around the world.\nQuestion: About eighty percent of students at Hudson Valley Community College live within walking distance from the campus True, False, or Neither? Neither\n###\nRecorrupted is a limited edition EP by Whitechapel that was released on November 8, 2011 through Metal Blade Records. It consists of one original song, two of their previously released songs remixed (\"This Is Exile\" and \"Breeding Violence\"), an acoustic version of \"End of Flesh\" and a cover of the Pantera song \"Strength Beyond Strength\".\nQuestion: Recorrupted consists of 5 songs. True, False, or Neither?", "doc_id": 979, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19990, 6387, 16479, 26960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013.\nQuestion: Brandon Tyler McManus is over 4 years old True, False, or Neither? True\n###\nThe Magic Roundabout in Swindon, England, was constructed in 1972 and consists of five mini-roundabouts arranged around a sixth central, anticlockwise roundabout. Located near the County Ground, home of Swindon Town F.C., its name comes from the popular children's television series \"The Magic Roundabout\". In 2009 it was voted the fourth scariest junction in Britain, in a poll by Britannia Rescue.\nQuestion: The popular children's television series The Magic Roundabout started in 1972. True, False, or Neither? Neither\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. 
It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister.\nQuestion: The tale was based on a true story. True, False, or Neither? Neither\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night.\nQuestion: Night of Terror won an Academy Award. True, False, or Neither? Neither\n###\nRatatouille is a video game based on the Pixar film of the same name. It was developed at Heavy Iron Studios and released by THQ, on June 26, 2007. \"Ratatouille\" was released on thirteen systems\u2014Wii, Nintendo DS, PlayStation 3, PlayStation 2, PSP, Xbox 360, Xbox, Nintendo GameCube, Game Boy Advance, Microsoft Windows, OS X, J2ME, and mobile phone.\nQuestion: Ratatouille was released on the Wii U True, False, or Neither?", "doc_id": 278, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40929, 19318, 16044, 5413], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ceres\u2013Negros Football Club, commonly referred to as Ceres\u2013Negros or just Ceres, is a Filipino football club based in the city of Bacolod, Negros Occidental that plays in the Philippines Football League. The club is a member of the Negros Occidental Football Association. It was previously known as the Ceres\u2013La Salle Football Club.\nQuestion: The Negros Occidental Football Association was previously known as the Ceres-La Salle Football Club. True, False, or Neither? False\n###\nRoderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins.\nQuestion: Higgins was born the last day of the month. True, False, or Neither? True\n###\nMarques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League.\nQuestion: Marques Ackerman moved to South Africa at a young age. True, False, or Neither? Neither\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521.\nQuestion: South of Port Melbourne is Port Phillip. True, False, or Neither? 
True\n###\nLeonard \"Boogie\" Weinglass (born 1941) is a charismatic American businessman who founded retailer Merry-Go-Round, a chain of restaurants named Boogie\u2019s Diner, and whose early life was portrayed by actor Mickey Rourke in the 1982 classic American film \"Diner\".\nQuestion: Boogie's Diner was created in the 1980s. True, False, or Neither?", "doc_id": 448, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10487, 21020, 10278, 8092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Live from the Gaiety is a live album by The Dubliners. It was recorded during the Irish leg of their tour celebrating forty years on the road. The double album was recorded at the Gaiety Theatre in Dublin in June 2002. All surviving members took part. A companion double DVD of the concert in its entirety was also released.\nQuestion: The tour was celebrating the 40th year on the road. All the dead members took part in the tour. True, False, or Neither? False\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005.\nQuestion: Samuel Eto'o Fils plays as a striker for the Cameroon club Antalyaspor. True, False, or Neither? False\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC.\nQuestion: SNOBOL stands for \"StriNg Oriental and symBOlic Language\". True, False, or Neither? False\n###\nHappy Mother's Day, Love George (also known Run Stranger, Run) is a 1973 American mystery film produced and directed by Darren McGavin. The film stars Patricia Neal, Cloris Leachman, Bobby Darin, Tessa Dahl, Ron Howard, Kathie Browne, Joe Mascolo, Simon Oakland, and Thayer David.\nQuestion: Run Stranger, Run was released in the later half of the 1980s True, False, or Neither? False\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia.\nQuestion: Wallace Michael Ross was not into art. True, False, or Neither?", "doc_id": 66, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27400, 1164, 17077, 40903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael Cassio, or simply Cassio, is a fictional character in William Shakespeare's \"Othello\". The source of the character is the 1565 tale \"Un Capitano Moro\" by Cinthio; Cassio is unnamed in Cinthio but referred to as \"the squadron leader\". In the play, Cassio is a young and handsome lieutenant under Othello's command who becomes one of Iago's several victims in a plot to ruin Othello.\nQuestion: Cassio was young in real life True, False, or Neither? Neither\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration.\nQuestion: The High Bridge Branch line followed the North Branch of the Raritan River for much of its duration. True, False, or Neither? False\n###\nMount Willey is a mountain located in Grafton County, New Hampshire. The mountain is named after Samuel Willey, Jr. (1766\u20131826) and his family, who in 1825 moved into a house in Crawford Notch. The family was killed a year later in August 1826 during a landslide.\nQuestion: The family was killed in Mount Grafton. True, False, or Neither? False\n###\n\"Big Jet Plane\" is a song written by Australian singer/songwriter Angus Stone, and originally recorded by Stone, under the name \"Lady of the Sunshine\", on his 2009 album \"Smoking Gun\". It was then re-recorded by Stone and his sister Julia, as the duo Angus & Julia Stone, and released on their 2010 album \"Down the Way\", as well as their 2010 EP \"Big Jet Plane\".\nQuestion: Big Jet plane was written by Aerosmith. True, False, or Neither? False\n###\nJames Carlos Agravante Yap Sr. (born February 15, 1982) is a Filipino professional basketball player for the Rain or Shine Elasto Painters of the Philippine Basketball Association (PBA). Known by his nickname Big Game James, he had played for the Star Hotshots for twelve seasons winning seven PBA championships before being traded on 2016. He is also a twelve-time PBA All-Star through 2004 to 2015.\nQuestion: James Carlos ended his carreer after playing for twelse sevens in a row winning seven PBS championships True, False, or Neither?", "doc_id": 623, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32764, 43119, 24411, 1763], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Adriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. 
One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Adriano Correia Claro is married True, False, or Neither? Neither\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke.\nQuestion: Lost averaged more than 19 episodes per season by the 6th season True, False, or Neither? True\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart.\nQuestion: Hopeless Opus is a catchy song. True, False, or Neither? Neither\n###\nK Theory is an electronic hip-hop act by Dylan Lewman, which formerly included Dustin Musser and Malcolm Anthony. The group was founded by Dylan Lewman and Dustin Musser in 2011. They have done official remixes for Flo Rida's \"GDFR\", Rich Homie Quan's \"Flex\", Fetty Wap's \"Trap Queen\", and many more songs. Their remixes and originals have over to 100 millions plays across all platforms.\nQuestion: K Theory's originals are not as popular as their remixes. True, False, or Neither? Neither\n###\nFuhrmann & Schmidt Brewing Company was formed in 1906 and was located at Commerce and Washington Streets in Shamokin, Pennsylvania. Fuhrmann & Schmidt was the successor company to the Eagle Brewing Company (1854 \u2013 1878), the M. Markel & Company (1878 \u2013 1893) and Phillip H Fuhrmann (1893 \u2013 1906).\nQuestion: Fuhrmann & Schmidt Brewing Company has in its name only one of the three entities it was named after. True, False, or Neither?", "doc_id": 870, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30330, 25993, 41429, 34246], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "City Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle.\nQuestion: City Mall is one of 4 malls in amman True, False, or Neither? Neither\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. 
Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks.\nQuestion: Aside from 280mg caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks. True, False, or Neither? Neither\n###\nFinniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla.\nQuestion: B.T. Finniss is the leader of the electoral district of the House of Assembly. True, False, or Neither? Neither\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships.\nQuestion: Angus Scott was born in Scotland True, False, or Neither? Neither\n###\nAnna Pihl is a Danish police drama produced by TV2. The series stars Charlotte Munck (\"Kongekabale\") as the title character Anna Pihl, Peter Mygind, and Iben Hjejle (\"High Fidelity\" and \"Blinkende Lygter\") as Mikala. Three seasons have been produced between 2006 and 2008, each having 10 episodes.\nQuestion: Anna Pihl is a short show True, False, or Neither?", "doc_id": 958, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29162, 41285, 37556, 551], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In poker the term Triple Crown is used for winning a poker title on the three major poker tours: The World Series of Poker (WSOP), World Poker Tour (WPT) and up to 2016 the European Poker Tour (EPT). Since the EPT has been discontinued and rebranded as the PokerStars Championship, those wins are considered part of the crown.\nQuestion: One has to win all the tree major poker tours to earn the Triple Crown title. True, False, or Neither? Neither\n###\nThe 1957 Wednesbury by-election was held on 28 February 1957 after the incumbent Labour MP, Stanley Evans, resigned from the House of Commons and the Labour Party after he had refused to vote against the Conservative government on the Suez Crisis. The Labour candidate, John Stonehouse, retained the seat with an increased majority.\nQuestion: Political problems in Egypt were contributing factors in Stanley Evans' resignation True, False, or Neither? Neither\n###\nPrivate First Class Jose F. Valdez (January 3, 1925 - February 17, 1945) was a United States Army soldier who posthumously received the Medal of Honor \u2014 the United States' highest military decoration \u2014 for his actions near Rosenkranz, France, in the Battle of the Colmar Pocket during World War II.\nQuestion: Private First Class Jose F. Valdez was born over 28 years ago True, False, or Neither? 
True\n###\n\"The Orange and the Green\" or \"The Biggest Mix-Up\" is a humorous Irish folk song about a man whose father was a Protestant (\"Orange\") and whose mother was a Catholic (\"Green\"). It describes the man's trials as the product of religious intermarriage and how \"mixed up\" he became as a result of such an upbringing.\nQuestion: The Biggest Mix-Up is a serious folk song. True, False, or Neither? False\n###\nPeter John \"P. J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\".\nQuestion: Peter John Carlesimo coached basketball for a half century True, False, or Neither?", "doc_id": 794, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35735, 34430, 19085, 42345], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forest Hill Vineyard (also referred to as Forest Hill Wines) is an Australian winery business based in the Great Southern wine region of Western Australia. Its vineyard is west of Mount Barker, and its winery and cellar door are further south, at Denmark.\nQuestion: Forest Hill Vineyard is the best winery business True, False, or Neither? Neither\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position.\nQuestion: The Consolidated Tape Association has no association with real-time trade. True, False, or Neither? False\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television.\nQuestion: Sebastian Bach is canadian True, False, or Neither? True\n###\nThe Sierra Leone Civil War (1991\u20132002) began on 23 March 1991 when the Revolutionary United Front (RUF), with support from the special forces of Charles Taylor\u2019s National Patriotic Front of Liberia (NPFL), intervened in Sierra Leone in an attempt to overthrow the Joseph Momoh government. The resulting civil war lasted 11 years, enveloped the country, and left over 50,000 dead.\nQuestion: The Sierra Leone Civil War, from 2002 to 1991 when it all was getting started, was a war that never should have been necessary, and never existed as far as the majority of the world was concerned although it went on for over eleven years and the casualties were numerous. True, False, or Neither? Neither\n###\n\"It's Not Right but It's Okay\" is a song by American singer Whitney Houston, from her fourth studio album, \"My Love Is Your Love\". 
It was written by LaShawn Daniels, Rodney Jerkins, Fred Jerkins III, Isaac Phillips, Toni Estes, and produced by Darkchild. The song examines a woman confronting her lover about his infidelity.\nQuestion: The song was the title track to the album. True, False, or Neither?", "doc_id": 443, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4582, 32066, 24600, 40187], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Krishan Kant ran for re-election in 2002 True, False, or Neither? False\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km .\nQuestion: Jiaozhou Bay Bridge is located in a quiet part of China True, False, or Neither? Neither\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph Ernst Friedrich von Forcade de Biaix sentenced many people to death. True, False, or Neither? Neither\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration.\nQuestion: High Bridge, New Jersey is near the Raritan River True, False, or Neither? True\n###\nMany science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list.\nQuestion: many science fiction works have been set in the 20th century. True, False, or Neither?", "doc_id": 251, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38154, 6649, 11758, 37287], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: Amy Timberlake likes to read True, False, or Neither? Neither\n###\nDrifters is a British sitcom that stars Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke as three female friends who live in Leeds following their graduation from university. All three actresses had previously appeared together in \"The Inbetweeners Movie\". Four series were broadcast, between 2013 and 2016.\nQuestion: The Drifters sitcom ended in 2013. True, False, or Neither? False\n###\nWilliam Lang Denholm \"Bill\" McCue OBE (1934\u20131999) was a Scottish singer known for his performances in opera, musical theatre and traditional Scottish folk music. In 1982 he was awarded an OBE for his contribution to Scottish music. In 1999 he died aged 65.\nQuestion: William Lang Denholm \"Bill\" McCue starred in Cheers. True, False, or Neither? Neither\n###\nThe 18th Critics' Choice Awards were presented on January 10, 2013 at the Barker Hangar at the Santa Monica Airport, honoring the finest achievements of 2012 filmmaking. The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2012.\nQuestion: The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2019. True, False, or Neither? False\n###\nDhanish Karthik (born 24 July 1989) is an Indian actor. He made his debut as Sanjeev Menon in the Malayalam film \"Ivide\" (2015) directed by Shyamaprasad. He recently finished filming for the Bollywood film Chef (2017 film) with Saif Ali Khan. The film, directed by Raja Krishna Menon, is slated to release in July 2017. This will be Karthik's debut in Bollywood.\nQuestion: Raja Krishna Menon will direct Karthik's next movie. True, False, or Neither?", "doc_id": 194, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39252, 11985, 38913, 24323], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Taina is an American sitcom that aired on Nickelodeon and distributed by Nelvana Limited. It was one of the last live-action comedy shows taped at Nickelodeon Studios but later moved to the Nickelodeon on Sunset in Hollywood, for its second season. The show aired from January 14, 2001 to May 11, 2002.\nQuestion: Taina had 2 main characters on the show. True, False, or Neither? 
Neither\n###\nThe Amboy Dukes were an American rock band formed in 1964 in Detroit, Michigan, best known for their one hit single \"Journey to the Center of the Mind\". The band's name comes from the title of a novel by Irving Shulman. In the UK the group's records were released under the name of The American Amboy Dukes because of the existence of a British group with the same name.\nQuestion: The Amboy Dukes has been to North Korea. True, False, or Neither? Neither\n###\nThe Protectorate of Bohemia and Moravia (German: \"Protektorat B\u00f6hmen und M\u00e4hren\" ; Czech: \"Protektor\u00e1t \u010cechy a Morava\" ) was a protectorate of Nazi Germany established following the German occupation of Czechoslovakia. Earlier in 1938, with the Munich Agreement, Sudetenland territory of Czech Lands was incorporated into Nazi Germany as a Reichsgau.\nQuestion: The Protectorate of Bohemia and Moravia was a protectorate of Nazi Germany established following the German occupation of Czechoslovakia. Earlier in 1938, with the Munich Agreement, Sudetenland territory of Czech Lands was incorporated into Nazi Germany as a Reichsgau. People were unhappy. True, False, or Neither? Neither\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif.\nQuestion: There is a Man in Our House is a bad film. True, False, or Neither? Neither\n###\nHuevos a la mexicana is a popular breakfast dish in Mexican cuisine. Finely chopped tomato, green chili pepper and onion is lightly fried in a hot skillet. Eggs are added and stirred until set. The heat is turned off and the coriander leaves are mixed in the eggs, adding salt. Refried beans is a common accompaniment.\nQuestion: Huevos a la mexicana is made outside of Mexico. True, False, or Neither?", "doc_id": 822, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14553, 20922, 7589, 32976], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Santa Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: Santa Lucia is a place of education at present. True, False, or Neither? True\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: LYU is located in China. True, False, or Neither? True\n###\nGordon Hendrick (February 16, 1949) is a former Republican member of the Montana Legislature. 
He was elected to House District 14 which represents the Superior area. Due to Montana's term limits, he was ineligible to run for re-election in 2012. He was succeeded by Republican candidate Nicholas Schwaderer for the 2013 legislature cycle.\nQuestion: Hendrick is considering a run for president. True, False, or Neither? Neither\n###\nThe 1992 Ohio State Buckeyes football team represented the Ohio State University in the 1992 NCAA Division I-A football season. The Buckeyes compiled an 8\u20133\u20131 record, including the 1993 Florida Citrus Bowl in Orlando, Florida, where they lost, 21\u201314, to the Georgia Bulldogs.\nQuestion: The 1992 Ohio State Buckeyes football team represented the Ohio State University in the 1992 NCAA Division I-B football season True, False, or Neither? False\n###\nHow to Steal a Million is a 1966 heist comedy film, directed by William Wyler and starring Audrey Hepburn, Peter O'Toole, Eli Wallach and Hugh Griffith. The picture is set and was filmed in France, though the characters speak entirely in English. Audrey Hepburn's clothes were designed by Givenchy.\nQuestion: How to Steal a Million is funny. True, False, or Neither?", "doc_id": 476, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24831, 10257, 4385, 28009], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Humans Need Not Apply is a 2014 short Internet documentary film, directed, produced, written, and edited by CGP Grey. The film focuses on the future of the integration of automation into economics, as well as the impact of this integration to the worldwide workforce. It was released online as a YouTube video.\nQuestion: Humans Need Not Apply involves automation True, False, or Neither? True\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer.\nQuestion: Loui Jover knows what light aperture is True, False, or Neither? True\n###\nVictor H. Halligan (November 22, 1892 \u2013 March 10, 1973) was an American football player. He played for the University of Nebraska from 1912 to 1914 and was the first All-American football player to be selected from the Nebraska Cornhuskers football team.\nQuestion: Victor played three season of college football. True, False, or Neither? True\n###\nThe Australian Football League celebrates the best goal of the season through the annual Goal of the Year competition. In 2011, this is officially known as the Panasonic AFL Goal of the Year. Each round three goals are nominated and fans are able to vote online for their favourite here .\nQuestion: the Panasonic AFL Goal of the Year is about celebratin the best goal of the season True, False, or Neither? True\n###\n2 Cool 2 Be 4gotten is a 2016 Filipino coming-of-age drama film directed by Petersen Vargas in his feature-length directorial debut and written by Jason Paul Laxamana. 
The film stars Khalil Ramos, Ethan Salvador and Jameson Blake. It depicts the mysterious coming-of-age tale of Felix after he met half-American Snyder brothers, Magnus and Maxim.\nQuestion: 2 Cool 2 Be 4gotten was originally meant to be released in 2019. True, False, or Neither?", "doc_id": 429, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4587, 29033, 16311, 45117], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Al Overton Jr. is an American sound engineer. He has been nominated for four Academy Awards in the category Best Sound. He has worked on over 40 films between 1969 and 1991. His father, Al Overton, was also nominated for an Academy Award for Best Sound.\nQuestion: Al Overton Jr. was nominated for an Academy Award between 1969 and 1991. True, False, or Neither? True\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season.\nQuestion: The team did not work hard enough. True, False, or Neither? Neither\n###\nOur Lady of Confidence, also known as La Madonna della Fiducia or Our Lady of Trust, is a venerated image depicting the Blessed Virgin Mary enshrined at the Lateran Basilica. The feast of Our Lady of Confidence falls on the last Saturday prior to Lent.\nQuestion: Our Lady of Confidence is enshrined at the Lateran Basilica True, False, or Neither? True\n###\nTake Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\".\nQuestion: Take Two features the only duet Marvin Gaye has ever performed True, False, or Neither? False\n###\nThe S-99 (Russian: \u0421-99 ) experimental submarine was the only ship of the Soviet Project 617 submarine class (NATO reporting name: Whale class) that the Soviet Union built during the early Cold War and the only Soviet submarine which had a Walter engine fuelled by high test peroxide (HTP).\nQuestion: The Soviet Union used a Walter engine fuelled by high test peroxide (HTP) only once - for the experimental submarine S-99 (Russian: \u0421-99). True, False, or Neither?", "doc_id": 814, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [276, 17728, 4914, 11657], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robin Hobb is the pen name of Margaret Astrid Lindholm Ogden (born March 5, 1952), an American writer. She is best known for the books set in the Realm of the Elderlings, which started in 1995 with the publication of \"Assassin's Apprentice\", the first book in the Farseer trilogy.\nQuestion: Margaret was a french novelist. True, False, or Neither? False\n###\nThe Lord of the Rings: The Fellowship of the Ring is a 2001 New Zealand-American epic high fantasy adventure film directed by Peter Jackson based on the first volume of J. R. R. Tolkien's \"The Lord of the Rings\" (1954\u20131955). It is the first installment in \"The Lord of the Rings series\", and was followed by \"\" (2002) and \"\" (2003), based on the second and third volumes of \"The Lord of the Rings\".\nQuestion: Each film in the trilogy was based on the corresponding volume of J. R. R. Tolkien's \"The Lord of the Rings.\" True, False, or Neither? True\n###\nKathryn Jane Calder (born June 17, 1982) is a Canadian indie rock musician, who performs as a solo artist, and is a member of the band The New Pornographers. She is a former member of Immaculate Machine. Calder started with The New Pornographers by filling in for Neko Case for live performances and was made a permanent member in 2006.\nQuestion: Neko's absence left a void in the band that fans feel Kathryn has not adequately filled. True, False, or Neither? Neither\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o emigrated to Brazil when he was younger than 22. True, False, or Neither? True\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98.\nQuestion: Death Race was set in 1998 True, False, or Neither?", "doc_id": 53, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20461, 34218, 4709, 28794], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Charles Rashad Jamaal Brown (born April 10, 1987) is a former American football offensive tackle in the National Football League (NFL) for the New Orleans Saints, New York Giants and Dallas Cowboys. 
He played college football at USC, where he won the Morris Trophy, recognizing the best offensive and defensive linemen on the West Coast in 2009.\nQuestion: The Morris Trophy is made of gold. True, False, or Neither? Neither\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: The last song on Smithereens doesn't contain electric instruments. True, False, or Neither? True\n###\nThe History Boys is a 2006 British comedy-drama film adapted by Alan Bennett from his play of the same name, which won the 2005 Olivier Award for Best New Play and the 2006 Tony Award for Best Play. It was directed by Nicholas Hytner, who directed the original production at the Royal National Theatre in London, and features the original cast of the play.\nQuestion: The history boys is a 2002 film True, False, or Neither? False\n###\nThe Reid Report is an hour-long weekday U.S. and world political commentary program on MSNBC. Hosted by Joy-Ann Reid, it premiered on February 24, 2014, in the time slot formerly occupied by \"NewsNation with Tamron Hall\". The show ended on February 27, 2015 due to low ratings.\nQuestion: When did the Reid Report get cancelled? True, False, or Neither? True\n###\n\"Chasing Colors\" is a song recorded by electronic DJs Marshmello and Ookay featuring the vocals of American singer Noah Cyrus. It was written by Marshmello, Ookay, Skyler Stonestreet and Chase Duddy and released on 24 February 2017 via Marshmello's label Joytime Collective.\nQuestion: Chasing Colors was released near the tail of 2017. True, False, or Neither?", "doc_id": 421, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17259, 43302, 36504, 4536], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Remember the Daze is a 2007 drama film released in theaters in April 2008. The film was directed by Jess Manafort. The plot of the movie has been described as \"a glimpse into the teenage wasteland of suburbia 1999 that takes place over 24-hours, and the teenagers who make their way through the last day of high school in the last year of the past millennium.\"\nQuestion: Remember the Daze was released in 2002. True, False, or Neither? False\n###\nBailey Gatzert (December 29, 1829 \u2013 April 19, 1893) was an American politician and the eighth mayor of Seattle, Washington, serving from 1875 to 1876. He was the first Jewish mayor of Seattle, narrowly missing being the first Jewish mayor of a major American city (Moses Bloom became mayor of Iowa City, Iowa, in 1873), and has been the only Jewish mayor of Seattle to date.\nQuestion: Moses Bloom was a Jew. True, False, or Neither? True\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). 
It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars.\nQuestion: XXXX is not found on tap everywhere in Queensland True, False, or Neither? Neither\n###\nRoger Heman Sr. (February 27, 1898 \u2013 March 14, 1969) was an American sound engineer. He won an Academy Award for Best Special Effects and was nominated for four more in the same category. He worked on more than 350 films during his career. His son was also a sound engineer.\nQuestion: Roger Heman Sr. only won one Academy Award in his lifetime. True, False, or Neither? Neither\n###\nThe Sisters of Mercy are an English gothic rock band, formed in 1980 in Leeds, United Kingdom (UK). After achieving early underground fame there, the band had their commercial breakthrough in mid-1980s and sustained it until the early 1990s, when they stopped releasing new recorded output in protest against their record company Time Warner. Currently, the band is a touring outfit only.\nQuestion: The Sisters of Mercy were formed in nineteen hundred eighty two. True, False, or Neither?", "doc_id": 636, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5696, 14702, 42078, 29351], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tasmanian Devils is a 2013 television film directed by Zach Lipovsky and starring Danica McKellar and Apolo Ohno. The movie was first released onto the Syfy channel on January 19, 2013 and centers around a group of friends that get attacked by extremely large tasmanian devils. \"Radio Times\" rated the film poorly, giving it two out of 5 stars.\nQuestion: Danica Lipovsky and Zach Ohno had roles in the movie. True, False, or Neither? False\n###\nThe Chatot (also Chacato or Chactoo) were a Native American tribe who lived in the upper Apalachicola River and Chipola River basins in what is now Florida. They spoke a Muskogean language, which may have been the same as that of the Pensacola people.\nQuestion: The Chatot spoke mainly English True, False, or Neither? False\n###\nMargaret Lucille Jeanne Parker (born 24 July 1943) is a Member of the European Parliament (MEP) for the East Midlands region for the UK Independence Party. She was elected in 2014. She was born in Grantham and educated at Kesteven and Grantham Girls' School and De Montfort University where she read Law.\nQuestion: M.L. Jeanne Parker was born in the UK True, False, or Neither? True\n###\nMarry Him If You Dare (; lit. Mirae's Choice or Future's Choice) is a 2013 South Korean television series starring Yoon Eun-hye, Lee Dong-gun, Jung Yong-hwa, Han Chae-ah, and Choi Myung-gil. It aired on KBS2 from October 14 to December 3, 2013 on Mondays and Tuesdays at 22:00 for 16 episodes.\nQuestion: Marry Him If You Dare had acting. True, False, or Neither? True\n###\nAstana ( , ; Kazakh: Astana ] ) is the capital city of Kazakhstan. It is located on the banks of Ishim River in the north portion of Kazakhstan, within the Akmola Region, though administrated separately from the region as a city with special status. 
The 2017 census reported a population of 1,006,574 within the city limits, making it the second-largest city in Kazakhstan, behind Almaty.\nQuestion: There were two other cities that had a higher population compared to Astana in 2017. True, False, or Neither?", "doc_id": 649, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14020, 9259, 37018, 35757], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Antonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\".\nQuestion: There are not three other members of Flatbush Zombies. True, False, or Neither? True\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: The film was released 4 years after 1940. True, False, or Neither? True\n###\nMadava Farms is an 800-acre certified organic maple syrup enterprise located primarily in Dover, New York. The farm is the maker of Crown Maple Syrup, and it is considered to be the largest maple syrup production facility in North America. \nQuestion: 800-acres is a large farm. True, False, or Neither? Neither\n###\n\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough.\nQuestion: Merry Christmas, Charlie Manson! aired over 10 years ago True, False, or Neither? True\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\".\nQuestion: Hooked on a Feeling's cover was originally performed by B. J. Thomas. True, False, or Neither?", "doc_id": 171, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32933, 12167, 38770, 8880], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. 
Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport.\nQuestion: Gulf Air flies to Paris. True, False, or Neither? Neither\n###\nThis is a list of hotels in the Caribbean. The Caribbean is a region that consists of the Caribbean Sea, its islands (some surrounded by the Caribbean Sea and some bordering both the Caribbean Sea and the North Atlantic Ocean), and the surrounding coasts. The region is southeast of the Gulf of Mexico and the North American mainland, east of Central America, and north of South America.\nQuestion: The Caribbean region is located South of Mexico. True, False, or Neither? False\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\nQuestion: 2014 was the best year for Emperor. True, False, or Neither? Neither\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address).\nQuestion: Home Depot sells American-made tools. True, False, or Neither? Neither\n###\n\"Flatline\" is the ninth episode of the eighth series of the British science fiction television programme \"Doctor Who\", written by Jamie Mathieson, and directed by Douglas Mackinnon. The episode stars Peter Capaldi and Jenna Coleman, with Joivan Wade and Christopher Fairbank guest starring. The episode received critical acclaim, with particular praise directed at Coleman's performance.\nQuestion: Doctor Who had more than one episode called \"Flatline\" True, False, or Neither?", "doc_id": 547, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36997, 27218, 39697, 9489], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The Georgia Tech Yellow Jackets are a very badteam True, False, or Neither? Neither\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. 
Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\".\nQuestion: Kidsty Pike has in recent years flowed away from the English Lake DIstrict True, False, or Neither? False\n###\nBugger or \"buggar\" is a slang word. In the United Kingdom, the term is a general-purpose expletive, used to imply dissatisfaction, or to refer to someone or something whose behaviour is in some way displeasing or perhaps surprising. In the US, particularly in the Midwest and South, it is a slang but not offensive noun meaning \"small critter.\"\nQuestion: Bugger is offensive to short people. True, False, or Neither? Neither\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley.\nQuestion: Taylor was born around 10 o' clock in the morning True, False, or Neither? Neither\n###\nDennis Princewell Stehr (born 15 May 1984), better known by his stage name Mr Probz, is a Dutch singer, musician and actor. In 2013, he released the song \"Waves\", which was remixed in 2014 by Robin Schulz, becoming an international hit. He has released one album and featured in the film Bolletjes Blues.\nQuestion: Stehr wrote Waves in 1988. True, False, or Neither?", "doc_id": 351, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32160, 26452, 38910, 23148], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The UK Parliament constituency of County Galway was an historic Irish constituency, comprised the whole of County Galway, except for the Borough of Galway. It replaced the pre-Act of Union Parliament of Ireland constituency. Its representatives sat in the British House of Commons.\nQuestion: County Galway is historically important. True, False, or Neither? Neither\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta.\nQuestion: Ahmad Kemal Idris is still alive True, False, or Neither? False\n###\nFrank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ.\nQuestion: Frank Viola is a Chinese chef. True, False, or Neither? False\n###\nThe interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. 
It is an alternative to the extraterrestrial hypothesis (ETH).\nQuestion: a lot of people think the IDH is bogus True, False, or Neither? Neither\n###\nJohn Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s.\nQuestion: John Henry Newman died in eighteen hundred fifty six. True, False, or Neither?", "doc_id": 727, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41533, 44486, 4434, 1714], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Petasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season.\nQuestion: Petasites flowered with the sun True, False, or Neither? Neither\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts.\nQuestion: \"Breakfast in Birmingham\" was a Top 50 hit. True, False, or Neither? Neither\n###\nThe New Pornographers is a Canadian indie rock band formed in 1997 in Vancouver, British Columbia. Presented as a musical collective of singer-songwriters and musicians from multiple projects, the band has released seven studio albums to critical acclaim for their use of multiple vocalists and elements of power pop incorporated into their music.\nQuestion: The New Pornographers is a Canadian indie rock band formed in 1997 in the basement of a bar True, False, or Neither? Neither\n###\n\"Paul Revere\" is a song by American hip hop group Beastie Boys, released as the third single from their debut album \"Licensed to Ill\" (1986). It was written by Adam Horovitz, Joseph Simmons, Darryl McDaniels, and Rick Rubin. It was produced by Rick Rubin and the Beastie Boys. The song tells a fictional story of how the Beastie Boys met.\nQuestion: The song, Paul Revere, was released by an American Hip Hop Group called Boys Beastie. True, False, or Neither? False\n###\nI Love Hong Kong is a 2011 Hong Kong comedy film produced and directed by Eric Tsang. Film stars Tsang, Tony Leung Ka-fai, Sandra Ng and a star-studded cast of Hong Kong stars. It was released in Chinese New Year Day. The sequel movies are I Love Hong Kong 2012 and I Love Hong Kong 2013.\nQuestion: I Love Hong Kong is a 2013 is the second sequel to I love Hong Kong 2011. 
True, False, or Neither?", "doc_id": 452, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13522, 32163, 16179, 35941], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arthur John Duckworth (born 19 January 1949) is a former Australian rules footballer who played for Fitzroy in the Victorian Football League (VFL), West Perth in the West Australian National Football League (WANFL), and Central District in the South Australian National Football League (SANFL). He is the older brother of former Essendon footballer Billy Duckworth.\nQuestion: Arthur John Duckworth was born more than 29 years ago. True, False, or Neither? True\n###\nKim Da-som (born May 6, 1993), better known mononymously as Dasom, is a South Korean singer and actress. She is best known as a former member of South Korean girl group Sistar under Starship Entertainment. She has acted in films and television dramas, including \"Family\" (2012\u20132013), \"Melody of Love\" (2013\u20132014) and \"The Virtual Bride\" (2015).\nQuestion: Kim Da-Som is a Korean actress and singer who's fame started with her membership in a So. Korean girl group, and then she moved on to acting in films and television dramas, as well as writing a memoir before her passing during the making of The Virtual Bride in 2015 True, False, or Neither? Neither\n###\nArthur C. Clarke's World of Strange Powers is a popular thirteen-part British television series looking at strange worlds of the paranormal. It was produced by Yorkshire Television for the ITV network and first broadcast in 1985. It was the sequel to the 1980 series \"Arthur C. Clarke's Mysterious World\".\nQuestion: World of Strange Powers delves into paranormal topics. True, False, or Neither? True\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch.\nQuestion: Adriano Correia Claro has a record number of goals for his team True, False, or Neither? Neither\n###\nBonnie Doon stop is a tram stop under construction in the Edmonton Light Rail Transit network in Edmonton, Alberta, Canada. It will serve the Valley Line, and is located on the west side of 83 Street, south of 84 Avenue, between Bonnie Doon and Idylwylde. The stop is scheduled to open in 2020.\nQuestion: Bonnie Doon will open right after the new year True, False, or Neither?", "doc_id": 204, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20128, 23305, 23002, 17738], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California.\nQuestion: Grafton has written a series of novels. True, False, or Neither? True\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents.\nQuestion: The population is less now than 2019 True, False, or Neither? Neither\n###\n...In Black and White is the 12th studio album by American country artist Barbara Mandrell. The album was released in April 1982 on MCA Records and was produced by Tom Collins. It was Barbara Mandrell's first studio album in two years since the release of \"Love Is Fair\".\nQuestion: In Black and White has been covered by Ice T. True, False, or Neither? Neither\n###\n[id] is the third studio album by deathcore band Veil of Maya. It was released through Sumerian Records on April 6, 2010. They worked with producer Michael Keene of death metal band The Faceless on this album. Keene previously worked with the band, producing their previous album \"The Common Man's Collapse\". It is the band's only album to feature bassist Matthew C. Pantelis.\nQuestion: Veil of Maya didn't always work with Pantelis. True, False, or Neither? True\n###\nWings over America is a triple live album by Wings, released in December 1976. The album was recorded during American leg of the band's acclaimed 1975\u201376 Wings Over the World tour. It peaked at number 8 on the UK Albums Chart and reached number 1 on the US \"Billboard\" Top LPs & Tape chart.\nQuestion: Critical acclaim doesn't mean anything. True, False, or Neither?", "doc_id": 184, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [45220, 14719, 23860, 11903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Roberto Cammarelle (born 30 July 1980) is an Italian boxer, best known for winning the World Amateur Boxing Championships in 2007 (Chicago) and 2009 (Milan) as a super heavyweight and a gold medal at the 2008 Olympic Games in Beijing. He won a silver medal in 2012 London Olympics Games, losing to Anthony Joshua, by a contested jury's decision that was unsuccessfully appealed.\nQuestion: Roberto Cammarelle won the World Amateur Boxing Championships 3 years in a row True, False, or Neither? 
False\n###\nScience in History is a four-volume book by scientist and historian John Desmond Bernal, published in 1954. It was the first comprehensive attempt to analyse the reciprocal relations of science and society throughout history. It was originally published in London by Watts. There were three editions up to 1969 an. It was republished by MIT Press in 1971 and is still in print.\nQuestion: Science in History was published less than 660 months ago. True, False, or Neither? False\n###\nGrindhouse Releasing is a Hollywood-based independent cult film distribution company led by film editor Bob Murawski and co-founded by Sage Stallone. Grindhouse digitally remasters, restores, and produces bonus materials and video documentaries for cult film DVDs and Blu-rays which it distributes on the CAV label.\nQuestion: Grindhouse Releasing released a movie last year True, False, or Neither? Neither\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna.\nQuestion: The University of Bologna owns the former ancient Roman Catholic church Santa Lucia. True, False, or Neither? Neither\n###\nOliver Bierhoff (] ; born 1 May 1968) is a retired German former footballer who scored the first golden goal in the history of major international football, for Germany in the Euro 96 final, a career-defining performance that vaulted him into the international limelight.\nQuestion: Oliver Bierhoff's career blew up after his golden goal in the Euro 96 final. True, False, or Neither?", "doc_id": 903, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35291, 9298, 14751, 754], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Andrea von Habsburg (\"Andrea Maria von Habsburg-Lothringen\") Archduchess of Austria, Hereditary Countess of Neipperg, (born 30 May 1953, in W\u00fcrzburg, Bavaria), is the first child and oldest daughter of Otto von Habsburg and his wife Princess Regina of Saxe-Meiningen.\nQuestion: Andrea von Habsburg isn't the only daughter of her parents. True, False, or Neither? True\n###\nMichael George Stroka (May 9, 1938 in Passaic, New Jersey \u2013 April 14, 1997) was an American actor on soap operas like ABC-TV's \"Dark Shadows\", in which he played Aristede, Bruno Hess, and Laszlo Ferrari from 1969 to 1970. In addition, he made a cameo appearance as a pallbearer in the MGM film, \"House of Dark Shadows\", the first of two feature films based on the ABC soap opera.\nQuestion: Michael George Stroka died in Passaic, New Jersey. True, False, or Neither? Neither\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". 
Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards.\nQuestion: Jake Deckard started his own production company in 2008, but he had been wanting to do this since the 90's. True, False, or Neither? Neither\n###\nThe Key is a 1958 British war film set in 1941 during the Battle of the Atlantic. It was based on the 1951 novel \"Stella\" by Jan de Hartog (later republished as \"The Distant Shore\" and \"The Key\") and was directed by Sir Carol Reed. William Holden, Sophia Loren and Trevor Howard starred in the production.\nQuestion: Reed went on to direct many movies in his career. True, False, or Neither? Neither\n###\nWilliam Lewis Moody Jr. (January 25, 1865 \u2013 July 21, 1954) was an American financier and entrepreneur from Galveston, Texas, who founded a private bank, an insurance company, and one of the largest charitable foundations in the United States. Moody was active in the day-to-day operations of his companies until two days before his death.\nQuestion: William Lewis Moody Jr. was born in the year directly after 1864. True, False, or Neither?", "doc_id": 998, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6249, 40914, 28068, 15905], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Darrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include:\nQuestion: Darrell Abbot wasn't the only founding member of Damageplan. True, False, or Neither? Neither\n###\nFrederick Ferdinand of Anhalt-K\u00f6then (25 June 1769, Pless \u2013 23 August 1830, K\u00f6then) was a German prince, Ascanian ruler of the principality of Anhalt-Pless and, from 1818, of the duchy of Anhalt-K\u00f6then. He was the second son of Frederick Erdmann, Prince of Anhalt-Pless, and his wife, Louise Ferdinande, daughter of Henry Ernest, Count of Stolberg-Wernigerode.\nQuestion: Louise Ferdinand died in Pless. True, False, or Neither? Neither\n###\nNewlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005.\nQuestion: Nick Lachey and Jessica Simpson were both famous for being musicians. True, False, or Neither? Neither\n###\nThe Newcomers is a 2000 American family drama film directed by James Allen Bradley and starring Christopher McCoy, Kate Bosworth, Paul Dano and Chris Evans. Christopher McCoy plays Sam Docherty, a boy who moves to Vermont with his family, hoping to make a fresh start away from the city. It was filmed in Vermont, and released by Artist View Entertainment and MTI Home Video.\nQuestion: The Newcomers was a box office success. True, False, or Neither? 
Neither\n###\nThe Real Housewives of Atlanta (abbreviated RHOA) is an American reality television series that premiered on October 7, 2008, on Bravo. Developed as the third installment of \"The Real Housewives\" franchise, following \"The Real Housewives of Orange County\" and \"New York City\", it has aired nine seasons and focuses on the personal and professional lives of several women residing in Atlanta, Georgia.\nQuestion: The Real Housewives of Atlanta is the third episode of \"The Real Housewives.\" True, False, or Neither?", "doc_id": 263, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6103, 22922, 29838, 43592], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center.\nQuestion: Tango is a drama made by George Balanchine. True, False, or Neither? False\n###\nBlack Dahlia is a 2006 United States production horror film inspired by the mysterious unsolved murder of the\"Black Dahlia\", Hollywood actress Elizabeth Short. Instead of dramatizing the infamous 1947 murder of Short and the ensuing investigation, writer-director Ulli Lommel follows a series of contemporary L.A.-area homicides patterned after the 1947 slaying.\nQuestion: It scored 10 out of 10 stars True, False, or Neither? Neither\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\".\nQuestion: Kidsty Pike has been seen by earl. True, False, or Neither? Neither\n###\nGeoffrey Zakarian (born July 25, 1959) is an American Iron Chef, restaurateur, television personality and author. He is the executive chef of several restaurants in New York City, Atlantic City and Miami. He is featured on several television programs on the Food Network, including \"Chopped\" and in 2011, \"The Next Iron Chef\", where he won the right to join \"Iron Chef America\".\nQuestion: Geoffrey Zakarian beat the other chefs competing on the episode of \"The Next Iron Chef\" he competed on in 2011. True, False, or Neither? True\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University.\nQuestion: Falwell sang with The Old Time Gospel Hour Quartet. 
True, False, or Neither?", "doc_id": 691, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13539, 24372, 12076, 14539], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harriston (population 1,797) is a community in the Town of Minto in Wellington County, Ontario, Canada. In 1999, Harriston was amalgamated with the communities of Palmerston, Clifford, and Minto Township to form the Town of Minto. Harriston is located at the headwaters of the Maitland River, and has several shops, restaurants, a library, an art gallery and cultural centre.\nQuestion: Harriston is north of the USA. True, False, or Neither? True\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani.\nQuestion: Dariush Mehrjui is also known for a comedy movie that he directed before the Pear tree True, False, or Neither? Neither\n###\nJames Duncan Scurlock (born September 15, 1971) is an American director, producer, writer and financial adviser. He is probably best known for his critically acclaimed documentary \"Maxed Out: Hard Times, Easy Credit and the Era of Predatory Lenders\" and his award-winning book, \"Maxed Out: Hard Times in the Age of Easy Credit\". His most recent book, \"\", is a biography of Larry Hillblom.\nQuestion: James Duncan Scurlock died today True, False, or Neither? False\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: Chris Gardner has a son named Christopher Jr. True, False, or Neither? True\n###\nJoona Veteli (born 21 April 1995) is a Finnish football player currently playing for Norwegian OBOS-ligaen side Fredrikstad. Veteli plays in the position of centre midfielder but can also operate as an attacking midfielder, defensive midfielder, right-back and winger.\nQuestion: Joona Veteli has played professionally in the US True, False, or Neither?", "doc_id": 756, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12912, 37644, 12882, 39055], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Chris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Jay Crawford worked for ESPN. True, False, or Neither? True\n###\nSuper Show 6 - Super Junior World Tour Concert Album is Super Junior's sixth live recorded album, released on 6 November 2015. This album contains two CDs with 36 live recordings from the Super Show 6 concerts held on September 19\u201321, 2014 at the Olympic Gymnastics Arena located in Seoul, South Korea.\nQuestion: the Albums was recorded during the 2014 olympics True, False, or Neither? Neither\n###\nIdentification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\".\nQuestion: Identification Marks: None had a sequel to the film, many years later. True, False, or Neither? Neither\n###\nNewlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005.\nQuestion: MTV shows reality programs instead of music videos. True, False, or Neither? Neither\n###\nAl comp\u00e1s de tu mentira (English: To the Compass of Your Lie ) is a 1950 black-and-white Argentine musical film directed by H\u00e9ctor Canziani. The film was adapted from Oscar Wilde's play \"The Importance of being Earnest\" by Abel Santacruz. The film starred Francisco \u00c1lvarez and Pedro Quartucci.\nQuestion: The film was re-written from English. True, False, or Neither?", "doc_id": 178, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42419, 11613, 20242, 20267], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Time of Your Life is an American television drama series starring Jennifer Love Hewitt that aired for one season on Fox. A spin-off of \"Party of Five\", the series followed Sarah Reeves Merrin as she moved to New York City to learn more about her biological parents. Co-stars included Jennifer Garner, Pauley Perrette and Gina Ravera.\nQuestion: This wasn't a series because it only aired one season. True, False, or Neither? False\n###\nPeter John \"P. 
J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\".\nQuestion: Carlesimo has had a long career in sports True, False, or Neither? True\n###\nFernande Olivier (born Am\u00e9lie Lang; 6 June 1881\u201326 January 1966) was a French artist and model known primarily for having been the model of painter Pablo Picasso, and for her written accounts of her relationship with him. Picasso painted over 60 portraits of Olivier.\nQuestion: There are over 100 paintings of Olivier painted by Picasso. True, False, or Neither? False\n###\nMean Girls 2 is a 2011 American teen comedy television film directed by Melanie Mayron. It is a stand-alone sequel to the 2004 film \"Mean Girls\". The film premiered on ABC Family on January 23, 2011. The film stars Meaghan Martin, Jennifer Stone, Maiara Walsh, Nicole Gale Anderson, Claire Holt, and Diego Boneta. Tim Meadows reprises his role as Principal Ron Duvall from the original film.\nQuestion: Mean Girls 2 inspired many novels. True, False, or Neither? Neither\n###\nKathryn Jane Calder (born June 17, 1982) is a Canadian indie rock musician, who performs as a solo artist, and is a member of the band The New Pornographers. She is a former member of Immaculate Machine. Calder started with The New Pornographers by filling in for Neko Case for live performances and was made a permanent member in 2006.\nQuestion: The Immaculate Machine came before The New Pornographers in Calder's career. True, False, or Neither?", "doc_id": 746, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44931, 15975, 9166, 40321], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Introduction to Finality\" is the 22nd episode of the third season of the American television series \"Community\" and the third season finale. It originally aired on May 17, 2012 on NBC. This was the last episode to air with series creator Dan Harmon as showrunner before he was fired, though Harmon would later return as showrunner for the 5th season.\nQuestion: Community only aired 3 seasons True, False, or Neither? False\n###\nIreland ( ; Irish: \"\u00c9ire\" ] ; Ulster-Scots: \"Airlann\" ] ) is an island in the North Atlantic. It is separated from Great Britain to its east by the North Channel, the Irish Sea, and St George's Channel. Ireland is the second-largest island of the British Isles, the third-largest in Europe, and the twentieth-largest on Earth.\nQuestion: There are two islands larger than Ireland in Europe True, False, or Neither? True\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures.\nQuestion: The film has at least 6 people. True, False, or Neither? 
True\n###\nTribute is a ballet made by Christopher d'Amboise to music by Johann Sebastian Bach. The premi\u00e8re took place Saturday, June 4, 2005, at the School of American Ballet workshop performance, Juilliard Theater, Lincoln Center for the Performing Arts. The New York City Ballet premi\u00e8re was Sunday, February 4, 2007, at the New York State Theater, also at Lincoln Center.\nQuestion: Tribute won an oscar True, False, or Neither? Neither\n###\nJack Christopher Truelove (born 27 December 1995) is an English football player who most recently played for National League North side Hednesford Town on loan from Oldham Athletic. He is currently registered to play for National League North side Curzon Ashton.\nQuestion: Jack Christopher Truelove never played in a national match. True, False, or Neither?", "doc_id": 891, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15949, 18653, 31263, 9514], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shitanshu Hargovindbhai Kotak (born 19 October 1972 in Rajkot) was an Indian first-class cricketer. A left-handed batsman, he has been a prolific run scorer for Saurashtra. Now he is the coach of Saurastra Cricket Team & soon will join Gujarat Lions IPL team as Assistant Coach.\nQuestion: There are 3 vowels in Shitanshu Hargovindbhai Kotak's first name. True, False, or Neither? True\n###\nThe Prague Skate (sometimes titled Golden Skate; from 1994: Czech Skate) is an international figure skating competition. It was a senior event from the 1960s to 1997, usually held in November or December in Prague. Medals were awarded in the disciplines of men's singles, ladies' singles, and pair skating. Since 1999, it is organized in some years as part of the ISU Junior Grand Prix series.\nQuestion: They stopped calling Prague Skate the Golden Skate before 1995 True, False, or Neither? True\n###\nThe Cable Guy is a 1996 American comedy film directed by Ben Stiller, starring Jim Carrey and Matthew Broderick. It was released in the United States on June 14, 1996. The film co-stars Leslie Mann, Jack Black, George Segal, Diane Baker, Eric Roberts, Owen Wilson, Janeane Garofalo, David Cross, Andy Dick, Amy Stiller, and Bob Odenkirk.\nQuestion: In 1996 a comedy called The Cable Guy was released. Many people went to see it but there are mixed ideas as to whether it was a commercial success. Most sources consider it to have been successful. The actors were paid a large amount of money. True, False, or Neither? Neither\n###\nO'Sullivan Army Heliport (ICAO: KCSL,\u00a0FAA LID: CSL) is a U.S. Army heliport at Camp San Luis Obispo in San Luis Obispo County, California, United States. It is located just off California State Route 1, northwest of the city of San Luis Obispo, about halfway between it and Morro Bay. O'Sullivan AHP has one helipad designated H1 with a 2,430 by 75\u00a0ft (741 by 23\u00a0m) asphalt surface.\nQuestion: The Army maintains the Heliport everyday. True, False, or Neither? 
Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: Israel and the neighboring states of Egypt signed a peace treaty. True, False, or Neither?", "doc_id": 440, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27401, 25659, 45242, 37759], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "InterTV Grande Minas is a Brazilian television station affiliated with Rede Globo coverage in the Northern part of the Noroeste, Central and the Jequitinhonha and Mucuri of Minas Gerais. Operates on VHF channel 4, in its headquarters city, Montes Claros / MG. Belongs to the Rede InterTV.\nQuestion: InterTV Grande Minas can be viewed all over Brazil. True, False, or Neither? Neither\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\nQuestion: Emperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. After splitting up in 2001, they reunited from 2005 to 2007 for a few festival dates and brief US tours, and again in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\n True, False, or Neither? True\n###\nH\u00e9ctor Canziani was an Argentine poet, screenwriter and film director who worked in Argentine cinema in the 1940s and 1950s. Although his work was most abundant in screenwriting and poetry after his brief film career, he is best known for his directorship and production of the 1950 tango dancing film Al Comp\u00e1s de tu Mentira based on a play by Oscar Wilde.\nQuestion: The film Al Comp\u00e1s de tu Mentira was the highest grossing film in Argentina during the 1940s. True, False, or Neither? False\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: the A2 motorway is one of the safest roads to drive on in Europe True, False, or Neither? Neither\n###\nVixen! is a 1968 satiric softcore sexploitation film directed by American motion picture director Russ Meyer. 
It was the first film to be given an X rating for its sex scenes, and was a breakthrough success for Meyer. The film was developed from a script by Meyer and Anthony James Ryan, and starred Erica Gavin.\nQuestion: Vixen was written by two men. True, False, or Neither?", "doc_id": 555, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [19789, 19402, 32212, 12223], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rufus Lackland Taylor (January 6, 1910 \u2013 September 14, 1978) was an officer in the United States Navy. There he became Director of the Office of Naval Intelligence and a Vice Admiral. In 1966 he was appointed as Deputy Director of the Defense Intelligence Agency (DIA), then shortly thereafter as Deputy Director of the CIA, where he served from 1966 to 1969.\nQuestion: Rufus Lackland Taylor died older than 50 True, False, or Neither? True\n###\nLinda Ellerbee (born August 15, 1944) is an American journalist who is most known for several jobs at NBC News, including Washington, D.C. correspondent, and also as host of the Nickelodeon network's \"Nick News with Linda Ellerbee\". Her work on \"NBC News Overnight\" was recognized by the jurors of the duPont Columbia Awards as \"possibly the best written and most intelligent news program ever.\"\nQuestion: Linda Ellerbee is an American journalist who is known for her works on \"NBC News Overnight\" as the best written and most intelligent news program ever. True, False, or Neither? Neither\n###\nWinning America is a documentary television film about the Canadian band Said the Whale. It follows the band on their first US tour down through California, and then to South by Southwest. It premiered on CBC Television on July 23, 2011. The film was directed by Brent Hodge and Thomas Buchan, and was produced by Brent Hodge, Jon Siddall and Sheila Peacock. It was nominated for a Leo Award in 2012.\nQuestion: 5 months after premiering on CBC Television, Winning America was released on DVD. True, False, or Neither? Neither\n###\nYulia Victorovna Makhalina (Russian: \u042e\u043b\u0438\u044f \u0412\u0438\u043a\u0442\u043e\u0440\u043e\u0432\u043d\u0430 \u041c\u0430\u0445\u0430\u043b\u0438\u043d\u0430 ), also Yulia, (born 23 June 1968) is a Russian ballet dancer. Since 1986, she has been with the Kirov/Mariinsky Ballet where she is a principal dancer. Along with Ulyana Lopatkina, Makhalina is a member of 'the basketball team', a group of Kirov dancers who are characterized for being especially tall and slender.\nQuestion: the basketball team is unsuccessful in their efforts True, False, or Neither? Neither\n###\nKiss of the Spider Woman is a musical with music by John Kander and Fred Ebb, with the book by Terrence McNally. It is based on the Manuel Puig novel \"El Beso de la Mujer Ara\u00f1a\". The musical had runs in the West End (1992) and Broadway (1993) and won the 1993 Tony Award for Best Musical.\nQuestion: Kiss of the Spider Woman won a Tony Award 2 years after it first ran in 1992. 
True, False, or Neither?", "doc_id": 203, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31514, 23842, 34003, 4796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "41 Commando or No. 41 (Royal Marine) Commando was a unit of the Royal Marines trained as Commandos during the Second World War. They were part of the all Royal Marine 4th Special Service Brigade that took part in the Normandy landings in June 1944 and later that served in World War II, the Korean War, and in Northern Ireland. They were disbanded in 1981.\nQuestion: Number 41 killed many people True, False, or Neither? Neither\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups.\nQuestion: Mohamed Izzadeen Mohamed Naufer has many fanatic fans. True, False, or Neither? Neither\n###\nBrendan Francis Aidan Behan (christened Francis Behan) ( ; Irish: \"Breand\u00e1n \u00d3 Beach\u00e1in\" ; 9 February 1923 \u2013 20 March 1964) was an Irish Republican, poet, short story writer, novelist, and playwright who wrote in both English and Irish. He is widely regarded as one of the greatest Irish writers and poets of all time.\nQuestion: Brendan Francis Aidan Behan wrote in french True, False, or Neither? False\n###\nB\u00ebor the Old is a fictional character in J.R.R. Tolkien's Middle-earth legendarium. He appears in \"The Silmarillion\" as the leader of the First House of the Edain in the First Age, which was called the \"Folk of B\u00ebor\" after him. He was the father of Baran and Belen and ancestor of Beren Camlost.\nQuestion: B\u00ebor the Old is a real person. True, False, or Neither? False\n###\nThe 2015 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the first edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Happy Valley, Australia between 3\u201311 January 2015.\nQuestion: The 2015 City of Onkaparinga Challenger took place on courts made from concrete. True, False, or Neither?", "doc_id": 38, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38413, 14819, 42719, 4676], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Real Madrid Club de F\u00fatbol C, commonly known as Real Madrid C, was a Spanish association football team that played in the Tercera Divisi\u00f3n \u2013 Group 7. 
It was Real Madrid's second reserve team. They played their home games at La Ciudad del Real Madrid in Valdebebas outside the city of Madrid. At the end of the 2014\u201315 Tercera Division, Real Madrid C was disbanded.\nQuestion: Real Madrid Club de F\u00fatbol C was disbanded due to poor attendance and financial support True, False, or Neither? Neither\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area.\nQuestion: The Pikes Peak Center for the Performing Arts is a concert auditorium in Colorado Springs, Colorado. True, False, or Neither? True\n###\nRoy Denzil Hibbert (born December 11, 1986) is a Jamaican-American professional basketball player who last played for the Denver Nuggets of the National Basketball Association (NBA). He is a two-time NBA All-Star, and earned NBA All-Defensive Second Team honors in 2014.\nQuestion: Hibbert no longer plays in the NBA. True, False, or Neither? True\n###\nLemoyne is an unincorporated community and census-designated place in northern Keith County, Nebraska, United States. It lies along Nebraska Highway 92 on the northern shore of Lake C.W. McConaughy, north of the city of Ogallala, the county seat of Keith County. Its elevation is 3,333\u00a0feet (1,016\u00a0m). Although Lemoyne is unincorporated, it has a post office, with the ZIP code of 69146.\nQuestion: Lemoyne is near more than one lake. True, False, or Neither? Neither\n###\nThe National Democratic Party (NDP) is a Ghanaian political party, founded in October 2012 as a split from the ruling National Democratic Congress. Its first leader was former NDC politician Nana Konadu Agyeman Rawlings, who is also the wife of former President of Ghana and NDC founder Jerry Rawlings.\nQuestion: The National Democratic Party has five words in it's name. True, False, or Neither?", "doc_id": 566, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18593, 25882, 27052, 27287], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Carlyle Eubank is an American writer and screenwriter. His 2014 film \"The Signal\", starring Laurence Fishburne, Brenton Thwaites, and Olivia Cooke, premiered at the 2014 Sundance Film Festival and was released in US theaters on June 13 by Focus Features.\nQuestion: Carlyle Eubank is very proud of his film the signal True, False, or Neither? Neither\n###\nJoseph Eppele (born August 12, 1987) is a professional Canadian football offensive lineman for the Ottawa Redblacks of the Canadian Football League. He was drafted second overall by the Toronto Argonauts in the 2010 CFL Draft, being the first offensive lineman taken while being ranked fifth overall by the CFL's Amateur Scouting Bureau. He played college football for the Washington State Cougars.\nQuestion: Joseph Eppele protects the quarterback well True, False, or Neither? 
Neither\n###\nFrank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ.\nQuestion: Frank is an African American author and blogger. True, False, or Neither? Neither\n###\n\"Requiem\" is the seventh episode in the fifth season, and the 101st overall episode, of the American crime drama television series \"NCIS\". It first aired on CBS in the United States on November 6, 2007. The episode was written by Shane Brennan and directed by Tony Wharmby.\nQuestion: The episode was directed by Tony Wharmby and written by Shane Brennan who felt that Requiem was a classic True, False, or Neither? Neither\n###\nThe 2015 Abu Dhabi Grand Prix (formally known as the 2015 Formula 1 Etihad Airways Abu Dhabi Grand Prix) was a Formula One motor race held at the Yas Marina Circuit on 29 November 2015. The race was the nineteenth and final round of the 2015 season, and marked the seventh running of the Abu Dhabi Grand Prix as a round of the World Championship since its inception in .\nQuestion: The Abu Dhabi Grand Prix was started in 2008. True, False, or Neither?", "doc_id": 454, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24253, 26329, 2876, 32015], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christmas Eve is the day before Christmas Day, the festival commemorating the birth of Jesus of Nazareth. Christmas Day is observed around the world, and Christmas Eve is widely observed as a full or partial holiday in anticipation of Christmas Day. Together, both days are considered one of the most culturally significant celebrations in Christendom and Western society.\nQuestion: The word Christmas contains the word \"Christ\" within it. True, False, or Neither? True\n###\nHarold E. Ennes was a broadcasting pioneer who authored many textbooks for broadcast and broadcast-related communications training and was a member of the Indianapolis chapter of the Society of Broadcast Engineers. He was a member of SBE's national Certification Committee and made many contributions to the early development of the SBE Certification Program.\nQuestion: Harold E. Ennes was a broadcaster who was not a female True, False, or Neither? True\n###\nPisnia zavzhdy z namy (Ukrainian: \u041f\u0456\u0441\u043d\u044f \u0437\u0430\u0432\u0436\u0434\u0438 \u0437 \u043d\u0430\u043c\u0438 ) is a 1975 Soviet Ukrainian musical film, produced by Viktor Storozhenko starring Sofia Rotaru in the main role, as well as Soviet Ukrainian Smerichka vocal-instrumental band. The movie features songs in Ukrainian, Moldovan and Russian of Sofia Rotaru filmed in the background of Ukrainian Carpathian mountains.\nQuestion: There is at least one song in Russian in the film Pisnia zavzhdy z namy. True, False, or Neither? 
True\n###\nThe Boulton Paul Balliol and Sea Balliol were monoplane military advanced trainer aircraft built for the Royal Air Force (RAF) and the Royal Navy Fleet Air Arm (FAA) by Boulton Paul Aircraft. Developed in the late 1940s the Balliol was designed to replace the North American Harvard trainer and used the Rolls-Royce Merlin engine, with the Sea Balliol a naval version for deck landing training.\nQuestion: The Sea Balliol went out to sea over 100 times. True, False, or Neither? Neither\n###\nMercy Yvonne Debrah-Karikari is a career diplomat and the first female to be secretary to the cabinet of the government of Ghana. She was appointed to occupy this position by the current President Nana Akufo-Addo. Her appointment took effect on the 14th of February 2017.\nQuestion: Mercy Yvonne Debrah-Karikari was the secretary to the cabinet of the government of Ghana in 2019. True, False, or Neither?", "doc_id": 805, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23014, 972, 40154, 28212], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Linyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management.\nQuestion: The school is built mostly with wood True, False, or Neither? Neither\n###\nThe End Tour was the farewell tour of the heavy metal band Black Sabbath, featuring founding members Ozzy Osbourne, Tony Iommi and Geezer Butler. The tour concluded Sabbath's 40+ year career. The final show was February 4, 2017, in their home city of Birmingham, UK.\nQuestion: On Feb. 4th 2017 a rock band of some renown ended a career that has spanned more than three decades but less than five. True, False, or Neither? True\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\".\nQuestion: Jonathan King's 1971 version of \"Hooked on a Feeling\" includes the words 'ooga chaka'. True, False, or Neither? True\n###\nBoleslav William Felix Robert Sienkiewicz ( ; born May 3, 1958), better known as Bill Sienkiewicz, is an Eisner Award-winning American artist who produces comic book art, primarily for Marvel Comics' \"The New Mutants\" and \"\". Sienkiewicz often utilizes oil painting, collage, mimeograph, and other forms generally uncommon in comic books.\nQuestion: Bill Sienkiewicz is a baby boomer. True, False, or Neither? True\n###\nThe 1938 Montana Grizzlies football team represented the University of Montana in the 1938 college football season as a member of the Pacific Coast Conference (PCC). Led by fourth-year head coach Doug Fessenden, they played their home games on campus in Missoula at Dornblaser Field. 
The Grizzlies finished the season with an overall record of 5\u20133\u20131, and were 0\u20131 in PCC play.\nQuestion: The Pacific Coast Conference featured the Montana Razorbacks in 1938. True, False, or Neither?", "doc_id": 871, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36732, 44373, 15534, 39668], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The McLaren MP4/1 (initially known as the MP4) was a Formula One racing car produced by the McLaren team. It was used during the 1981, 1982 and 1983 seasons. It was the first Formula One car to use a carbon fibre composite monocoque, a concept which is now ubiquitous.\nQuestion: The McLaren team were proud of the McLaren MP4/1 as it was quick True, False, or Neither? Neither\n###\nCavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas.\nQuestion: Calvary Sunday takes place on the second Sunday of March. True, False, or Neither? Neither\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles.\nQuestion: The movie was produced BY Fox True, False, or Neither? Neither\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's.\nQuestion: Staughton mall is the biggest in the City. True, False, or Neither? Neither\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia.\nQuestion: The Big 12 Conference has 3 schools from Texas. True, False, or Neither?", "doc_id": 571, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17235, 44612, 19238, 13749], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records.\nQuestion: Weezer is the band's 9th album True, False, or Neither? False\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th.\nQuestion: Vasili Vyacheslavovich Blagov starts with an A. True, False, or Neither? False\n###\nAfter the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal.\nQuestion: The Empire of Japan invaded and occupied the Northeast over 70 Years ago. True, False, or Neither? True\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968.\nQuestion: The 1974 New York Mets had a cheap beer day. True, False, or Neither? Neither\n###\nThe 1997 Indian vice-presidential election was held on 16 August 1997 to elect Vice-President of India. Krishan Kant defeated Surjit Singh Barnala to become 10th Vice-President of India. At the time of the election, VP office was vacant since the incumbent, K. R. Narayanan, had already inaugurated as President following his victory in the presidential election.\nQuestion: The 1997 Indian vice-presidential election was held in the ninth month of the year. True, False, or Neither?", "doc_id": 561, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35683, 14027, 37273, 40619], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Art of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. 
Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley.\nQuestion: Art of Dying is from Europe True, False, or Neither? False\n###\nThe Mission Viejo Vigilantes were a minor league baseball team located in Mission Viejo, California. The team played in the independent Western Baseball League, and was not affiliated with any Major League Baseball team. Their home stadium was Mission Viejo Stadium near Saddleback College.\nQuestion: There are no Major League Baseball teams based in California. True, False, or Neither? Neither\n###\nFar from the Madding Crowd is a 2015 British romantic drama film directed by Thomas Vinterberg and starring Carey Mulligan, Matthias Schoenaerts, Michael Sheen, Tom Sturridge and Juno Temple. It is an adaptation of the 1874 novel of the same name by Thomas Hardy, the fourth time this novel has been filmed.\nQuestion: Far from the Madding Crowd is a comedy True, False, or Neither? False\n###\nThe 2015 Latrobe City Traralgon ATP Challenger was a professional tennis tournament played on outdoor hard court. It was the fourth edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Traralgon, Australia between 26 October \u2013 1 November 2015.\nQuestion: There have been five tournaments in total. True, False, or Neither? Neither\n###\nCraig Lahiff (April 23, 1947 \u2013 2 February 2014) was an Australian film director. He grew up in the Adelaide suburb of Somerton Park and studied science at Adelaide University, then trained as a systems consultant before studying arts in film at Flinders University. He began working in the film industry on crews for movies such as \"Sunday Too Far Away\" and \"The Fourth Wish\".\nQuestion: Craig Lahiff directed several television shows. True, False, or Neither?", "doc_id": 777, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [3324, 21926, 5179, 23385], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amara Karan (born 1984) is a Sri Lankan-English actress who made her film d\u00e9but as the love interest in Wes Anderson's \"The Darjeeling Limited\". The film premi\u00e8red at the 2007 Venice Film Festival. Karan's second film role was as schoolgirl Peaches in the 2007 film \"St Trinian's\".\nQuestion: The film was not popular True, False, or Neither? Neither\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif.\nQuestion: There Is a Woman in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama True, False, or Neither? False\n###\nComet in Moominland (Swedish: \"Kometjakten\" / \"Mumintrollet p\u00e5 kometjakt\" / \"Kometen kommer\") is the second in Tove Jansson's series of Moomin books. 
Published in 1946, it marks the first appearance of several main characters, like Snufkin and the Snork Maiden.\nQuestion: Snufkin did not appear in the book before Comet True, False, or Neither? True\n###\nThe Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site.\nQuestion: The Cincinnati and Whitewater Canal Tunnel has been seen by Bill. True, False, or Neither? Neither\n###\nThomas Carr Frank (born March 21, 1965) is an American political analyst, historian, journalist, and columnist for \"Harper's Magazine\". He wrote \"The Tilting Yard\" column in the \"Wall Street Journal\" from 2008 to 2010, and he co-founded and edited \"The Baffler\". He has written several books, most notably \"What's the Matter with Kansas?\" (2004) and \"Listen, Liberal\" (2016).\nQuestion: Harpers Magazine is the same as Harpers Bazaar. True, False, or Neither?", "doc_id": 996, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15143, 13721, 40004, 34344], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico.\nQuestion: The USFC \"Fish Hawk\" is no longer in operation. True, False, or Neither? True\n###\n\"In Due Time\" is the lead single from Killswitch Engage's sixth studio album, \"Disarm the Descent\". The song is the band's first single to feature vocalist Jesse Leach since 2003's \"The Element of One\". The song charted at no. 23 on the Active rock chart and no. 26 on the Mainstream Rock chart.\nQuestion: \"In Due Time\" features the vocalist Jessica Leach. True, False, or Neither? False\n###\nWarriors of Virtue is a 1997 Chinese-American martial arts fantasy film directed by Ronny Yu and starring Angus Macfadyen, Mario Yedidia, and Marley Shelton. It was released in English, Mandarin and Cantonese-language versions. The creature effects were designed by Academy Award-nominated special effect production house Alterian, Inc.\nQuestion: The film was directed by a man with the last name Yu True, False, or Neither? True\n###\nThe 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The 315th Group controls all operational McDonnell Douglas C-17 Globemaster III flying squadrons of the 315th Airlift Wing. 
It was activated in 1992, when Air Force Reserve Command implemented the Objective Wing organization.\nQuestion: 315th Airlift Wing consists of Air force reserves and active-duty military personal True, False, or Neither? Neither\n###\n\"Toi, la musique et moi\" (English translation: \"You, the Music and I\") was the Monegasque entry in the Eurovision Song Contest 1976, performed in French by French singer Mary Christy. Christy recorded the song in five languages; French, Italian (as \"La musica e noi due\"), Spanish (\"La m\u00fasica, t\u00fa y yo\"), German (\"Die Musik und ich\") and English (\"Thank You for Rushing into My Life\").\nQuestion: \"Toi, la musique et moi\" has been translated into more than three languages True, False, or Neither?", "doc_id": 132, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7801, 2471, 9550, 30902], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Julian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality.\nQuestion: Bob Marley's son, Julian, is also a musician. True, False, or Neither? True\n###\nAmerican football strategy concerns the deployment of offensive, defensive, and special teams players and the execution of plays in American football. In American football, there are a vast array of positions, formations, strategies, plays and types of play calling systems that are utilized. If a strategy is for a particular game, it is known as a \"game plan\".\nQuestion: American football strategy will improve a teams performance. True, False, or Neither? Neither\n###\nGhost Notes is the fifth full-length studio album by American rock band Veruca Salt, released on July 10, 2015, through El Camino Records. Produced by Brad Wood, who also produced the band's debut \"American Thighs\", it is the first to feature the band's original lineup since their second album, \"Eight Arms to Hold You\" (1997).\nQuestion: El Camino was the first to sign Salt True, False, or Neither? Neither\n###\nBaar is a railway station in the Swiss canton of Zug, situated in the municipality of Baar. The station is located on the Z\u00fcrich to Lucerne railway line and is an intermediate stop for InterRegio trains from Z\u00fcrich to Lucerne and on Z\u00fcrich S-Bahn line S9.\nQuestion: Baar is the first station before Lucerne. True, False, or Neither? 
Neither\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire.\nQuestion: Peter John Reynolds sadly passed away on the 33 of september, 2001 True, False, or Neither?", "doc_id": 765, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18753, 30230, 33132, 28917], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990).\nQuestion: Zale Dalen is a Canadian film and television director. He is proud of his film the hounds of Notre Dame True, False, or Neither? Neither\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: There are no plans to expand the The San Nicolao Tunnel True, False, or Neither? Neither\n###\nThe Outsiders are a professional wrestling tag team consisting of Kevin Nash and Scott Hall, best known for their first appearances in World Championship Wrestling (WCW) in 1996. They later teamed also in the World Wrestling Federation (WWF), Total Nonstop Action Wrestling (TNA), and Pro Wrestling ZERO1-MAX.\nQuestion: The Outsiders are both dead True, False, or Neither? Neither\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers.\nQuestion: Mickey Rooney had other children. True, False, or Neither? Neither\n###\nHamsalekha (born Govindaraju Gangaraju on 23 June 1951) is an Indian film composer and a songwriter who works in South Indian cinema, predominantly in the Kannada film industry since the late 1980s. He is also a screenplay writer, dialogue writer, instrumentalist and a conductor. Composed and written for over 300 feature films.\nQuestion: Hamsalekha wrote stories. True, False, or Neither?", "doc_id": 648, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23411, 42749, 6542, 36518], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jeffrey Orlando Hunter (born April 12, 1966) is a former American football defensive lineman. In a career lasting almost a decade, he played five seasons for four different teams in the National Football League, as well as in the Canadian Football League and the World League of American Football. Hunter played college football at Albany State University in Albany, Georgia.\nQuestion: Jeffrey Orlando Hunter played defense. True, False, or Neither? True\n###\nBabar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\".\nQuestion: HBO first played Babar: King of the Elephants in the year 1999 True, False, or Neither? False\n###\n\"Toi, la musique et moi\" (English translation: \"You, the Music and I\") was the Monegasque entry in the Eurovision Song Contest 1976, performed in French by French singer Mary Christy. Christy recorded the song in five languages; French, Italian (as \"La musica e noi due\"), Spanish (\"La m\u00fasica, t\u00fa y yo\"), German (\"Die Musik und ich\") and English (\"Thank You for Rushing into My Life\").\nQuestion: \"Toi, la musique et moi\" has been translated into multiple languages True, False, or Neither? True\n###\nThomas Carr Frank (born March 21, 1965) is an American political analyst, historian, journalist, and columnist for \"Harper's Magazine\". He wrote \"The Tilting Yard\" column in the \"Wall Street Journal\" from 2008 to 2010, and he co-founded and edited \"The Baffler\". He has written several books, most notably \"What's the Matter with Kansas?\" (2004) and \"Listen, Liberal\" (2016).\nQuestion: Carr is not known widely as a car enthusiast. True, False, or Neither? Neither\n###\n\"Beyond This Earthly Realm\" is the eleventh episode of the fourth season of the American animated television series \"Adventure Time\". The episode was written and storyboarded by Ako Castuera and Jesse Moynihan, from a story by Patrick McHale, Kent Osborne, and Pendleton Ward. It originally aired on Cartoon Network on June 11, 2012.\nQuestion: \"Beyond This Earthly Realm\" was written by professional comedians True, False, or Neither?", "doc_id": 852, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42118, 11876, 32111, 26654], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. 
Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city.\nQuestion: Bear River City is a city because they signed a petition to be a city. True, False, or Neither? Neither\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award.\nQuestion: Rachel Brosnahan was born in the 50s. True, False, or Neither? False\n###\nPrincess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement.\nQuestion: Princess Caroline was 1 day old when she died True, False, or Neither? False\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo.\nQuestion: Celly Cel is a very silly rapper True, False, or Neither? Neither\n###\nChief Crazy Horse is a 1955 American CinemaScope Technicolor Western film directed by George Sherman starring Victor Mature, Suzan Ball and John Lund. The film is a fictionalized biography of the Lakota Sioux Chief Crazy Horse. It was also known as \"Valley of Fury\".\nQuestion: Chief Crazy Horse is not a south american film True, False, or Neither?", "doc_id": 52, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27966, 337, 34227, 16509], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949.\nQuestion: CUHK is a co-ed institution. True, False, or Neither? Neither\n###\nThe Cars are an American rock band that emerged from the new wave scene in the late 1970s. The band originated in Boston, Massachusetts, in 1976, with singer, rhythm guitarist and songwriter Ric Ocasek, singer and bassist Benjamin Orr, lead guitarist Elliot Easton, keyboardist Greg Hawkes and drummer David Robinson.\nQuestion: The Cars originated in MA. True, False, or Neither? True\n###\n\"The Bear and the Maiden Fair\" is the seventh episode of the third season of HBO's fantasy television series \"Game of Thrones\", and the 27th episode of the series overall. The episode was written by George R. R. 
Martin, the author of the \"A Song of Ice and Fire\" novels on which the series is based, and was directed by Michelle MacLaren, her directorial debut for the series.\nQuestion: The Bear and the Maiden Fair is about Game of Thrones. True, False, or Neither? True\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents.\nQuestion: There is 147 people in Kapp Heights. True, False, or Neither? False\n###\nNuestra Belleza Nuevo Le\u00f3n 2007, was held at Las Lomas Eventos in Monterrey, Nuevo Le\u00f3n on July 25, 2007. At the conclusion of the final night of competition, Anagabriela Espinoza of San Pedro Garza Garc\u00eda was crowned the winner. Espinoza was crowned by outgoing Nuestra Belleza Nuevo Le\u00f3n titleholder, Mariana Lombard. Eight contestants competed for the state title.\nQuestion: Nuestra Belleza Nuevo Le\u00f3n 2007 was a biking event. True, False, or Neither?", "doc_id": 949, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44300, 14771, 17117, 13594], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Middlewich Paddies are an Irish folk band formed in 1979 in the town of Middlewich in Cheshire. Although not widely known outside of folk music circles, two members of the band were instrumental in setting up the Middlewich folk and boat festival which has now become a recognised festival on the folk circuit.\nQuestion: Two members of \"The Middlewich Paddies\" helped set up the Middlewich folk and boat festival in the 80's. True, False, or Neither? Neither\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation.\nQuestion: Paul Hausser was cruel even towards other Germans True, False, or Neither? Neither\n###\nGood is a 2008 drama film based on the stage play of the same name by C. P. Taylor. It stars Viggo Mortensen, Jason Isaacs, and Jodie Whittaker, and was directed by Vicente Amorim. The film premiered at the Toronto International Film Festival on 8 September 2008.\nQuestion: Good's script was written by Vicente Amorim. True, False, or Neither? Neither\n###\nMichael Hunter, Jr. (born July 10, 1988) is an American professional boxer who challenged for the WBO junior heavyweight title in 2017. As an amateur he won the National Championships as a super heavyweight in 2007 and 2009, and qualified for the 2012 Olympics in the heavyweight division. He is the son of former professional boxer Mike \"the Bounty\" Hunter.\nQuestion: Michael Hunter is an amateur boxer. True, False, or Neither? 
False\n###\nShadowgun Legends is an upcoming first-person shooter video game developed and published by Madfinger Games for Android and iOS devices. It is the 3rd primary installment of the Shadowgun series, a sequel to the original Shadowgun and Shadowgun Deadzone, both being multiple award-winning games from 2011 and 2012 respectively.\nQuestion: Shadowgun Deadzone was released in 2011 True, False, or Neither?", "doc_id": 761, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [4624, 25401, 28894, 32985], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"A Leela of Her Own\" is the sixteenth episode in the third season of the animated series \"Futurama\". The episode is an homage to \"A League of Their Own\". It originally aired on the Fox network in the United States on April 7, 2002. Bob Uecker provided the voice of himself, Tom Kenny provided the voice of Abner Doubledeal, and Hank Aaron guest starred as himself and Hank Aaron XXIV.\nQuestion: A Leela of Her Own is an episode of Futurama True, False, or Neither? True\n###\nMiniszt\u00e1r is a Hungarian pop group formed in 2000 and consisting of Georgina ('Gina') Poly\u00e1kovics, Vivien Gonda and M\u00e1rk\u00f3 ('M\u00e1rk') Tak\u00e1cs. The band has released two albums to date, as well as a video DVD. The group is one of many to cover the popular song Dragostea Din Tei.\nQuestion: Miniszt\u00e1r was formed over 10 years ago True, False, or Neither? True\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers.\nQuestion: Clair Huffaker wrote the book in 1960. True, False, or Neither? Neither\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate.\nQuestion: The company will make a bar with no toffee. True, False, or Neither? Neither\n###\nRemember the Daze is a 2007 drama film released in theaters in April 2008. The film was directed by Jess Manafort. The plot of the movie has been described as \"a glimpse into the teenage wasteland of suburbia 1999 that takes place over 24-hours, and the teenagers who make their way through the last day of high school in the last year of the past millennium.\"\nQuestion: It has been 12 years since the theater release of the movie Remember the Daze. True, False, or Neither?", "doc_id": 716, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34211, 2768, 11848, 30146], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Smithereens was sang by at least 2 people in the acoustic version. True, False, or Neither? True\n###\nErnest R. Kroeger (August 10, 1862 \u2013 April 7, 1934) was an American composer. He is mainly known for the pedagogical works he composed for piano; he also taught music in St. Louis, Missouri. Today his papers are held at the Missouri Historical Society.\nQuestion: Today, Ernest R. Kroeger's works are housed at the Missouri Society for the Prevention of Cruelty to Animals. True, False, or Neither? False\n###\nDavid Armand (born 1980) is an American writer of fiction, non-fiction, and poetry. He has published three novels, \"The Pugilist's Wife\", \"Harlow\", and \"The Gorge\". He has also published a collection of poems, \"The Deep Woods\", and a memoir titled \"My Mother's House\". He is currently Writer-in-Residence at Southeastern Louisiana University.\nQuestion: David Armand is a girl. True, False, or Neither? False\n###\nPhichai Railway Station is a railway station located in Nai Mueang Subdistrict, Phichai District, Uttaradit. It is located 447.553\u00a0km from Bangkok Railway Station and is a class 2 railway station. It is on the Northern Line of the State Railway of Thailand. Phichai Railway Station opened as part of the Northern Line extension from Phitsanulok to Ban Dara Junction in November 1908.\nQuestion: The station is very abandoned True, False, or Neither? Neither\n###\nNannina de' Medici (14 February 1448 \u2013 14 May 1493), born Lucrezia de' Medici, was the second daughter of Piero di Cosimo de' Medici and Lucrezia Tornabuoni. She was thus the elder sister of Lorenzo de' Medici. She married Bernardo Rucellai. Her father's name was Piero, so she is sometimes known as Lucrezia di Piero de' Medici.\nQuestion: Lucrezia de' Medici had multiple siblings other than Lorenzo True, False, or Neither?", "doc_id": 229, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29658, 33577, 40215, 38642], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Analyze This is a 1999 gangster comedy film directed by Harold Ramis, who co-wrote the screenplay with playwright Kenneth Lonergan and Peter Tolan. The film stars Robert De Niro as a mafioso and Billy Crystal as his psychiatrist. A sequel, \"Analyze That\", was released in 2002.\nQuestion: Analyze This has a short ending. True, False, or Neither? Neither\n###\nUpper Grosvenor Street is a historic street in Mayfair, London, United Kingdom. 
It runs from the Grosvenor House Hotel off Park Lane to the Embassy of the United States off Grosvenor Square. The Embassy of Monaco is located at number 7. Odey Asset Management, a hedge fund run by Crispin Odey, is located at number 12.\nQuestion: Upper Grosvenor Street has changed names True, False, or Neither? Neither\n###\nMarvin Ivan \"Buck\" Barrow (March 14, 1903 \u2013 July 29, 1933) was a member of the Barrow Gang. He was the older brother of the gang's leader, Clyde Barrow. He and his wife Blanche were wounded in a gun battle with police four months after they joined up with Bonnie and Clyde. Marvin died of his wounds.\nQuestion: Barrow died in 1933 True, False, or Neither? True\n###\nHistory of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853.\nQuestion: Joseph Smith was not an orphan True, False, or Neither? True\n###\nThe Wolfsonian\u2013Florida International University or The Wolfsonian-FIU, located in the heart of the Art Deco District of Miami Beach, Florida, is a museum, library and research center that uses its collection to illustrate the persuasive power of art and design. For fifteen years, The Wolfsonian has been a division within Florida International University.\nQuestion: For the total amount of years that is equivalent to five multiplied by five, The Wolfsonian has been a division within Florida International University. True, False, or Neither?", "doc_id": 353, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [23241, 14107, 36587, 2009], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jeffrey Orlando Hunter (born April 12, 1966) is a former American football defensive lineman. In a career lasting almost a decade, he played five seasons for four different teams in the National Football League, as well as in the Canadian Football League and the World League of American Football. Hunter played college football at Albany State University in Albany, Georgia.\nQuestion: Hunter also was very good at basketball. True, False, or Neither? Neither\n###\nUna Lillian Paisley (born 18 November 1922 in Kew in Melbourne in Victoria - died 1977 in Kew, Victoria) was an Australian cricket player. She played twelve Test matches for the Australia national women's cricket team. She captained the Australia national women's cricket team in four Test matches against New Zealand and England.\nQuestion: Una Lillian Paisley met Bush. True, False, or Neither? Neither\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain.\nQuestion: Santa Cova Funicular goes down a mountain. 
True, False, or Neither? True\n###\nPacific Novelty was a developer of coin-operated arcade video games. \"Deep Death\" was their first title, which was later licensed by Game Plan and re-released as \"Shark attack\" (1981). \"Thief\", a \"Pac-Man\" styled maze chase, was their greatest success.\nQuestion: \"Thief\", a styled maze chase, was the greatest game released that year. True, False, or Neither? Neither\n###\nMemento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano.\nQuestion: Memento was written and directed by two different individuals True, False, or Neither?", "doc_id": 177, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37475, 15651, 32251, 17868], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Salvatore Mineo, Jr. (January 10, 1939February 12, 1976), was an American film and theatre actor, known for his performance as John \"Plato\" Crawford opposite James Dean in the film \"Rebel Without a Cause\" (1955). He was twice nominated for the Academy Award for Best Supporting Actor, for his roles in \"Rebel Without a Cause\" and \"Exodus\" (1960).\nQuestion: He was nominated for an Emmy. True, False, or Neither? Neither\n###\nSabrina Le Beauf (born March 21, 1958) is an American actress best known for her portrayal of Sondra Huxtable on the NBC situation comedy \"The Cosby Show\". She has voiced the character Norma Bindlebeep on the Nick at Nite animated series \"Fatherhood\", a show based on Bill Cosby's book of the same name.\nQuestion: Sabrina Le Beauf played Sondra for one season. True, False, or Neither? Neither\n###\nSan Francisco Bay Ferry is a passenger ferry service on the San Francisco Bay, administered by the San Francisco Bay Area Water Emergency Transportation Authority (WETA). San Francisco Bay Ferry is not affiliated with Golden Gate Ferry, which provides passenger ferry service to Marin County.\nQuestion: San Francisco Bay Ferry operates train service along the California coast. True, False, or Neither? False\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris.\nQuestion: Since \"Yellow Ledbetter\" is the 113th episode of \"Vampire Diaries\", there must have been 112 episodes that aired before it. True, False, or Neither? True\n###\nFather Xmas is a 2001 short film from director Marie Rose and the American Film Institute's Directing Workshop for Women starring Dakota Fanning as six-year-old Clairee who learns from her older brother (Stephen Fanning) that Santa Claus is not real and that their father is fighting in the Vietnam War.\nQuestion: Father Xmas has an ending. 
True, False, or Neither?", "doc_id": 267, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11714, 40278, 15495, 4715], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Take Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\".\nQuestion: The album Take Two had more than three hits. True, False, or Neither? Neither\n###\nB\u00ebor the Old is a fictional character in J.R.R. Tolkien's Middle-earth legendarium. He appears in \"The Silmarillion\" as the leader of the First House of the Edain in the First Age, which was called the \"Folk of B\u00ebor\" after him. He was the father of Baran and Belen and ancestor of Beren Camlost.\nQuestion: B\u00ebor the Old is a fictional character in J.R.R. was the father of Baran and Belen and Beren Camlost. True, False, or Neither? False\n###\nThis is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists.\nQuestion: Wikipedia is working to complete this list. True, False, or Neither? Neither\n###\nMatsuri Mizuguchi (\u6c34\u53e3 \u307e\u3064\u308a , Mizuguchi Matsuri , born October 28, 1986 in Yamagata Prefecture) is a Japanese voice actress who started her career in 2007. She is affiliated with Yellowtail. This voice actress shares the same exact date of birth and age as another unrelated Japanese voice actress and singer, Aki Toyosaki.\nQuestion: Yellowtail first considered the name Bluetail. True, False, or Neither? Neither\n###\nTamanna (Hindi: \u0924\u092e\u0928\u094d\u0928\u093e , translation: Desire) is a 1997 Indian drama film directed by Mahesh Bhatt. It stars Paresh Rawal, Pooja Bhatt, Sharad Kapoor and Manoj Bajpayee in the lead roles The screenplay was written by Tanuja Chandra. The story was written by Tanuja Chandra and Mahesh Bhatt. It was produced by Pooja Bhatt.\nQuestion: The film was written and produced by three different people. True, False, or Neither?", "doc_id": 102, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5153, 20835, 19740, 44453], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2015 Latrobe City Traralgon ATP Challenger was a professional tennis tournament played on outdoor hard court. It was the fourth edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Traralgon, Australia between 26 October \u2013 1 November 2015.\nQuestion: The fourth edition of the tournament was played on outdoor hard court True, False, or Neither? True\n###\nBlack Wind, White Land is a 1993 documentary film, researched and produced by the founders of the Chernobyl Children's Project International and explores the Chernobyl nuclear disaster of 1986 and its consequences for the handicapped development of the people in Belarus, Russia and Ukraine. The film was directed by Gene Kerrigan and produced by Ali Hewson, the wife of U2's singer Bono.\nQuestion: Bono is married to Hewson. True, False, or Neither? True\n###\nGwinnett County Public Schools is a school district operating in Gwinnett County, Georgia, United States. GCPS is the largest school system in Georgia, with 139 schools and an estimated enrollment of 178,000 students for the 2016-2017 year. GCPS is estimated to be the 14th largest school district in the U.S. The district has its headquarters in an unincorporated area near Suwanee.\nQuestion: Peachtree Corners is within GCPS. True, False, or Neither? True\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6.\nQuestion: The Nydala Abbey was still in operation in 1500. True, False, or Neither? Neither\n###\nSeveral politico-constitutional arrangements use reserved political positions, especially when endeavoring to ensure the rights of minorities or preserving a political balance of power. These arrangements can distort the democratic principle of \"one person - one vote\" in order to address special circumstances.\nQuestion: politico-constitutional arrangements are imaginanry True, False, or Neither?", "doc_id": 117, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9378, 17973, 20811, 17572], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Telephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh.\nQuestion: Telephone Shilpa Sangstha launched the first cell phone made/assembled in Bangladesh, Doel True, False, or Neither? 
Neither\n###\nBruno Mingeon (born September 7, 1967 in Bourg-Saint-Maurice, Savoie) is a French bobsledder who competed from 1988 to 2006. Competing in five Winter Olympics, he won a bronze medal in the four-man event (tied with Great Britain) at Nagano in 1998. He was born in Bourg-Saint-Maurice.\nQuestion: Bruno Mingeon won his first Olympic medal in 1967. True, False, or Neither? False\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards.\nQuestion: The Proteus Design Suite is a proprietary software tool suite is a place you can trust True, False, or Neither? Neither\n###\nWilson Dam is a dam spanning the Tennessee River between Lauderdale County and Colbert County in the U.S. state of Alabama. It impounds Wilson Lake. It is one of nine Tennessee Valley Authority (TVA) dams on the Tennessee River. The dam was declared a National Historic Landmark on November 13, 1966.\nQuestion: Wilson Dam is a popular place to fish True, False, or Neither? Neither\n###\nBlack Dahlia is a 2006 United States production horror film inspired by the mysterious unsolved murder of the\"Black Dahlia\", Hollywood actress Elizabeth Short. Instead of dramatizing the infamous 1947 murder of Short and the ensuing investigation, writer-director Ulli Lommel follows a series of contemporary L.A.-area homicides patterned after the 1947 slaying.\nQuestion: Black Dahlia was meant to scare people True, False, or Neither?", "doc_id": 325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14486, 3159, 7920, 30106], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lament is the seventh studio album by British new wave band Ultravox, released in the UK on 6 April 1984. It was the last album featuring original drummer Warren Cann until the band's reunion album \"Brilliant\" in 2012. The album peaked at #8 on the UK album chart and was certified Gold by the BPI in June 1984 for 100,000 copies sold. It also reached #25 in Germany and #115 in the United States.\nQuestion: Lament was the best selling album by Ultravox in the United States True, False, or Neither? Neither\n###\nLibya TV (also known as Libya Al Ahrar TV) is a Libyan TV channel broadcast by satellite from its headquarters in Doha. The channel was created in 2011 during the Libyan Civil War. Its presents news, opinions, analysis, photo and video reports about Libya in specific and the region in a wider scope. It focuses on Libya\u2019s revolution and future toward building a democratic state.\nQuestion: Libya TV often shows news reports about war. True, False, or Neither? Neither\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. 
The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee.\nQuestion: Destiny was not nominated for an Oscar award. True, False, or Neither? True\n###\nHellgate (originally titled Shadows) is a 2011 American-Thai supernatural thriller directed and written by John Penney, starring William Hurt and Cary Elwes. Elwes plays the sole survivor of a car crash who, upon seeing ghosts, seeks help from a spiritual guru (Hurt).\nQuestion: Hellgate was more popular in Thailand than America True, False, or Neither? Neither\n###\nThe Circuit Gilles Villeneuve (also spelled Circuit Gilles-Villeneuve in French) is a motor racing circuit in Montreal, Quebec, Canada. It is the venue for the FIA Formula One Canadian Grand Prix. It has previously hosted the FIA World Sportscar Championship, the Champ Car World Series, the NASCAR Canadian Tire Series, the NASCAR Xfinity Series and the Grand-Am Rolex Sports Car Series.\nQuestion: The Circuit Gilles Villeneuve once had a massive world championship at it True, False, or Neither?", "doc_id": 260, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44986, 18123, 43957, 44398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Punjab Control of Organised Crime Act, (PCOCA) is law to be enacted by Punjab state in India to combat organised crime. It is in process of approval as the Punjab Cabinet has yet not given its approval on account of few reservations about various clauses of the Act.The Act is designed on the pattern of Maharashtra Control of Organised Crime Act enacted in 1999.\nQuestion: Punjab Control of Organised Crime Act was enacted before the Maharashtra Control of Organised Crime Act True, False, or Neither? False\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\".\nQuestion: Linkin Park are proud of the song for winning a grammy True, False, or Neither? Neither\n###\nChild Whispers (published in 1922) is the first published work of the English children's author Enid Blyton, illustrated by her childhood friend and collaborator Phyllis Chase. It is a collection of 28 poems, and one of Blyton's most popular and best-known poetry books.\nQuestion: Blyton knew Chase for a long time. True, False, or Neither? True\n###\nRishika Singh is an Indian actress who appears in Kannada-language films. She is the daughter of film director Rajendra Singh Babu and granddaughter of Mysore-based film producer Shankar Singh and former Bengali actress Pratima Devi. Her brother Aditya also appears in Kannada films.\nQuestion: Rishika Singh's mother is an actress. True, False, or Neither? 
Neither\n###\nSourceMedia is a mid-sized diversified business-to-business digital media company owned by Observer Capital, which acquired the company from Investcorp in August 2014. Formerly the Thomson Media division of The Thomson Corporation, SourceMedia was spun off and sold by Thomson to Investcorp in 2004 for $350 million.\nQuestion: Investcorp made $350 million on the sale of SourceMedia. True, False, or Neither?", "doc_id": 458, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38471, 21750, 35546, 11727], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Suntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles.\nQuestion: umashree never acted after this movie True, False, or Neither? Neither\n###\nLive in Concert is the second live music video title by singer and actress Cher. Released by HBO in 1999, it contained footage from Cher's Do You Believe? Tour specials filmed at the MGM Grand Garden Arena in Paradise, Nevada in 1999. It featured tracks from the Gypsys, Tramps & Thieves album to the Believe album, alongside various covers. She has 7 costume changes by stylist Bob Mackie.\nQuestion: Cher performs song from other artists. True, False, or Neither? True\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway.\nQuestion: Queen Maud Land in Antarctica was discovered in 1931. True, False, or Neither? Neither\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg.\nQuestion: Duel has a long beginning scene. True, False, or Neither? Neither\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries.\nQuestion: 2007 was not an exciting time for football fans. 
True, False, or Neither?", "doc_id": 658, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26345, 29553, 29884, 13741], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael Cassio, or simply Cassio, is a fictional character in William Shakespeare's \"Othello\". The source of the character is the 1565 tale \"Un Capitano Moro\" by Cinthio; Cassio is unnamed in Cinthio but referred to as \"the squadron leader\". In the play, Cassio is a young and handsome lieutenant under Othello's command who becomes one of Iago's several victims in a plot to ruin Othello.\nQuestion: The basis for Cassio is \"the squadron leader\" in \"Un Capitano Moro.\" True, False, or Neither? True\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery.\nQuestion: The Anchor Bankside is a pub in the England Borough of Southwark True, False, or Neither? False\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate.\nQuestion: The Toffee Crisp bar was manufactored in 1963 True, False, or Neither? True\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock.\nQuestion: Power chords are rarely used in punk rock True, False, or Neither? False\n###\nZafar Mahmud (1923\u20132016) was a Royal Indian Air Force officer during the second world war, originally stationed in Burma and subsequently stationed in Quetta (in present-day Pakistan) from 1945 to 1947 before the partition of British India. He was sent to England a number of times to train with the Royal Air Force just before and after the war.\nQuestion: Zafar Mahmud fought in the Vietnam War. True, False, or Neither?", "doc_id": 671, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20876, 2529, 22212, 27964], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Valley of Fire Road (also called the Valley of Fire Highway) is a road in northeastern Clark County, Nevada serving the Valley of Fire State Park. The roadway was previously designated State Route 40 (SR 40), and the segment within the state park is currently designated a Nevada Scenic Byway.\nQuestion: There are no areas in Nevada that are recognized as state parks. True, False, or Neither? False\n###\nSamson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation.\nQuestion: Samson and Delilah is performed more in the English language than in German. True, False, or Neither? Neither\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria.\nQuestion: The war left many wounds in society. True, False, or Neither? Neither\n###\nThe Underground Man (1997) is a novel by Mick Jackson. Critically acclaimed, it was shortlisted for the Booker Prize for that year. It shows the life of an eccentric and reclusive Victorian Duke, loosely modelled on William Cavendish-Scott-Bentinck, 5th Duke of Portland. His latest scheme involves building a set of tunnels beneath his estate.\nQuestion: The Underground Man was critically acclaimed by the critic, James Smith. True, False, or Neither? Neither\n###\nThe Letter Black, formerly known as Breaking the Silence, is a Christian rock band that was formed in 2006 in Uniontown, Pennsylvania. The band consists of lead vocalist Sarah Anthony; her husband, lead guitarist and vocalist Mark Anthony; and drummer Justin Brown.\nQuestion: Sarah is not a biological woman True, False, or Neither?", "doc_id": 874, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41410, 14565, 45457, 1991], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. 
He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\".\nQuestion: James Wyatt designed the roleplaying game \"Dungeons & Dragons\". True, False, or Neither? True\n###\nThe Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II.\nQuestion: Astra Modelo 400 was the sidearm standard in the army of the Spanish. True, False, or Neither? True\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes.\nQuestion: The show premiered the year after 1999. True, False, or Neither? False\n###\nCourtland Park is a sub-neighbourhood of Carleton Heights in River Ward in the west end of Ottawa, Canada. It is bounded on the north by Baseline Road, on the east by the Rideau River, on the south by Dynes Road and on the west by Fisher Avenue. Prince of Wales Drive runs thorough the neighbourhood.\nQuestion: Courtland Park was once surrounded by highways True, False, or Neither? Neither\n###\nThe Samsung Galaxy Tab 8.9 is an Android-based tablet computer designed and manufactured by Samsung, introduced on 22 March 2011 at CTIA wireless convention in its Samsung Unpacked event in Orlando. It is part of the Samsung Galaxy Tab series, and features an 8.9-inch display and a 1\u00a0GHz dual-core Nvidia Tegra 2 processor.\nQuestion: Samsung Galaxy has about 9 inch display True, False, or Neither?", "doc_id": 456, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31990, 41235, 12260, 16017], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single.\nQuestion: Come Back in One Piece is not on the Romeo Must Die soundtrack. True, False, or Neither? False\n###\nEldrid Nordb\u00f8 (born 12 August 1942) is a Norwegian politician for the Labour Party. She was personal secretary to the Minister of Social Affairs in 1971, state secretary to the prime minister (1986-89), and Minister of Trade and Shipping (1990-91). She is married to economist and politician Bj\u00f8rn Skogstad Aamo.\nQuestion: Eldrid Nordb\u00f8 lives in Norway. True, False, or Neither? 
Neither\n###\nThe Tesla Science Center at Wardenclyffe is a nonprofit organization established to develop a regional science and technology center at the site of Nikola Tesla's former Wardenclyffe laboratory on Long Island, New York. The center raised money through crowdfunding to purchase the property.\nQuestion: Nikola Tesla's former Wardenclyffe laboratory was owned by New York State True, False, or Neither? Neither\n###\nHarbour Place Shopping Centre (Irish: \"An Chuain Pl\u00e1s Ionad Siopad\u00f3ireachta\" ) is a shopping centre located in Mullingar, Ireland. The centre is anchored by Dunnes Stores, and it is overall the largest store in the shopping centre. It is one of the most well-known shopping centres in Mullingar, and one of the busiest in the town.\nQuestion: Harbour Place Shopping Centre is anchored by Macy Stores True, False, or Neither? False\n###\nFasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings.\nQuestion: The recipe says to use canola oil True, False, or Neither?", "doc_id": 441, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [831, 23895, 32357, 17651], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\".\nQuestion: The Last of Us Part II was released on December 2016. True, False, or Neither? False\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state.\nQuestion: Sidalcea oregana grows everywhere. True, False, or Neither? False\n###\nSouthpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company.\nQuestion: When it was released in 2015, the movie Southpaw was expected to be the top grossing movie in the box office. True, False, or Neither? Neither\n###\nOleg Smirnov (born April 8, 1980) is a Russian professional ice hockey right winger currently playing for HC Ryazan in the Russian Major League. 
He played in the Russian Superleague for Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk. He was drafted 144th overall in the 1998 NHL Entry Draft by the Edmonton Oilers.\nQuestion: Oleg Smirnov was born in Moscow on April 8, 1980. True, False, or Neither? Neither\n###\nThe Highway of Hope is a 1917 American Western silent film directed by Howard Estabrook and written by Harvey Gates and Willard Mack. The film stars House Peters, Sr., Kathlyn Williams, Jim Farley and Harry De Vere. The film was released on May 17, 1917, by Paramount Pictures.\nQuestion: House Peters, Sr. won an award for his singing in The Highway of Hope. True, False, or Neither?", "doc_id": 786, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20720, 43325, 41398, 25342], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag.\nQuestion: Christoph has a long last name. True, False, or Neither? True\n###\nSadat is a 1983 American two-part, four-hour television miniseries based on the life and death of the late 3rd President of Egypt, Anwar Sadat starring Louis Gossett Jr. as Sadat and Madolyn Smith as Sadat's wife, Jehan. It was distributed by Columbia Pictures Television through Operation Prime Time. Gossett's performance earned him a nomination for an Emmy Award and a Golden Globe Award.\nQuestion: The miniseries Sadat was filmed after his death. True, False, or Neither? True\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle.\nQuestion: I am the boon brewery True, False, or Neither? False\n###\n\"Drop Girl\" is a song by American rapper Ice Cube, and produced by Redfoo for FooCo LLC.. The song, released on July 22, 2014. Drop Girl is the sixth single from Ice Cube's upcoming studio album \"Everythang's Corrupt\". The song features guest vocals from American singer Redfoo and fellow rapper 2 Chainz. In the chorus, it samples a part of Ice Cube's Today Was A Good Day.\nQuestion: \"Drop Girl\" is a song that was released over 2 decades ago True, False, or Neither? False\n###\nAfter the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal.\nQuestion: The Japanese approved of the Empire of Japan. 
True, False, or Neither?", "doc_id": 358, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [32397, 15715, 4441, 32870], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer.\nQuestion: Maria liked to pen historical text True, False, or Neither? Neither\n###\nPhakisa Freeway is a motor racing circuit located in Odendaalsrus, South Africa. From 1999 to 2004, the venue hosted the South African motorcycle Grand Prix of the MotoGP championship. It has a capacity of 60,000 spectators and opened in 1999. The track has a 4.24\u00a0km road course and a 1.5 mi oval course. The oval track is an exact copy of Las Vegas Motor Speedway from 1997.\nQuestion: Phakisa Freeway was the only circuit between 1999 to 2004 True, False, or Neither? Neither\n###\nAllium campanulatum is a species of wild onion known by the common name dusky onion or Sierra onion. This is a flowering plant native to the western United States from southeastern Washington and northern Oregon to southern California, and western Nevada. The dusky onion grows in foothills and mountains, especially in dry areas, such as chaparral habitats.\nQuestion: Sierra onion has a pleasant odor. True, False, or Neither? Neither\n###\nCruel World is a 2005 American horror comedy film co-produced and directed by Kelsey T. Howard. The film is about a psychotic man who loses a reality game show and subsequently kills the host. He uses the house where the show took place to film his own reality show. In the show, several contestants perform challenges, and the losers are killed rather than being sent home.\nQuestion: Cruel World was directed by Kelsey T. Howard. True, False, or Neither? True\n###\nTurnagain, also called Buru Island, is an island of the \"Western Islands\" region of the Torres Strait Islands archipelago, located in the northern section of Torres Strait, Queensland, Australia. Turnagain is located within the Torres Strait Island Region Local government area.\nQuestion: Turnagain is connected to land True, False, or Neither?", "doc_id": 683, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34472, 31008, 32848, 2704], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ethan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films.\nQuestion: Ethan Suplee had an acting role in American History X. True, False, or Neither? True\n###\nDave Dennis (born 20 January 1986 in Sydney) is a national representative rugby union footballer who plays professionally for the Exeter Chiefs He was educated at Richmond High School in Sydney, when he played in the Australian Schoolboys Rugby team in 2004. His primary position is blindside flanker. He can also play No.8.\nQuestion: Dave Dennis was born in Sydney True, False, or Neither? True\n###\nRAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family.\nQuestion: The station was renamed in the century before the current century True, False, or Neither? True\n###\nSarah Beth Noriega (born April 24, 1976) is a former indoor volleyball player. She played for Loyola Marymount University from 1994 to 1997 and was named the 1997 West Coast Conference Player of the Year. She also played for the United States national team at the 2000 Summer Olympics.\nQuestion: sarah played voleyball since she was a kid True, False, or Neither? Neither\n###\nNate Albert (born 1970) is an American music executive, songwriter, producer and guitar player. He is currently the Executive Vice President of A&R at Capitol Records a division of Universal Music Group. He was formerly Senior Vice President of A&R at Republic Records, where he worked with such artists as The Weeknd, Florence & the Machine, Phantogram and the Lonely Island.\nQuestion: Nate Albert was born within the last 100 years. True, False, or Neither?", "doc_id": 900, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36555, 38879, 36131, 42107], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson.\nQuestion: \nLathan McKay is an American curator, producer, actor, writer, and entrepreneur. 
A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. He also loves cats. True, False, or Neither? Neither\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries.\nQuestion: Agent: \nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is strange looking and mysterious. True, False, or Neither? Neither\n###\nLamarck Island is a rocky island 250 m long, lying 300 m east of Petrel Island and 300 m north-east of Rostand Island in the G\u00e9ologie Archipelago, off the Ad\u00e9lie Coast of Antarctica. It was charted in 1951 by the French Antarctic Expedition and named by them after Jean-Baptiste Lamarck, the French naturalist.\nQuestion: Jean-Baptiste Lamarck was not very proud to have the island named after him. True, False, or Neither? Neither\n###\nIn electromagnetism, charge density is a measure of electric charge is the amount of electric charge per unit length, surface area, or volume, called the linear, surface, or volume charge density, respectively. The respective SI units are C\u22c5m, C\u22c5m or C\u22c5m.\nQuestion: In electromagnetism can be measure in length. True, False, or Neither? True\n###\nRoger Heman (March 28, 1932 \u2013 November 13, 1989) was an American sound engineer. He won an Academy Award for Best Sound and was nominated for another one in the same category. His father was also a sound engineer and also won an Academy Award, for Best Effects, Special Effects for \"Crash Dive\".\nQuestion: Roger Heman won an Academy Award on March 28, 1959. True, False, or Neither?", "doc_id": 345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36990, 23350, 34213, 30104], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The discography of Death, a metal band, consists of seven studio albums and four live albums. Death was an American metal band founded in 1983. The band's founder, Chuck Schuldiner, is considered \"a pioneering force in death metal and grindcore\". The band ceased to exist after Schuldiner died of brain cancer in 2001, though it remains an enduring metal brand.\nQuestion: The discography of Death, a metal band, consists of 11 albums in total to date. True, False, or Neither? False\n###\nThe Timber Mountain Log Ride is a log flume water ride at Knott's Berry Farm in Buena Park, California, United States. The ride is one of the oldest log flumes in the United States and is the most popular ride at Knott's Berry Farm. The ride is one of the few log flumes that is themed in the world.\nQuestion: The ride was built in 1902 True, False, or Neither? Neither\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. 
The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers.\nQuestion: They were a popular quartet True, False, or Neither? Neither\n###\nHellgate (originally titled Shadows) is a 2011 American-Thai supernatural thriller directed and written by John Penney, starring William Hurt and Cary Elwes. Elwes plays the sole survivor of a car crash who, upon seeing ghosts, seeks help from a spiritual guru (Hurt).\nQuestion: Hellgate was released in American before Thailand True, False, or Neither? Neither\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound.\nQuestion: The Bavarian Mountain Hound is typically a cross between the Bavarian Hound and Hanover Hound. True, False, or Neither?", "doc_id": 688, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5174, 10539, 33686, 12149], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wayne Coles-Janess is an Australian producer, writer and director of drama and documentary film and TV programs. Based in Melbourne, Australia, he has produced documentaries about frontier places in the country. He has also made some documentaries in several international locations, including during times of war.\nQuestion: He will no longer produce shows. True, False, or Neither? Neither\n###\nA surf break at Point Leo, on the Mornington Peninsula, one of the closest surf beaches to Melbourne in Victoria, Australia known as First Reef or more colloquially just \"The Reef\". Until the 1970s there was little or no resident surfing population in Point Leo, so the Reef was mainly surfed by the few transient waveriders who were exploring the many breaks to be found in Westernport Bay.\nQuestion: The Reef is a very small compared to other locations. True, False, or Neither? Neither\n###\nWenham Parva is a village and a civil parish in Suffolk, England. It covers the village of Little Wenham (whose ancient name it takes) and the hamlet of Wenham Grange. Located in Babergh district, it had a population of 20 in 2005, making it the joint-least populated parish in Suffolk alongside South Cove, Wangford and Wordwell. At the 2011 Census the population had increased to 185.\nQuestion: In 2011 Wenham Parva still had less than 200 people as a population. True, False, or Neither? True\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). 
Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release.\nQuestion: Sloppy Josh wanted to be a pianist before he joined the band True, False, or Neither? Neither\n###\nThe Cameroon Airlines Corporation, trading as Camair-Co, is an airline from Cameroon, serving as flag carrier of the country, a role which was previously filled by the now-defunct Cameroon Airlines. Camair-Co has its headquarters in the Immeuble La Rotonde in Douala, and operates out of Douala International Airport.\nQuestion: camair-co is a replacement of cameroon airlines True, False, or Neither?", "doc_id": 773, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6458, 6508, 45456, 18527], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg.\nQuestion: Trevor once had a pet swan True, False, or Neither? Neither\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred.\nQuestion: Krishan Kant served as Vice President of India before August 2002. True, False, or Neither? True\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis.\nQuestion: Greene didn't make any films before 2007. True, False, or Neither? False\n###\nFrank Randolph Cady (September 8, 1915\u00a0\u2013 June 8, 2012) was an American actor best known for his recurring and popular role as storekeeper Sam Drucker in three American television series during the 1960s\u00a0\u2013 \"Petticoat Junction\", \"Green Acres\", and \"The Beverly Hillbillies\"\u00a0\u2013 and his earlier role as \"Doc Williams\" on \"The Adventures of Ozzie and Harriet\".\nQuestion: Cady was also in Dallas. True, False, or Neither? Neither\n###\nShoshana Elise Bean (born September 1, 1977) is an American stage actress, singer and songwriter known for her roles in Broadway musicals. She is best known for being the first replacement actress for the role of Elphaba on Broadway in the musical \"Wicked\".\nQuestion: Shoshana Elise Bean was born more than 3500 days ago. 
True, False, or Neither?", "doc_id": 912, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36788, 4641, 6943, 44893], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983.\nQuestion: Robert L. White's magazine started out as a newsletter. True, False, or Neither? True\n###\nVixen! is a 1968 satiric softcore sexploitation film directed by American motion picture director Russ Meyer. It was the first film to be given an X rating for its sex scenes, and was a breakthrough success for Meyer. The film was developed from a script by Meyer and Anthony James Ryan, and starred Erica Gavin.\nQuestion: Vixen! was a sexploitation film made in 1968 True, False, or Neither? True\n###\n\"The Inbetweeners\" is a BAFTA Award-winning British sitcom created by Damon Beesley and Iain Morris, and broadcast on E4. The series follows the lives of four sixth form students \u2013 Will McKenzie (Simon Bird), Simon Cooper (Joe Thomas), Jay Cartwright (James Buckley) and Neil Sutherland (Blake Harrison). The series is narrated by Will, who acts as the programme's lead character.\nQuestion: bafta is an award True, False, or Neither? True\n###\nJoe Fryer is an American journalist and storyteller working for NBC News as a west coast correspondent based at the NBC News West Coast Bureau in Universal City, California. Fryer joined NBC News in 2013 as a part-time correspondent and officially joined NBC News as a full-time correspondent on October 21, 2013.\nQuestion: Joe Fryer joined NBC News 3 years ago. True, False, or Neither? False\n###\n\"Look at Me (When I Rock Wichoo)\" is a song by American indie rock band Black Kids, taken from their debut album \"Partie Traumatic\". It was released in the UK by Almost Gold Recordings on September 8, 2008 and debuted on the Top 200 UK Singles Chart at number 175.\nQuestion: The song was released in America in September 2008 True, False, or Neither?", "doc_id": 1, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24809, 12247, 2624, 44837], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The High Rock Canyon Wilderness is a U S Wilderness Area in Nevada under the Bureau of Land Management. It is located on the southwest side of High Rock Canyon and north of the Little High Rock Canyon Wilderness. 
It does not include the 4x4 trail in High Rock Canyon.\nQuestion: The High Rock Canyon Wilderness is government property. True, False, or Neither? True\n###\nJoel Rueben Madden (born Joel Rueben Combs; March 11, 1979) is the lead vocalist for the American pop punk band Good Charlotte, as well as a record producer, actor, DJ, and UNICEF Goodwill Ambassador. He is also part of the pop rock collaboration The Madden Brothers with his twin brother Benji Madden.\nQuestion: Joel Madden is a musician known for rap music True, False, or Neither? False\n###\nSpy Corps is a spy film for Christian families that was written and directed by J David Baker. It stars Sarah Beth Hill as a fearful high school teenager, and Adam Hale as a secret member of the Reserve Spy Training Corps, a training program for high school students who want to pursue a career as a spy.\nQuestion: Spy Corps was a film about religion and spies True, False, or Neither? True\n###\nJosef Jan\u00ed\u010dek (born 28 December 1947 in Prague, Czechoslovakia, now Czech Republic) is a Czech rock keyboardist, singer, accordion and guitar player. He was a former guitarist of The Primitives Group; from 1969 he played with The Plastic People of the Universe. He was also a member of Milan Hlavsa's band called \"P\u016flnoc\". Since 1990, he is a member of The Velvet Underground Revival Band.\nQuestion: Josef Jan\u00ed\u010dek has an U. True, False, or Neither? False\n###\nHerv\u00e9 Le Tellier (born 21 April 1957) is a French writer and linguist, and a member of the international literary group Oulipo (Ouvroir de Litt\u00e9rature Potentielle, which translates roughly as \"workshop of potential literature\"). Other notable members have included Raymond Queneau, Georges Perec, Italo Calvino, Jacques Roubaud, Jean Lescure and Harry Mathews.\nQuestion: George Washington was a member of Oulipo. True, False, or Neither?", "doc_id": 419, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33381, 35993, 40278, 12471], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Haliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015.\nQuestion: Haliru Dantoro Kitoro III is a strange name. True, False, or Neither? Neither\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries.\nQuestion: The 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Sterling. True, False, or Neither? 
False\n###\nB\u00ebor the Old is a fictional character in J.R.R. Tolkien's Middle-earth legendarium. He appears in \"The Silmarillion\" as the leader of the First House of the Edain in the First Age, which was called the \"Folk of B\u00ebor\" after him. He was the father of Baran and Belen and ancestor of Beren Camlost.\nQuestion: B\u00ebor the Old is a fictional character in J.R.R. was the father of Baran and Belen and Beren Camlost. True, False, or Neither? False\n###\nEngine is the second album by American Music Club. It was jointly released by Frontier and Grifter in the US and by Zippo in the UK and Europe in 1987. The 1998 Warner Bros. Records reissue added three additional tracks from the same period. The artwork for the Zippo UK release features an incorrect track listing, putting the songs in the wrong order.\nQuestion: American Music Club consisted of five members True, False, or Neither? Neither\n###\nSeverin Bijeli\u0107 (10 February 1921 \u2013 28 July 1972) was a Serbian actor. He appeared in 77 films and television shows between 1949 and 1972. He starred in the 1967 film \"The Rats Woke Up\", which won the Silver Bear for Best Director at the 17th Berlin International Film Festival.\nQuestion: Severin Bijeli\u0107 is his stage name. True, False, or Neither?", "doc_id": 645, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34274, 34944, 19276, 39355], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Matthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season.\nQuestion: Matthew Mansfield was born within the last 9876 days. True, False, or Neither? False\n###\nCentral Mountain Air Ltd. is a Canadian regional airline based in Smithers, British Columbia. It operates scheduled and charter services and transborder services. Its main base is Smithers Airport, with other bases at Calgary International Airport, Vancouver International Airport and Prince George Airport.\nQuestion: Smithers is the capital of British Columbia. True, False, or Neither? Neither\n###\nI Am That Change is a 2014, Telugu short film directed by Sukumar and produced by actor Allu Arjun on Geetha Arts. Apart from Allu Arjun, the short film features an ensemble cast of Tanisshq Reddy, Vikram Chaitanya, Surya Ashrith, Trisha, Sri Varshini, Bharath Reddy and Sathish. Sai Karthik is the music director and Amol Rathod is the cinematographer while Prawin Pudi is the editor.\nQuestion: Prawin Pudi knows what Allu Arjun looks like True, False, or Neither? True\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. 
It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017.\nQuestion: In 2017, the final tournament in the 2017 Overwatch World Cup will be held during the summer. True, False, or Neither? False\n###\nGloria Stavers (October 3, 1927 \u2013 April 1, 1983) was the editor in chief of \"16 Magazine\". Her personality gave this teen celebrity magazine its stamp for many years. Stavers is credited with being one of the first women rock and roll journalists, but male editors, detractors and those who scoffed at teen or celebrity magazines sometimes called her \"Mother Superior of the Inferior\".\nQuestion: If one adds the number \"1\" to the number \"14\", one will arrive at the same number that's in the title of the magazine the human subject of this context was editor in chief of. True, False, or Neither?", "doc_id": 307, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41606, 21249, 1484, 13739], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: Rudbeckia hirta is a weed. True, False, or Neither? False\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: St Clement's has a population that has shrunk each year for the past 5 True, False, or Neither? Neither\n###\nAnne Frank: The Diary of a Young Girl is an original radio play by author Meyer Levin (1905\u20131981). It was adapted from Levin\u2019s original stage dramatization of the same name, adapted from \"The Diary of a Young Girl\", Anne Frank's diary. It aired on CBS on September 18, 1952, the eve of Rosh Hashanah, to critical acclaim, and again in November 1952.\nQuestion: People have been watching Anne Frank's movie since they were children. True, False, or Neither? Neither\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open.\nQuestion: Tennis is not even a sport that Chris McKendry likes. True, False, or Neither? Neither\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. 
It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862.\nQuestion: Ashcroft is orange. True, False, or Neither?", "doc_id": 734, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13410, 36766, 19469, 9018], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marco Masini (born September 18, 1964 in Florence), is an Italian singer-songwriter and musician. . One of his greatest virtues is his voice due to his vocal range, which reaches difficult musical notes, according to experts . . Accompanied by guitarist Riccardo Cherubini, .\nQuestion: Marco Masini lives in Florence True, False, or Neither? Neither\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015.\nQuestion: 7 different people have served as governor of Abia State since the state was created on August 27, 1991. True, False, or Neither? False\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp.\nQuestion: Sidney is a fishing destination. True, False, or Neither? Neither\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\".\nQuestion: Stillwater Cove Regional Park is situated by water. True, False, or Neither? True\n###\nThe Big Cube is a 1969 American thriller film directed by Tito Davison and starring Lana Turner, Karin Mossberg, George Chakiris, Daniel O'Herlihy and Richard Egan; it was one of Lana Turner's last movies. It is notable for its aggressive portrayal of LSD use and the 1960s youth counterculture as vicious evils.\nQuestion: The Big Cube is a film from the southern hemisphere True, False, or Neither?", "doc_id": 94, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13899, 29032, 1454, 32729], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o was born in France. True, False, or Neither? True\n###\nRBG Resources was a British public-limited firm based in London that was allegedly involved in a serious fraud worth close to \u00a3300 million (US$600 million). RBG Resources made $1.1 billion in sales in 2000. It was an affiliate of the United States based Allied Deals Inc., which was also involved in the fraud, and resulted in 14 people convicted or pleading guilty to related crimes.\nQuestion: RBG Resources is not an affiliate of a US based firm. True, False, or Neither? False\n###\nBrew Masters is a television series that was run weekly on Discovery Channel starting on Sunday, November 21, 2010. The show focused on Sam Calagione, the founder and head of Dogfish Head Brewery in Milton, Delaware, and his staff as they searched the world for new, ancient, and imaginative inspirations for beers.\nQuestion: Brew Masters was aired once a month. True, False, or Neither? False\n###\nUSS \"Chicago\" (CA-136) was a \"Baltimore\"-class heavy cruiser laid down on 28 July 1943 at Philadelphia, Pennsylvania, US, by the Philadelphia Navy Yard. Launched on 20 August 1944, she was sponsored by Mrs. Edward J. Kelly, wife of the Mayor of Chicago, Illinois, and commissioned at the Philadelphia Navy Yard on 10 January 1945, Captain Richard R. Hartung, USN, in command.\nQuestion: The wife of the Mayor of Chicago sponsored a heavy cruiser in 1943 True, False, or Neither? True\n###\nRobert Newton \"Bob\" Ford (January 31, 1862 \u2013 June 8, 1892) was an American outlaw best known for killing his gang leader Jesse James in April 1882, to collect a reward. For about a year, Ford and his older brother Charles performed paid re-enactments of the killing at publicity events. Later he drifted around the West, operating saloons and dance halls.\nQuestion: Ford was only twenty when he killed James. True, False, or Neither?", "doc_id": 76, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9203, 31632, 28836, 3656], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jon L. Luther is an American foodservice industry executive. He was the chairman and chief executive officer of Dunkin' Brands. Luther is the Chairman of the Board of the Culinary Institute of America and Arby's Restaurant Group, and a director at Six Flags Entertainment Corporation, Wingstop Restaurants, and Tempur Sealy International.\nQuestion: Jon L. 
Luther's success in his industry can only be attributed to his hard work. True, False, or Neither? Neither\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter.\nQuestion: Colin Francis Weeber Isaacs was almost kicked out of the New Democratic party. True, False, or Neither? Neither\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015.\nQuestion: sebo walker lived five years in los angeles True, False, or Neither? True\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration.\nQuestion: The High Bridge Branch never was near a body of water. True, False, or Neither? False\n###\nMichael Blodgett (September 26, 1939 \u2013 November 14, 2007) was an American actor, novelist, and screenwriter. Of his many film and television appearances he is best known for his performance as gigolo Lance Rocke in Russ Meyer's 1970 cult classic \"Beyond the Valley of the Dolls\". He retired from acting in the late 1970s and began a writing career.\nQuestion: Blodgett acted and wrote at the same time True, False, or Neither?", "doc_id": 313, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14444, 559, 38676, 22091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr.\nQuestion: Director Gabriele Muccino is also a producer. True, False, or Neither? Neither\n###\nArthur William Feuerstein (born December 20, 1935) is an American chess player and winner of the first U.S. Armed Forces Chess Championship. According to the U.S. Chess Federation, Feuerstein is the shared 53rd ranked chess player over 65, regardless of country, residence or federation.\nQuestion: Arthur William Feuerstein played chess for many years True, False, or Neither? False\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. 
She has appeared in many films.\nQuestion: Ileana Carusio belongs to the Generation X. True, False, or Neither? Neither\n###\n\"Boat on the River\" is a 1979 song by Styx, from their album \"Cornerstone\". It was released as a single in 1980, but did not chart in the band's native United States. However, it was popular in several German-speaking countries, becoming a top-five hit on the German, Austrian and Swiss charts (reaching number one on the latter.)\nQuestion: Styx released a single in 1980 that was popular in Europe True, False, or Neither? True\n###\nThe Basketbowl was a college basketball game between Michigan State University and the University of Kentucky held on December 13, 2003 at Ford Field, a domed American football stadium in Detroit, Michigan. Kentucky won the game 79\u201374, never trailing throughout the contest.\nQuestion: Kentucky football defeated Michigan State University in 2003 True, False, or Neither?", "doc_id": 304, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39188, 44782, 39382, 32213], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Homebrew is a free and open-source software package management system that simplifies the installation of software on Apple's macOS operating system. Originally written by Max Howell, the package manager has gained popularity in the Ruby on Rails community and earned praise for its extensibility. Homebrew has been recommended for its ease of use as well as its integration into the command line.\nQuestion: Homebrew is better than other management systems. True, False, or Neither? Neither\n###\nRichard Noel Marshall Armitage (12 August 1928\u00a0\u2013 17 November 1986) was a talent agent, active in England in the 1950s\u20131980s. Among his clients were Rowan Atkinson, John Cleese, David Frost and Stephen Fry. Producer John Lloyd described him as \"the most powerful agent in the country at that time [the late 1970s]\".\nQuestion: Marshall passed away in the last decade the century. True, False, or Neither? False\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears.\nQuestion: It is hot outside. True, False, or Neither? Neither\n###\nVP-HL-1 was a Heavy Patrol Squadron (Landplane) of the U.S. Navy. The squadron was established as Bombing Squadron 116 (VB-116) on 1 December 1943, redesignated Patrol Bombing Squadron 116 (VPB-116) on 1 October 1944, redesignated Patrol Squadron 116 (VP-116) on 15 May 1946, redesignated Heavy Patrol Squadron (Landplane) 1 (VP-HL-1) on 15 November 1946 and disestablished on 22 May 1947.\nQuestion: The US Navy oversaw three redesignations of the initially branded Bombing Squadron. True, False, or Neither? 
True\n###\nChristopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records.\nQuestion: Christopher Tafoya is currently 43 years old. True, False, or Neither?", "doc_id": 435, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10666, 10016, 28892, 23280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vladislav Adolfovitch Rusanov (Russian: \u0412\u043b\u0430\u0434\u0438\u0441\u043b\u0430\u0432 \u0410\u0434\u043e\u043b\u044c\u0444\u043e\u0432\u0438\u0447 \u0420\u0443\u0441\u0430\u043d\u043e\u0432 ) is a fantasy writer, candidate of technical sciences (1999). Writes in Russian language. Also is known for translations of fantasy and romantic poetry into Russian. Formerly a Ukrainian citizen he now identifies with the Donetsk People's Republic.\nQuestion: Rusanov's full nameis Vladislav Adolfovitch Rusanov. True, False, or Neither? True\n###\nToolbox Murders is a 2004 horror film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch. It is a remake of the 1978 film of the same name and was produced by the same people behind the original. The film centralizes on the occupants of an apartment who are stalked and murdered by a masked killer.\nQuestion: Tooldot Murders is a 2004 horror film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch. True, False, or Neither? False\n###\nExergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid.\nQuestion: Exergonix Inc has been around for 100 years. True, False, or Neither? False\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia.\nQuestion: He hated being a teacher True, False, or Neither? Neither\n###\nThe Texas A&M Aggie baseball team represents Texas A&M University in NCAA Division I college baseball. The Aggies have competed in the Southeastern Conference since 2013. The Aggies play home games at Olsen Field at Blue Bell Park. The team is led by head coach Rob Childress.\nQuestion: The Aggies are not the only team in the Southeastern Conference. 
True, False, or Neither?", "doc_id": 875, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28436, 13561, 19967, 35155], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Legendary Majik Mijits is an album that was recorded by Steve Marriott and Ronnie Lane when they reformed under the name of \"Majik Mijits\" in 1981 and gave a one-off concert at the Bridgehouse pub in East London. The lineup included Jim Leverton, Mick Green, Mick Weaver, Dave Hynes and Sam Brown.\nQuestion: Over 10,000 people attended the concert put on by Steve Marriot and Ronnie Lane at the Bridgehouse Pub in East London. True, False, or Neither? Neither\n###\nJango is a crime-comedy series produced in 1961 by Associated Rediffusion for British television. It starred Robert Urquhart in the lead role of Jango Smith, with Moira Redmond as Dee Smith, his wife. The show also featured performances by Peter Sallis and Brian Wilde.\nQuestion: Brian Wilfe was in a crime-comedy series. True, False, or Neither? True\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013.\nQuestion: Brandon Tyler McManus is over 20 years old True, False, or Neither? True\n###\nConvoy PQ-4 was the fifth of the Arctic Convoys of World War II by which the Western Allies supplied material aid to the Soviet Union in its fight with Nazi Germany. The Convoy sailed from Hvalfjord, Iceland on 17 November 1941 and arrived at Archangelsk on 28 November 1941.\nQuestion: Convoy PQ-4 used ships to transport supplies. True, False, or Neither? True\n###\nAfter Dark is a brand of Indian whisky, manufactured by Radico Khaitan. The whisky was test marketed in 2010, and rolled out nationwide in India by September 2011. It is a 100% grain-based whisky manufactured at Radico's Rampur distillery. It is available in 750ml, 375ml and 180ml bottles. The brand's tagline is \"One Life, Many Passions...Why wait\".\nQuestion: The Whisky brand was test marketed in India before 2010 by Radico Kahitan in 330ml bottles. True, False, or Neither?", "doc_id": 337, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18378, 29840, 7716, 18907], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Paul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas.\nQuestion: Paul Albert Raymond Barlatier de Mas was a baron True, False, or Neither? True\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\".\nQuestion: Kidsty Pike has been seen by hank. True, False, or Neither? Neither\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name.\nQuestion: Wendy went on to write other films True, False, or Neither? Neither\n###\nMary Isobel Downer, Lady Downer (13 December 1924 \u2013 14 October 2014) was a prominent South Australian patron, wife of federal MP and high commissioner Sir Alexander \"Alick\" Downer, and mother of Liberal Party leader, Australian Foreign Minister and high commissioner Alexander Downer.\nQuestion: Mary Isobel Downer, Lady Downer never loved her husband federal MP and high commissioner Sir Alexander \"Alick\" Downer True, False, or Neither? Neither\n###\n\"Aven Romale\" (Come in Gypsies), is a song by the Czech group Gipsy.cz that was the Czech entry at the 2009 Eurovision Song Contest held in Moscow, Russia. It scored zero points at the Eurovision Song Contest semi-final, thereby failing to qualify for the final.\nQuestion: Aven Romel was not popular in the eurovision song contest. True, False, or Neither?", "doc_id": 565, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [486, 7265, 3019, 10085], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Love's Labour's Lost is a 2000 adaptation of the comic play of the same name by William Shakespeare, directed by and starring Kenneth Branagh. It was the first feature film to be made of this lesser-known comedy. Branagh's fourth film of a Shakespeare play (he did not direct the 1995 \"Othello\", although he did play Iago), \"Love's Labour's Lost\" was a box-office and critical disappointment.\nQuestion: The feature film Love's Labour's Lost was released more than 5 years ago, but within the last 50 years. True, False, or Neither? 
True\n###\nThe 2015 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the first edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Happy Valley, Australia between 3\u201311 January 2015.\nQuestion: The 2015 City of Onkaparinga ATP Challenger had 5 editions of the tournament. True, False, or Neither? Neither\n###\nFasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings.\nQuestion: Icre de fasole is made with beans, vegetable oil and onions. True, False, or Neither? True\n###\nThe 24th Air Division is an inactive United States Air Force intermediate echelon command and control organization. It was last assigned to First Air Force, Tactical Air Command (ADTAC). It was inactivated on 30 September 1990 at Griffiss Air Force Base, New York.\nQuestion: The 24th Air Division stationed all over the world before becoming inactive True, False, or Neither? Neither\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings.\nQuestion: the Camping World Stadium the Amway Center and the tinker field stadium are very close to each other True, False, or Neither?", "doc_id": 532, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [8755, 8471, 4671, 41056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis.\nQuestion: The Copenhagen Consensus Center was founded before 1990. True, False, or Neither? Neither\n###\nAmanda Knox is a 2016 American documentary film about Amanda Knox, twice convicted and later acquitted of the 2007 murder of Meredith Kercher, directed by Rod Blackhurst and Brian McGinn. It premiered at the Toronto International Film Festival on September 10, 2016 and on Netflix on September 30, 2016.\nQuestion: Amanda Knox and the documentary both share the same name True, False, or Neither? True\n###\nMore of Tom Lehrer was the second studio album recorded by musical satirist Tom Lehrer. The LP contains the same songs (in the same sequence) as the live album \"An Evening Wasted with Tom Lehrer\", which was recorded and released earlier in the same year. 
The album was recorded and mixed in a single three-hour session at the RCA Studios in New York on July 8, 1959.\nQuestion: \"An Evening Wasted with Tom Lehrer\" was recorded and mixed in a single three-hour session at the RCA Studios in New York on July 8, 1979. True, False, or Neither? False\n###\nThe 1977 Los Angeles Dodgers season had Tommy Lasorda replace longtime manager Walter Alston as Manager of the team. The Dodgers won the National League West by 10 games and defeated the Philadelphia Phillies in four games in the NLCS, then lost to the New York Yankees in the World Series.\nQuestion: The New York Yankees beat the Phillies in the 1977 world series. True, False, or Neither? False\n###\nMurray, Utah was declared a city July 3, 1902, instituting a mayor-council form of government. The mayor of Murray was originally partisan, but switched to a non-partisan position. The term of mayor was originally two years, but amended to a four-year term in the 1940s in accordance with state law. The following is a list of Mayors of Murray, Utah.\nQuestion: Murray's mayors have always been lawyers. True, False, or Neither?", "doc_id": 563, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [12836, 31563, 7014, 17180], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock.\nQuestion: Carlos Santana is well known for his thrilling power chords. True, False, or Neither? Neither\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee.\nQuestion: Critics felt the movie deserved the two Israeli Film Academy Awards it did not win. True, False, or Neither? Neither\n###\nIan Drew is Entertainment Director for American celebrity magazine, Us Weekly. He speaks about celebrities, music and fashion on television shows including CNN, Good Morning America, The Early Show, MSNBC, and Fox News. He interviewed Janet Jackson for one of Us Weekly's best-selling issues.\nQuestion: Ian Drew is Entertainment Director for American celebrity magazine. True, False, or Neither? True\n###\nI Am That Change is a 2014, Telugu short film directed by Sukumar and produced by actor Allu Arjun on Geetha Arts. Apart from Allu Arjun, the short film features an ensemble cast of Tanisshq Reddy, Vikram Chaitanya, Surya Ashrith, Trisha, Sri Varshini, Bharath Reddy and Sathish. 
Sai Karthik is the music director and Amol Rathod is the cinematographer while Prawin Pudi is the editor.\nQuestion: Prawin Pudi also edits other short films bye Geetha Arts. True, False, or Neither? Neither\n###\nAna B\u00e1rbara is the debut album by Mexican singer Ana B\u00e1rbara, released in 1994. She was nominated for a Premio Lo Nuestro Award in two Regional Mexican categories, including Best New Artist. She won her first Premio Furia Musical Award for Best New Artist.\nQuestion: Ana B\u00e1rbara has 1 premio furia musical award True, False, or Neither?", "doc_id": 123, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6188, 13047, 36877, 25929], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets.\nQuestion: Rodrequis La'Vant Stephens was paid over 5 million True, False, or Neither? Neither\n###\nSuccess is a 1983 studio album originally released by American singing duo The Weather Girls. The album includes the group's biggest hit, \"It's Raining Men\", which peaked at #1 on the U.S. Dance chart, #46 on the U.S. Pop chart, & #34 on the U.S. R&B chart.\nQuestion: Success was an album made by singing duo The Wonder Girls True, False, or Neither? False\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke.\nQuestion: \"The Candidate\" is the 4th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. True, False, or Neither? False\n###\nWhite Fang 2: Myth of the White Wolf is a 1994 American Northern adventure film directed by Ken Olin. A sequel to the 1991 \"White Fang\", it stars Scott Bairstow, Alfred Molina, and Geoffrey Lewis. Filming took place in Aspen, Colorado and Vancouver, British Columbia. Walt Disney Home Video released this movie on VHS October 19, 1994.\nQuestion: White Fang 2: Myth of the White Wolf is a exciting film True, False, or Neither? Neither\n###\n\"Duffle Bag Boy\" is a song by American hip hop duo Playaz Circle, released as the debut lead single from their debut album, \"Supply & Demand\" (2007). The song features a guest appearance from fellow American rapper Lil Wayne and was produced by M16 and Liam Kantwill. The song peaked in the Top 40 of the U.S. 
\"Billboard\" Hot 100, reaching number 15.\nQuestion: Duffle Bag Boy had their first Top 10 Billboard song in 2010 True, False, or Neither?", "doc_id": 950, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [11363, 34685, 17205, 416], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sound of Waves (\u6f6e\u9a12 , Shiosai ) is a 1954 novel by the Japanese author Yukio Mishima. It is a coming-of-age story of the protagonist Shinji and his romance with Hatsue, the beautiful daughter of the wealthy ship owner Terukichi. For this book Mishima was awarded the Shincho Prize from Shinchosha Publishing in 1954. It has been adapted for film five times.\nQuestion: \"The Sound of Waves\" received an award the same year it was published. True, False, or Neither? True\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit.\nQuestion: There is a national park that is close to Bridge Mountain and managed by the United States. True, False, or Neither? Neither\n###\nSt. Mark's Coptic Orthodox Cathedral is a Coptic church located in the Abbassia District in Cairo, Egypt. The cathedral is the Seat of the Coptic Orthodox Pope. It was built during the time when Pope Cyril VI of Alexandria was Pope of the Coptic Orthodox Church, and was inaugurated by him in 1969.\nQuestion: St. Mark's Coptic Orthodox Cathedral was planned by Pope Cyril VI of Alexandria. True, False, or Neither? Neither\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart.\nQuestion: Migos has had at least one song in the Billboard top 100 True, False, or Neither? True\n###\nHe Was Cool (; lit. \"That Guy was Cool\") is a 2004 South Korean film based on the same-titled 2001 Internet novel written by Guiyeoni. The film was released in South Korean cinemas on July 23, 2004 and was the 35th most attended film of the year with 800,000 admissions.\nQuestion: He was cool was released in North Korea. True, False, or Neither?", "doc_id": 385, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13379, 19697, 21056, 26088], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Big Ballet was a British documentary television programme produced by Rare Day and broadcast on Channel 4. The three-episode series was first broadcast on 6 February 2014. It followed Wayne Sleep and prima ballerina Monica Loughman as they worked with a troupe of amateur dancers to realise their dream of dancing Swan Lake.\nQuestion: Big Ballet was aired on Channel 4. True, False, or Neither? True\n###\nTakeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and EMI Music Publishing. It was founded by English rapper Kwasi Danquah III (commonly known as Tinchy Stryder) and EMI Music Publishing\u2019s UK president and EMI European creative president, Guy Moot, as a publishing arm solely for Danquah's music in December 2008.\nQuestion: They own the publishing rights to Stryder's music in the USA. True, False, or Neither? Neither\n###\nLimnocharis flava (commonly known as yellow velvetleaf, sawah flower rush, sawah lettuce) is a species of aquatic flowering plant which is native to Mexico, Central America, South America, Cuba, Haiti and the Dominican Republic but widely naturalized in southern and southeastern Asia: India, Sri Lanka, Cambodia, Burma, Thailand, Vietnam, Indonesia, Malaysia and southern China (Guangdong, Yunnan).\nQuestion: Limnocharis grows on land. True, False, or Neither? False\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event.\nQuestion: The 1997 Porsche Tennis Grand Prix took place in 1999 True, False, or Neither? False\n###\nThe 2016\u201317 Danish Cup was the 63rd season of the Danish Cup competition. F.C. Copenhagen won the tournament, earning qualification into the second qualifying round of the 2017\u201318 UEFA Europa League. However, as F.C. Copenhagen also won the 2016\u201317 Danish Superliga, Br\u00f8ndby IF, the cup runners-up, is allotted that position in the 2017\u201318 UEFA Europa League.\nQuestion: F.C. Copenhagen won in 2017. True, False, or Neither?", "doc_id": 896, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [1786, 25805, 42806, 23165], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Emmanuel Fr\u00e9chette is a Canadian film production designer. 
He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\".\nQuestion: Emmanuel Frechette won the Canadian Screen Award for Best Art Direction or Production Design for War Witch (Rebelle) in 2017. True, False, or Neither? False\n###\nMark Christopher Randall (born September 30, 1967) is an American former professional basketball player who played in four National Basketball Association (NBA) seasons for the Chicago Bulls, Minnesota Timberwolves, Detroit Pistons, and Denver Nuggets. Randall was selected by the Bulls in the first round (26th pick overall) of the 1991 NBA Draft and averaged 2.6 points per game for his career.\nQuestion: Mark Christopher Randall played basketball in high school. True, False, or Neither? Neither\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams.\nQuestion: The Gaming Control Act ensured honesty True, False, or Neither? True\n###\nIlse von Glatz (August 21, 1958 \u2013 May 2, 2014) was a Canadian actress who played an Advocate in the 1988 science fiction TV series \"War of the Worlds\". She also worked in \"The Mind of Simon Foster\" (episode of \"the 1985 version of The Twilight Zone\"). She also appeared in at least one episode of \"\" in 1989.\nQuestion: The show was science fiction True, False, or Neither? True\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks.\nQuestion: Jack Bender had Evangeline Lilly act according to his interpretation of the script of \"Lost\" True, False, or Neither?", "doc_id": 796, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40133, 2696, 16931, 31157], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\".\nQuestion: Hooked on a Feeling is a song by Swedish Rock. True, False, or Neither? True\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician.\nQuestion: Thalia has at least four albums. True, False, or Neither? 
True\n###\nCarmen Lebbos (Arabic: \u0643\u0627\u0631\u0645\u0646 \u0644\u0628\u0651\u0633\u200e \u200e ; born 1963) is a Lebanese actress who has been working in film, television and the theater since 1981. She has been in several television series and movies including Ziad Doueiri\u2019s \"West Beyrouth\" and Josef Fares\u2019s \"Zozo\".\nQuestion: Carmen Lebbos has only been in one television series. True, False, or Neither? False\n###\nDiaspora studies is an academic field established in the late 20th century to study dispersed ethnic populations, which are often termed diaspora peoples. The usage of the term diaspora carries the connotation of forced resettlement, due to expulsion, coercion, slavery, racism, or war, especially nationalist conflicts.\nQuestion: Diaspora studies is usually taken by wealthy college students True, False, or Neither? Neither\n###\nSteve Koren is an Emmy Award winning writer/producer and screenwriter. Most notably he\u2019s written for \"Saturday Night Live\", \"Seinfeld\", and \"Veep\". He also wrote or co-wrote the movies \"Bruce Almighty\", \"Click\", \"A Night at the Roxbury\" and \"Superstar\".\nQuestion: The co-writer for Superstar also wrote for Seinfeld. True, False, or Neither?", "doc_id": 640, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22036, 27488, 33394, 23875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cari Elizabeth Roccaro (born July 18, 1994) is an American soccer defender from East Islip, New York. She currently plays for the United States under-20 women's national soccer team and helped the team win the 2012 FIFA Under-20 Women's World Cup held in Tokyo, Japan. She previously played for the New York Fury in the WPSL Elite.\nQuestion: Cari Elizabeth Roccaro is a man. True, False, or Neither? False\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration.\nQuestion: New Day was written by an African author. True, False, or Neither? False\n###\nHaliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015.\nQuestion: Niger State is nice. True, False, or Neither? Neither\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". 
a romantically inclined darkwave album that immediately found acceptance within the genre.\nQuestion: Weltenbrand was formed less than 5000 days ago. True, False, or Neither? False\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer.\nQuestion: Gabriel Julio Fern\u00e1ndez Capello was born in a location south of Dallas, TX. True, False, or Neither?", "doc_id": 976, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22509, 12023, 35265, 35463], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Conoclinium coelestinum, the blue mistflower, is a North American species of herbaceous perennial flowering plant in the sunflower family. It was formerly classified in the genus \"Eupatorium\", but phylogenetic analyses in the late 20th century research indicated that that genus should be split, and the species was reclassified in \"Conoclinium\".\nQuestion: The blue mistflower is from Texas. True, False, or Neither? Neither\n###\nTobias Svantesson (born April 1, 1963, in Malmo, Sweden), is a former professional tennis player from Sweden. He enjoyed most of his tennis success while playing doubles. During his career he won 2 doubles titles. He achieved a career-high doubles ranking of World No. 65 in 1991. His career high world ranking in singles was no 89.\nQuestion: Tobias Svantesson met Trump. True, False, or Neither? Neither\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter.\nQuestion: Colin Francis Weeber Isaacs represented the riding of Wentworth starting from 1979 and then ending on 1981 True, False, or Neither? True\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines is a sports news organization True, False, or Neither? False\n###\nMetal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed.\nQuestion: Metal Gear Solid was the first in the Metal Gear series which contains ten games. 
True, False, or Neither?", "doc_id": 784, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [17859, 24656, 37819, 8161], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler.\nQuestion: Dan Deacon was born after 1985 True, False, or Neither? False\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif.\nQuestion: Both of Barakat's parents are Lebanese. True, False, or Neither? Neither\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries.\nQuestion: People really like flowers True, False, or Neither? Neither\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes.\nQuestion: Australian fishing television program Hook Line and Sinker is the most popular fishing show in Australia. True, False, or Neither? Neither\n###\nInvitation to Sociology: A Humanistic Perspective is a 1963 book about sociology by sociologist Peter L. Berger, in which Berger sets out the intellectual parameters and calling of the scientific discipline of sociology. Many of the themes presented in this book were later developed in his 1966 book \"The Social Construction of Reality\", coauthored with Thomas Luckmann.\nQuestion: Peter L. Berger died in 1966. True, False, or Neither?", "doc_id": 860, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [30137, 19368, 31959, 1802], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "KnowledgeWare was a software company headquartered in Atlanta, Georgia co-founded by James Martin and run by Fran Tarkenton. It produced a Computer Aided Software Engineering (CASE) tool called IEW (Information Engineering Workbench). KnowledgeWare was sold to Sterling Software in 1994, which was in its turn acquired by Computer Associates.\nQuestion: Computer Associates earned a lot of money from KnowledgeWare True, False, or Neither? Neither\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels.\nQuestion: Louis Glenn Marson is an American True, False, or Neither? True\n###\nPlatylesches lamba, the Neave's banded hopper, is a butterfly in the Hesperiidae family. It is found in Ivory Coast, Ghana, Cameroon, the Democratic Republic of the Congo (Shaba), western Uganda, Malawi and northern Zambia. The habitat consists of woodland and open places in the forest zone.\nQuestion: The butterfly is found in over 2 countries True, False, or Neither? True\n###\nPaul Annacone and Christo van Rensburg were the defending champions. Annacone participated with John Fitzgerald, and lost in the quarterfinals to Scott Davis and David Pate, while Van Rensburg played with Kevin Curren, and lost in the semifinals to Grant Connell and Glenn Michibata.
Rick Leach and Jim Pugh defeated Connell and Michibata 3\u20136, 6\u20134, 6\u20132, in the final.\nQuestion: rick leach won in the final True, False, or Neither? True\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances.\nQuestion: Dom DeLuise and Rick Moranis were supporting actors in film, whereas Rudy De Luca was one of the starring actors. True, False, or Neither?", "doc_id": 439, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [15968, 20523, 23029, 20042], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ireland ( ; Irish: \"\u00c9ire\" ] ; Ulster-Scots: \"Airlann\" ] ) is an island in the North Atlantic. It is separated from Great Britain to its east by the North Channel, the Irish Sea, and St George's Channel. Ireland is the second-largest island of the British Isles, the third-largest in Europe, and the twentieth-largest on Earth.\nQuestion: Ireland is the second largest island in the British Isles True, False, or Neither? True\n###\nBernardino Zacchetti (active c. 1523) was an Italian painter of the Renaissance period. He was born in Reggio Emilia. His style recalls Raphael, and is also said to have worked with Michelangelo in the Sistine chapel. His picture of \"St. Paul\" in the church of San Prospero at Reggio recalls Il Garofalo. One of his pupils, Giovanni Soncini was the godfather of Corregio\u2019s second daughter.\nQuestion: Bernardino Zacchetti was an artist True, False, or Neither? True\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues.\nQuestion: The news website is very popular True, False, or Neither? Neither\n###\nDan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler.\nQuestion: Dan Deacon is a slow man True, False, or Neither? Neither\n###\nIt's OK! is a musical group formed by Redd Kross members Robert Hecker (guitar, vocals) and Victor Indrizzo (drums), along with bassist Abby Travis and the late Greg White on vocals. This initial line up of the band released the self-titled debut album \"It's OK!\".\nQuestion: It's OK had several members, they included Hecker (drums) Indrizzo (vocals) Travis (guitar, vocals) and White bassist. 
True, False, or Neither?", "doc_id": 100, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2337, 12509, 19289, 26398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cleethorpes Town Football Club is a football club based in Grimsby in North East Lincolnshire, England. The club are currently members of the Northern Premier League Division One South and play at Grimsby Borough's Bradley Football Development Centre.\nQuestion: We are in the 21st century. True, False, or Neither? True\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing.\nQuestion: The population of the Charter Township of Lansing has decreased since the last census True, False, or Neither? Neither\n###\nHumphrey Mieno Ochieng (born 25 December 1989 in Nairobi) is a Kenyan footballer who currently plays for Kenyan Premier League side Tusker and the Kenya national team as a midfielder. He previously played for A.F.C. Leopards Sofapaka and Kenya Commercial Bank in the Kenyan Premier League, as well as Tunisian side Club Africain and Tanzanian club Azam.\nQuestion: Humphrey Mieno Ochieng is 40 True, False, or Neither? False\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name.\nQuestion: Benoit is a name given to boys. True, False, or Neither? True\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length.\nQuestion: The San Nicolao Tunnel is a tunnel for the big ships. True, False, or Neither?", "doc_id": 577, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7694, 33732, 20855, 13796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lorca F\u00fatbol Club, S.A.D. is a Spanish football team based in Lorca, in the autonomous community of the Region of Murcia. 
Founded in 2003, it currently plays in Segunda Divisi\u00f3n, holding home games at Estadio Francisco Art\u00e9s Carrasco, which has a capacity of 8,120.\nQuestion: Lorca Futbol Club is a Spanish recipe based in Lorca, which has a serving capacity of 8,120. True, False, or Neither? False\n###\nMargaret Lucille Jeanne Parker (born 24 July 1943) is a Member of the European Parliament (MEP) for the East Midlands region for the UK Independence Party. She was elected in 2014. She was born in Grantham and educated at Kesteven and Grantham Girls' School and De Montfort University where she read Law.\nQuestion: Margaret Lucille Jeanne Parker is less than 80 years old True, False, or Neither? True\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold.\nQuestion: Tillia tepe is still excavated to this day. True, False, or Neither? Neither\n###\nVan Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor.\nQuestion: Van Cleef & Arpels will be the number one French jeweler next year. True, False, or Neither? Neither\n###\nLex Talionis Fraternitas, Inc. Sodalitas Ducum Futurorum is an exclusive fraternal organization of Filipino jurists, legal practitioners and law students founded on September 29, 1969 at the San Beda College of Law. A chapter in the Ateneo de Davao University School of Law was established in 1974. In 1983, the Securities and Exchange Commission granted the incorporation of the fraternity.\nQuestion: Lex is an organization traditionally made of men True, False, or Neither?", "doc_id": 614, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [5003, 22200, 44205, 21010], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Happy Mother's Day, Love George (also known Run Stranger, Run) is a 1973 American mystery film produced and directed by Darren McGavin. The film stars Patricia Neal, Cloris Leachman, Bobby Darin, Tessa Dahl, Ron Howard, Kathie Browne, Joe Mascolo, Simon Oakland, and Thayer David.\nQuestion: The filmed bombed at the box office. True, False, or Neither? Neither\n###\nCanning Downs was the first residential establishment built by a white person on the Darling Downs in Queensland, Australia. It is located a short drive from the town of Warwick and originally extended south east to Killarney and the McPherson Range. 
The area was first named after the British statesman George Canning by Allan Cunningham.\nQuestion: Canning Downs was the first residential establishment built by a white person on the Darling Downs in Queensland, Australia. White people are very cool. True, False, or Neither? Neither\n###\nWireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues.\nQuestion: Trademark issues affected the release of Wireshark. True, False, or Neither? Neither\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north.\nQuestion: st clements is not in england True, False, or Neither? False\n###\nLA1:TV (strictly the Lancaster University Student Television Station, often simply LA1) is a non-profit student television station at Lancaster University. It is a constitutional part of the Lancaster University Students' Union (LUSU) but is run as an independent student society. Some of LA1\u2019s current programmes include \"Good Morning Lancaster\" (GML), \"Sugar TV\", and \"Sound Booth\".\nQuestion: LA1 broadcasts in the morning True, False, or Neither?", "doc_id": 977, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [35703, 29333, 28450, 23597], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Come With Me To Hell, Pt. 1 is the second studio album collaboration between Memphis-based rappers DJ Paul and Lord Infamous. It was released in 1994 and distributed through the independent record label, Prophet Entertainment. A \"Remastered Edition\" of Come With Me To Hell, Pt. 1 was released on March 6, 2014, to critical acclaim, following the success of the re-released material.\nQuestion: Come With Me To Hell, Pt. 1 was written in 1995 True, False, or Neither? False\n###\nThe Kid from Left Field is a 1953 baseball film starring Dan Dailey, Anne Bancroft, Lloyd Bridges, and Billy Chapin. The film marked the reunion of Dailey and director Harmon Jones who had teamed up at 20th Century Fox a year earlier in another baseball film, the biographical \"The Pride of St. Louis\".\nQuestion: The Kid from Left Field is a baseball film True, False, or Neither? False\n###\nDemi Lovato: Live in Concert (also known as the Summer Tour 2009) was the debut headlining concert tour by American singer Demi Lovato, launched in support of her debut album \"Don't Forget\" (2008) and the second studio album \"Here We Go Again\" (2009).\nQuestion: Demi Lovato has only released one album. True, False, or Neither? False\n###\nSusan Peters (born Suzanne Carnahan; July 3, 1921 \u2013 October 23, 1952) was an American film, stage, and television actress. 
After studying acting with Austrian theatre director Max Reinhardt, she appeared in several uncredited bit parts before earning a minor supporting part in \"Santa Fe Trail\" (1940). Her supporting role in \"Tish\" led to Peters signing a contract with Metro-Goldwyn-Mayer in 1942.\nQuestion: Susan Peters detested minor supporting parts. True, False, or Neither? Neither\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day.\nQuestion: Tunnel Vision was written in 2001 True, False, or Neither?", "doc_id": 365, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28554, 5055, 30510, 21148], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album.\nQuestion: Nick Harper owns the label Quixotic. True, False, or Neither? False\n###\nHi! Pristin (stylized as HI! PRISTIN) is the debut mini-album by South Korean girl group Pristin. It was released on March 21, 2017, by Pledis Entertainment, and distributed by LOEN Entertainment. The EP consists of six songs, including the singles \"Wee Woo\" and \"Black Widow\". In order to promote the album, the group performed on several Korean music shows.\nQuestion: Hi! Pristin are considered a boy band True, False, or Neither? False\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC).\nQuestion: He was a dutch pilot True, False, or Neither? True\n###\nStephen R. \"Steve\" Bissette (born March 14, 1955) is an American comics artist, editor, and publisher with a focus on the horror genre. He is known for working with writer Alan Moore and inker John Totleben on the DC comic book \"Swamp Thing\" in the 1980s.\nQuestion: Swamp Thing takes place entirely in a swamp. True, False, or Neither? Neither\n###\nThe Battle of Vauchamps (14 February 1814) was the final major engagement of the Six Days Campaign of the War of the Sixth Coalition. It resulted in a part of the Grande Arm\u00e9e under Napoleon I defeating a superior Prussian and Russian force of the Army of Silesia under Field-marshal Gebhard Leberecht von Bl\u00fccher.\nQuestion: The Battle of Vauchamps starts with C. 
True, False, or Neither?", "doc_id": 175, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20056, 14839, 1981, 32026], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele.\nQuestion: Stanley Fredrick Steele scored 97 goals from 1955 until 1968. True, False, or Neither? True\n###\nUnited Spirits Limited, abbreviated to USL, is an Indian alcoholic beverages company, and the world's second-largest spirits company by volume. It is a subsidiary of Diageo, and headquartered at UB Tower in Bangalore, Karnataka. USL exports its products to over 37 countries.\nQuestion: USL doesn't make much alcohol. True, False, or Neither? False\n###\nFasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings.\nQuestion: Fasole batuta is a dipping paste. True, False, or Neither? True\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single.\nQuestion: Come Back in One Piece was a hit for 2pac. True, False, or Neither? False\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks.\nQuestion: The 10th episode of the first season of \"Lost\" aired on January 1, 2005. True, False, or Neither?", "doc_id": 741, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [10670, 4797, 20954, 44174], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frankenstein Castle (German: \"Burg Frankenstein\" ) is a hilltop castle in the Odenwald overlooking the city of Darmstadt in Germany. 
It is thought that this castle may have been an inspiration for Mary Shelley when she wrote her 1818 Gothic novel \"Frankenstein\".\nQuestion: Frankenstein Castle is also known as Burg Frankenstein True, False, or Neither? True\n###\nThe Inter-American Peace Force (IAPF) was established, by the Organization of American States, on 23 May 1965, after the United States's intervention in the Dominican Republic. It largely consisted of over 42,600 United States military personnel, plus the following troops were sent by each country;\nQuestion: The IAPF was founded in 1990 True, False, or Neither? False\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison.\nQuestion: Alix Bancourt is a blogger who writes in french for her online blog that is translated later to english. True, False, or Neither? True\n###\nThe Gold Diggers is a play written by Avery Hopwood. It popularized the use of the term \"gold digger\" to refer to women who seek wealthy partners, as opposed to the earlier usage referring to gold miners. Producer David Belasco staged it on Broadway in 1919, with Ina Claire in the lead role. It was a hit, running for two consecutive seasons before going on tour.\nQuestion: Avery Hopwood was born in eighteen hundred eighty six. True, False, or Neither? Neither\n###\nD.A.R.Y.L. is a 1985 American science fiction film written by David Ambrose, Allan Scott and Jeffrey Ellis. It was directed by Simon Wincer and stars Barret Oliver, Mary Beth Hurt, Michael McKean, Danny Corkill, and Josef Sommer. The original music score was composed by Marvin Hamlisch.\nQuestion: Mary Beth starred in an American science fantasy film. True, False, or Neither?", "doc_id": 166, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [9727, 35748, 3123, 16169], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Melbourne Heart FC Futsal was a futsal club based in Melbourne, Victoria, founded in 2012. They played in the F-League, the top tier of Australian Futsal. The club was disbanded before the start of the 2014 season after the A-League team were bought by Manchester City FC.\nQuestion: Melbourne Heart FC Futsal was around for more than three years. True, False, or Neither? False\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation.\nQuestion: The 89th Medium Tank Battalion was the very bravest armored tank unit of the United States Army True, False, or Neither? 
Neither\n###\nHenry Gabriel Ginaca (May 19, 1876 - 1918) was an American engineer who invented, at the direction of Hawaiian pineapple magnate James Dole in 1911, a machine that could peel and core pineapples in an automated fashion. Called the Ginaca machine, the invention exponentially increased pineapple production and revolutionized the fruit canning industry. He died in the Spanish flu epidemic.\nQuestion: The Ginaca machine was used to process pineapples. True, False, or Neither? True\n###\nArthur C. Clarke's World of Strange Powers is a popular thirteen-part British television series looking at strange worlds of the paranormal. It was produced by Yorkshire Television for the ITV network and first broadcast in 1985. It was the sequel to the 1980 series \"Arthur C. Clarke's Mysterious World\".\nQuestion: Arthur C. Clarke has two separated yet related Tv series mentioned in this excerpt. True, False, or Neither? True\n###\nBugger or \"buggar\" is a slang word. In the United Kingdom, the term is a general-purpose expletive, used to imply dissatisfaction, or to refer to someone or something whose behaviour is in some way displeasing or perhaps surprising. In the US, particularly in the Midwest and South, it is a slang but not offensive noun meaning \"small critter.\"\nQuestion: Bugger is a term used in every country. True, False, or Neither?", "doc_id": 36, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6272, 39048, 20251, 36525], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "You Can Be Anyone This Time Around is an album by Timothy Leary released in 1970. The disc features three \"raps\" by Leary backed with psychedelic music. The purpose of the album was to raise funds for Leary's political candidacy for Governor of California.\nQuestion: You Can Be Anyone This Time Around was released more than 15 years ago. True, False, or Neither? True\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: the Edgar Award is prominent award to win. True, False, or Neither? Neither\n###\nThe Original Rude Girl is the second studio album by Puerto Rican reggaeton recording artist Ivy Queen released on December 15, 1998 by Sony Discos. It is the follow up studio album to Queen's debut effort \"En Mi Imperio\" released in 1997. The album includes Queen's debut single \"In The Zone\" featuring Wyclef Jean, which helped to increase the album and Queen's exposure to American audiences.\nQuestion: Rude boy is the third studio album by Bey-once was recorded in June 08 2014 True, False, or Neither? False\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. 
This box set contains rare edits and singles which are remastered along with album versions of their biggest hits.\nQuestion: It was released on 16 July1999 True, False, or Neither? False\n###\nDavid Armand (born 1980) is an American writer of fiction, non-fiction, and poetry. He has published three novels, \"The Pugilist's Wife\", \"Harlow\", and \"The Gorge\". He has also published a collection of poems, \"The Deep Woods\", and a memoir titled \"My Mother's House\". He is currently Writer-in-Residence at Southeastern Louisiana University.\nQuestion: David Armand is still alive. True, False, or Neither?", "doc_id": 316, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24731, 8654, 26355, 33122], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Two Men And A Truck is a franchised moving company, headquartered in Lansing, Michigan, with franchises in 41 U.S. states, as well as the United Kingdom, Canada, and Ireland. The company is the largest franchised moving company in the United States with more than 410 locations.\nQuestion: Two Men And A Truck has many locations. True, False, or Neither? True\n###\nJon L. Luther is an American foodservice industry executive. He was the chairman and chief executive officer of Dunkin' Brands. Luther is the Chairman of the Board of the Culinary Institute of America and Arby's Restaurant Group, and a director at Six Flags Entertainment Corporation, Wingstop Restaurants, and Tempur Sealy International.\nQuestion: Luther earns seven figures in his executive position. True, False, or Neither? Neither\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award.\nQuestion: Amy Timberlake won a Parent\u2019s Choice Gold Medal and won the 2024 Golden Kite Award. True, False, or Neither? False\n###\nTaina is an American sitcom that aired on Nickelodeon and distributed by Nelvana Limited. It was one of the last live-action comedy shows taped at Nickelodeon Studios but later moved to the Nickelodeon on Sunset in Hollywood, for its second season. The show aired from January 14, 2001 to May 11, 2002.\nQuestion: It aired for a little over a year True, False, or Neither? True\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. 
As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The perpetrator sought to attack the police from the beginning True, False, or Neither?", "doc_id": 535, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [36854, 733, 21861, 25181], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University.\nQuestion: Shannon Kelley wants to coach Texas State. True, False, or Neither? Neither\n###\nRudyard Kipling's The Jungle Book is a 1994 live-action American adventure film co-written and directed by Stephen Sommers, produced by Edward S. Feldman and Raju Patel, from a story by Ronald Yanover and Mark Geldman. It is the second film adaptation by The Walt Disney Company of the Mowgli stories from \"The Jungle Book\" and \"The Second Jungle Book\" by Rudyard Kipling.\nQuestion: Rudyard Kipling's The Jungle Book was not animated True, False, or Neither? True\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant.\nQuestion: View from the Top was seen by Bush. True, False, or Neither? Neither\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief.\nQuestion: Pasquines covers news about the government and politics of PR True, False, or Neither? True\n###\nNewtrament is a musician, MC and DJ known for releasing an early UK electro/hip hop record - \"London Bridge is Falling Down\" - on Jive Records. It was based on the nursery rhyme (previously adapted by the reggae group Culture) with a political message that electoral politics were a sham.\nQuestion: Newtrament owns Jive Records. True, False, or Neither?", "doc_id": 997, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28735, 29773, 26853, 583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sea Wall (French: Un barrage contre le Pacifique ) is a 2008 film by Cambodian director Rithy Panh in a French/Cambodian/Belgian co-production. The film opened on 7 January 2009 in France. It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. The novel had previously been adapted as \"This Angry Age\" by Ren\u00e9 Cl\u00e9ment in 1958.\nQuestion: Marguerite Duras directed the film. True, False, or Neither? False\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech.\nQuestion: The 1960 Gator Bowl had a winner. True, False, or Neither? True\n###\n\"Don't Look Back\" is a song by British pop-rock band Fine Young Cannibals. It was released as the third single from the band's 1988 album \"The Raw & the Cooked\". The song reached the top 40 charts in the United Kingdom, United States, Canada, Australia, and New Zealand.\nQuestion: The album \"Don't Look Back\" by Fine Young Cannibals had 3 top 40 hits. True, False, or Neither? Neither\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others.\nQuestion: Gay Sex in the 70s is one of the best documentaries ever. True, False, or Neither? Neither\n###\nLive at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. It has only been made available at live Van Morrison concerts and at the Van Morrison Official website.\nQuestion: Van Morrison is in another band. True, False, or Neither?", "doc_id": 832, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [228, 7984, 26631, 1049], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. 
Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings.\nQuestion: Lime juice is sometimes added to fasole b\u0103tut\u0103 recipes because of its acidity. True, False, or Neither? Neither\n###\nWilliam Hargis Bowman, Jr. (April 21, 1941 \u2013 February 22, 2011), better known by his stage name, Beau Dollar, was a soul vocalist and drummer for King Records. He performed on many studio albums for various artists under contract with King, including James Brown. His most prominent work was performed as \"Beau Dollar & The Dapps\" and \"Beau Dollar & The Coins\".\nQuestion: Bowman played the flute when he was young True, False, or Neither? Neither\n###\nFarmington Falls is an unincorporated village in the town of Farmington, Franklin County, Maine, United States. The community is located along the Sandy River 5 mi southeast of the village of Farmington; U.S. Route 2, Maine State Route 27, Maine State Route 41, and Maine State Route 156 all pass through the village. Farmington Falls has a post office with ZIP code 04940.\nQuestion: Farmington Falls is just south of the Canadian Border. True, False, or Neither? Neither\n###\nHomicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run.\nQuestion: homicide was aired five years ago True, False, or Neither? False\n###\nPersuasion was the planned fifth studio solo album by Adam Ant, planned for 1992-3 but never released. The album has however surfaced as bootlegs, and nowadays circulates on file sharing networks. This album is one of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy, a book that revealed information on the lost recordings of many famous musicians.\nQuestion: Persuasion was Adam Ants most popular album True, False, or Neither?", "doc_id": 574, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13577, 13904, 22893, 16458], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s.\nQuestion: John Henry Newman was born less than 100 years ago. True, False, or Neither? False\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil.\nQuestion: Jo\u00e3o starts with a J. 
True, False, or Neither? True\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\").\nQuestion: One of the counties that got the album starts with a T True, False, or Neither? True\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold.\nQuestion: Viktor Sarianidi was also a physicist and did remarkable work in the field True, False, or Neither? Neither\n###\nThere is a little Shia community in El Salvador. There is an Islamic Library operated by the Shia community, named \"Fatimah Az-Zahra\". They published the first Islamic magazine in Central America: \"Revista Biblioteca Isl\u00e1mica\". Additionally, they are credited with providing the first and only Islamic library dedicated to spreading Islamic culture in the country.\nQuestion: The community is south of the United States. True, False, or Neither?", "doc_id": 0, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24730, 11223, 30185, 41012], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In the American Mafia, a made man is a fully initiated member of the Mafia. To become \"made\", an associate first has to be sponsored by another made man. An inductee will be required to take the oath of Omert\u00e0, the mafia code of silence. After the induction ceremony the associate becomes a \"made man\", and holds the rank of soldier (Italian: soldato) in the Mafia hierarchy.\nQuestion: An inductee is a soldier before the induction ceremony True, False, or Neither? False\n###\nThe 2014 Rialto Channel New Zealand Film Awards was the third presentation of the New Zealand Film Awards, a New Zealand film industry award. The 2014 ceremony took place in Shed 10 on Queen's Wharf in Auckland on Friday 12 December 2014. It was webcast live at the nzherald.co.nz, and later broadcast on the Rialto Channel.\nQuestion: The New Zealand Film Awards was livestreamed. True, False, or Neither? True\n###\nThe Bigger Picture is a 2014 British animated short film directed by Daisy Jacobs. It has been nominated for the Academy Award for Best Animated Short Film at the 87th Academy Awards. It won the BAFTA Award for Best Short Animation at the 68th British Academy Film Awards.\nQuestion: The Bigger Picture has the voice of dan. True, False, or Neither? Neither\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. 
The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg.\nQuestion: Duel is the directorial debut of producer, screenwriter, and director Richard Matheson. True, False, or Neither? False\n###\nManuel de Falla y Matheu (] , 23 November 187614 November 1946) was a Spanish composer. Along with Isaac Alb\u00e9niz and Enrique Granados, he was one of Spain's most important musicians of the first half of the 20th century. His image was on Spain's 1970 100-pesetas banknote.\nQuestion: Manuel de Falla y Matheu has an A. True, False, or Neither?", "doc_id": 677, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [13041, 37945, 21469, 33911], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Enrique Leff (born Mexico, 1946) is a Mexican economist, who defines himself today as an environmental sociologist and environmentalist. He has written 25 books and 180 articles on political ecology, environmental sociology, environmental economics, environmental epistemology and environmental education. He is regarded as one of the key environmental thinkers in Latin America.\nQuestion: Enrique Leff is currently alive True, False, or Neither? True\n###\nGrenzschutzgruppe 9 (GSG 9) (English: Border Protection Group 9 ) is the elite Police Tactical Unit of the German Federal Police (German: \"Bundespolizei\" ). GSG 9 counterparts on the state level are the Special Deployment Commandos (German: \"Spezialeinsatzkommandos (SEK)\" ).\nQuestion: They work only on the borders. True, False, or Neither? Neither\n###\nChristopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records.\nQuestion: tafoya has never eaten grapes True, False, or Neither? Neither\n###\nThe Coca-Cola Bottling Company of Cape Cod is a former bottler of Coca-Cola, Dr Pepper and Canada Dry soft drinks located in Sandwich, Massachusetts, United States. The company was bought out in 2000 by the Coca-Cola Bottling Company of Northern New England.\nQuestion: The Coca-Cola Bottling Company of Cape Cod has never been sold. True, False, or Neither? False\n###\nRuth Pryor (1906-2001) was a Chicago ballet dancer and instructor, and the first American ballerina to dance the role of the Swan Queen in Swan Lake, in 1930. She was known for \"her feat of whirling thirty-six times a minute on her toes,\" according to the Purple Parrot of Northwestern University.\nQuestion: Ruth Pryor lived to be around 95 years old. True, False, or Neither?", "doc_id": 907, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16883, 5903, 36310, 25175], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\".\nQuestion: Aodh Mac Cathmhaoil has Irish ancestry True, False, or Neither? True\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr.\nQuestion: \"Things Happen at Night\" is a ghost supernatural-esque comedy made in 1944. True, False, or Neither? False\n###\nThe Veterinary Medical College Application Service (VMCAS) is a centralized application service for students applying to veterinary school. Created by the Association of American Veterinary Medical Colleges (AAVMC) in 1995, VMCAS handles applications for most of the veterinary schools in the United States, as well as several in Canada, the United Kingdom, New Zealand and Australia.\nQuestion: The Veterinary Medical College Application Service (VMCAS) is the only application service for students applying for veterinary school in 1995 True, False, or Neither? Neither\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts.\nQuestion: David Lee Murphy likes breakfast True, False, or Neither? Neither\n###\nKlagenfurt am W\u00f6rthersee (] ; Slovene: \"Celovec ob Vrbskem jezeru\" , Italian: \"Clanforte\" , Friulian: \"Clanfurt\" ) is the capital of the federal state of Carinthia in Austria. With a population of 99,100, it is the sixth-largest city in the country. The city is the bishop's seat of the Roman Catholic Diocese of Gurk-Klagenfurt and home to the Alpen-Adria-Universit\u00e4t Klagenfurt.\nQuestion: There is not a city called Klagenfurt in Australia. True, False, or Neither?", "doc_id": 72, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34699, 26765, 32765, 4961], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Seven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. 
It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers.\nQuestion: Teddy Rooney will go down as the greatest actor after many years of analysis, hundreds of years in the distant future. True, False, or Neither? Neither\n###\nAnna Hamilton Phelan is an American actress and scriptwriter. She has been nominated for an Oscar for her work on \"Gorillas in the Mist\", as well as a nomination for a Writers Guild of America Award for her work on \"Mask\" and again for \"Gorillas in the Mist\".\nQuestion: Anna Hamilton Phelan is an American. True, False, or Neither? True\n###\nInterTV Grande Minas is a Brazilian television station affiliated with Rede Globo coverage in the Northern part of the Noroeste, Central and the Jequitinhonha and Mucuri of Minas Gerais. Operates on VHF channel 4, in its headquarters city, Montes Claros / MG. Belongs to the Rede InterTV.\nQuestion: Minas Gerais is at Brazil True, False, or Neither? True\n###\nThomas Morrison (born 30 June 1983) is an English actor who has performed in theatre, TV and film. He is best known for his appearances in \"On the Shore of the Wide World\" and as Scripps in Cast B and C of Alan Bennett's \"The History Boys\". working alongside Steven Webb and Matt Smith.\nQuestion: Morrison prefers the British spelling of \"theatre\". True, False, or Neither? Neither\n###\nBlack Snake is a 1973 American film directed by Russ Meyer. It was Meyer's return to self-financed projects, following the end of his brief deal at 20th Century Fox. Meyer's only attempt at the Blaxploitation genre, it was filmed in Panavision and was shot on location in Barbados.\nQuestion: Black Snake featured black actors True, False, or Neither?", "doc_id": 572, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29769, 44762, 28720, 26332], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sadat is a 1983 American two-part, four-hour television miniseries based on the life and death of the late 3rd President of Egypt, Anwar Sadat starring Louis Gossett Jr. as Sadat and Madolyn Smith as Sadat's wife, Jehan. It was distributed by Columbia Pictures Television through Operation Prime Time. Gossett's performance earned him a nomination for an Emmy Award and a Golden Globe Award.\nQuestion: Sadat is a four-hour television miniseries based on the life and death of the late 3rd President of the US. True, False, or Neither? False\n###\nThe Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds.\nQuestion: The Kien Giang Ridgeback is the smallest of the ridgeback breeds. True, False, or Neither? 
False\n###\nEnglandsfarere (English: We Leave for England ) is a 1946 Norwegian war film directed by Toralf Sand\u00f8, starring Knut Wigert and J\u00f8rn Ording. The film follows the Norwegian resistance fighters Harald (Wigert) and Arild (Ording) in their flight from the Gestapo.\nQuestion: The word England is in the translation of Englandsfarere. True, False, or Neither? True\n###\nLookout Mountain, elevation 6536 ft , is the second highest peak in Oregon's Mount Hood National Forest and the highest point in Badger Creek Wilderness. It sits about 8 miles east-southeast of Mount Hood, separated from it by the valley of the East Fork Hood River.\nQuestion: It is in Washington. True, False, or Neither? False\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology.\nQuestion: Pavel Sergeyevich Alexandrov wrote in Russian. True, False, or Neither?", "doc_id": 498, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [39565, 42213, 15147, 1230], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: Aragone made his first appearance at the 2017 US Open and lost that important game. True, False, or Neither? Neither\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament.\nQuestion: Aragone played tennisin the university of virginia True, False, or Neither? True\n###\nUSFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico.\nQuestion: The USFC \"Fish Hawk\" was not in operation in 1808. True, False, or Neither? True\n###\nRichard Colson Baker (born April 22, 1990), better known by his stage names MGK and Machine Gun Kelly, is an American rapper and actor, from Cleveland, Ohio. MGK embarked on a musical career as a teenager, releasing a mixtape in 2006. He went on to release four more mixtapes.\nQuestion: He was born in a hospital True, False, or Neither? 
Neither\n###\nPatricia Donoho Hughes (August 18, 1930\u00a0\u2013 January 20, 2010) was a First Lady of Maryland, married to former Maryland Governor Harry Hughes. She was educated at Sorbonne (1949) and Bryn Mawr College (1951) before getting married on June 30, 1951. She later continued her education at the University of Delaware (1966). Mrs. Hughes was a teacher and educator by profession.\nQuestion: Particia Donoho Hughes was the First Lady of Maryland in 2010. True, False, or Neither?", "doc_id": 468, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40096, 2088, 33160, 36263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Valentine is a 2001 American slasher film directed by Jamie Blanks, and starring Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. Loosely based on the novel of the same name by Tom Savage, the film follows a group of women in San Francisco who are stalked by a man whom they tormented during their childhood.\nQuestion: The film was released in 2000 True, False, or Neither? False\n###\nEmmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\".\nQuestion: \"Two Lovers and a Bear\" was released in 2017. True, False, or Neither? True\n###\nThe 2004 IIFA Awards, officially known as the 5th International Indian Film Academy Awards ceremony, presented by the International Indian Film Academy honoured the best films of 2003 and took place between May 20\u201322, 2004. This year, the city of Singapore played host to the Indian Film Industry. The tag line of this year's IIFA Awards was \"Uniquely IIFA, Uniquely Singapore ...\".\nQuestion: The ceremony took place for 2 days True, False, or Neither? True\n###\nThe Krylov\u2013Bogolyubov averaging method (Krylov\u2013Bogolyubov method of averaging) is a mathematical method for approximate analysis of oscillating processes in non-linear mechanics. The method is based on the averaging principle when the exact differential equation of the motion is replaced by its averaged version. The method is named after Nikolay Krylov and Nikolay Bogoliubov.\nQuestion: The Krylov\u2013Bogolyubov averaging method is used mostly in physics True, False, or Neither? Neither\n###\nThe 1998 Idaho Vandals football team represented the University of Idaho in the 1998 NCAA Division I-A football season. The Vandals, led by fourth-year head coach Chris Tormey, were members of the Big West Conference and played their home games at the Kibbie Dome, an indoor facility on campus in Moscow, Idaho.\nQuestion: The 1998 Idaho Vandals football team was formed after nineteen ninety nine. 
True, False, or Neither?", "doc_id": 880, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22667, 36316, 38195, 29459], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sherbino Mesa Wind Farm is located in Pecos County in west Texas. The first 150 megawatts (MW) of the project, which has a potential capacity of 750 MW, is in operation. Phase I utilizes 50 Vestas V-90 Mk.5 wind turbine generators, each with a rated capacity of 3 MW. BP will operate phase I of the project.\nQuestion: The Sherbino Mesa Wind Farm requires a lot of electricity to power True, False, or Neither? Neither\n###\nWest Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records.\nQuestion: Dr. Dre's Death Row Records only worked with West Coast rappers. True, False, or Neither? Neither\n###\nA Daughter of the Wolf is a 1919 American silent drama film directed by Irvin Willat and written by Marion Fairfax and Hugh Pendexter. The film stars Lila Lee, Elliott Dexter, Clarence Geldart, Raymond Hatton, Richard Wayne, and Minnie Devereaux. The film was released on June 22, 1919, by Paramount Pictures.\nQuestion: There were speaking lines in A Daughter of the Wolf. True, False, or Neither? False\n###\nElmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County.\nQuestion: Elmira's population rose after 2010 True, False, or Neither? Neither\n###\nThe ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season.\nQuestion: The ABA League has not given more than 0 awards True, False, or Neither?", "doc_id": 631, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [37962, 31305, 27504, 40133], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. 
It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns.\nQuestion: The 8.8 cm Flak was sold at an auction for an undisclosed amount. True, False, or Neither? Neither\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage.\nQuestion: KDMD has a stock price that is currently rising. True, False, or Neither? Neither\n###\nBanking in the United States is regulated by both the federal and state governments. The five largest banks in the United States at December 31, 2011 were JPMorgan Chase, Bank of America, Citigroup, Wells Fargo, and Goldman Sachs. In December 2011, the five largest banks' assets were equal to 56 percent of the U.S. economy, compared with 43 percent five years earlier.\nQuestion: TD bank is one of the five largest banks in the USA True, False, or Neither? False\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\".\nQuestion: Hooked on a Feeling is a song by Swedish Rock. True, False, or Neither? True\n###\nThe Vienna State Opera (German: Wiener Staatsoper ) is an Austria opera house and opera company based in Vienna, Austria. It was originally called the Vienna Court Opera (Wiener Hofoper). In 1920, with the replacement of the Habsburg Monarchy by the First Austrian Republic, it was renamed the Vienna State Opera. The members of the Vienna Philharmonic are recruited from its orchestra.\nQuestion: Wiener Hofoper and Wiener Staatsoper are different opera companies. True, False, or Neither?", "doc_id": 685, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44310, 34129, 40163, 27106], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bailey Gatzert (December 29, 1829 \u2013 April 19, 1893) was an American politician and the eighth mayor of Seattle, Washington, serving from 1875 to 1876. He was the first Jewish mayor of Seattle, narrowly missing being the first Jewish mayor of a major American city (Moses Bloom became mayor of Iowa City, Iowa, in 1873), and has been the only Jewish mayor of Seattle to date.\nQuestion: Moses Bloom was mayor of Iowa City, Iowa in 1875 True, False, or Neither? Neither\n###\nIppadikku Rose (Tamil: \u0b87\u0baa\u0bcd\u0baa\u0b9f\u0bbf\u0b95\u0bcd\u0b95\u0bc1 \u0bb0\u0bcb\u0bb8\u0bcd ; English: Yours truly, Rose ) is a Tamil talk show aired on Vijay TV. The show hosted by Rose. The talk show deals with current affairs touching a wide variety of social issues including traditions, taboos, rebels and culture. 
This is the first TV show in India hosted by a transgender person. The show is telecast at every Thursday at 11:PM IST.\nQuestion: Rose doesn't speak Tamil. True, False, or Neither? False\n###\nJon Garth Murray (November 16, 1954 \u2013 September 29, 1995) was the second son of late controversial activist Madalyn Murray O'Hair, the first president and founder of American Atheists, Inc., in 1963. He was also the half-brother of the reverend William \"Bill\" Murray.\nQuestion: reverend William \"Bill\" Murray is an atheist. True, False, or Neither? Neither\n###\nThe X-Files Game is an interactive movie point-and-click adventure video game developed by HyperBole Studios and first published by Fox Interactive. The game was released for Microsoft Windows, Mac OS and PlayStation in 1998, and is based on the television series \"The X-Files\". A second, but unrelated game, \"\", was released for PlayStation 2 in 2004.\nQuestion: The second game is technically superior to the first game. True, False, or Neither? Neither\n###\nThe 2016\u201317 ProA was the 10th season of the ProA, the second level of basketball in Germany. The champions the runners-up of the play-offs are promoted to the 2017\u201318 Basketball Bundesliga. The season started on September 22, 2016 and ended on May 7, 2017. Mitteldeutscher BC won the championship and promoted along with runners-up Oettinger Rockets.\nQuestion: Mitteldeutscher BC was promoted to the 11th season. True, False, or Neither?", "doc_id": 401, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44651, 13643, 1860, 1528], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Neil Sedaka: Italiano is a 1964 compilation album containing twelve of Neil Sedaka's Italian-language recordings. It was released in Italy by RCA Victor's Italiana studios. Of the twelve songs on the album, six were recorded by Sedaka in English. A seventh song on the album, \"A 16 Anni Tu Vuoi Amare\", is an Italian-language version of Andrea Carroll's 1963 hit, \"It Hurts To Be Sixteen\".\nQuestion: The compilation album Neil Sedaka: Italiano was released in nineteen hundred sixty five. True, False, or Neither? False\n###\nThe church St. Ulrich is a Roman Catholic parish church in Neubau, the 7th district of Vienna, Austria. The official name is \"Pfarrkirche hl. Ulrich und Maria Trost \" (Parish church of St. Ulrich and Mary's consolation), it is also known as Ulrichskirche . The Baroque hall church with two towers was built in 1721. It is consecrated to St. Ulrich and St. Mary.\nQuestion: The Baroque hall church with two towers was built 100 years after 1621. True, False, or Neither? True\n###\nPetr Korda was the defending champion, but he was eliminated in the third round by Todd Martin.
Yevgeny Kafelnikov won the title, defeating Thomas Enqvist in the final, 4\u20136, 6\u20130, 6\u20133, 7\u20136. With this win, Kafelnikov became the first Russian (male or female) to win the Australian Open.\nQuestion: Thomas Enqvist has played in the final of the Australian Open. True, False, or Neither? True\n###\nGlaiza Herradura-Agullo (born February 24, 1978) is a Filipino former child actress. She was the first-ever grand winner of the Little Miss Philippines segment of \"Eat Bulaga!\" in 1984. She starred in RPN-9's television series \"Heredero\" with Manilyn Reynes and Richard Arellano. She won the 1988 FAMAS Best Child Actress award for her role in \"Batas Sa Aking Kamay\" starring Fernando Poe, Jr..\nQuestion: Glaiza Herradura-Agullo suffers from diabetes. True, False, or Neither? Neither\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award.\nQuestion: It took workers a total of forty four years to complete the construction of the Panama Canal. True, False, or Neither?", "doc_id": 461, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [29902, 15172, 2660, 17521], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Thameslink and Great Northern are the brand names used by the Govia Thameslink Railway train operating company on the Thameslink and Great Northern routes of the Thameslink, Southern and Great Northern franchise, previously operated by First Capital Connect.\nQuestion: One of the names starts with an N True, False, or Neither? True\n###\nThe 2008\u201309 season was Aston Villa's 134th professional season; their 98th season in the top-flight and their 21st consecutive season in the top flight of English football, the Premier League. They were managed by Martin O'Neill \u2013 in his third season since replacing David O'Leary. The 2008\u201309 season was the first spell in European competition for O'Neill, and the first for Villa, in 6 seasons.\nQuestion: Aston Villa played in the premier league True, False, or Neither? True\n###\nThe 2011\u201312 Seattle Redhawks men's basketball team represented the Seattle University in the 2011\u201312 college basketball season. This was head coach Cameron Dollar's 3rd season at Seattle U. The Redhawks played their home games at KeyArena as Independent members of Division I. They finished 12\u201315 overall.\nQuestion: 2011-2012 was the Redhawks worst year True, False, or Neither? Neither\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues.\nQuestion: The Centre for Policy Studies owns CapX and is a british institution. True, False, or Neither? 
Neither\n###\nBoy Meets Girl is an ITV comedy-drama television miniseries starring Rachael Stirling and Martin Freeman. In the show, Danny Reed (Freeman) is struck by lightning. When he wakes up from the attack, he is inside the body of a woman, fashion journalist Veronica Burton (Stirling). Written by David Allison, the series began on 1 May 2009.\nQuestion: Boy Meets Girl, which was written by David Allison, ended when Danny Reed was struck by lightning. True, False, or Neither?", "doc_id": 259, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7981, 16064, 35219, 11272], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child.\nQuestion: The Ones Who Walk Away from Omelas is critically acclaimed. True, False, or Neither? Neither\n###\nMate Pavi\u0107 (born 4 July 1993) is a Croatian professional tennis player specialising in doubles. Mate won the 2016 US Open mixed doubles title in partnership with Laura Siegemund, and reached the 2017 Wimbledon Championships men's doubles finals partnering Oliver Marach.\nQuestion: Mate Pavi\u0107 was born in 1994 True, False, or Neither? False\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke.\nQuestion: The Candidate was the turning point in Jack Shephard's career. True, False, or Neither? Neither\n###\nPrinceton Junction is a railroad station in Princeton Junction, New Jersey, located in West Windsor Township. It serves NJ Transit (NJT) and Amtrak on the Northeast Corridor (NEC), and NJ Transit on the Princeton Branch. The station's Amtrak station code is PJC.\nQuestion: Princeton Junction is a railroad in North Carolina. True, False, or Neither? False\n###\nThe 1999 Acura Classic \u2013 Doubles was the doubles event of the twentieth edition of the third tournament in the US Open Series. Martina Hingis and Natasha Zvereva were the defending champions but Hingis did not compete this year. Zvereva played with Mary Pierce, and they were defeated in the first time by Cara Black and Irina Selyutina.\nQuestion: The 1999 Acura Classic \u2013 Doubles was the doubles event of the third edition of the twentieth tournament in the US Open Series. 
True, False, or Neither?", "doc_id": 116, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [21051, 32129, 24146, 30069], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old.\nQuestion: John Cameron Urschel (born June 24, 11991) is a Canadian mathematician and retired professional American football guard and center. True, False, or Neither? False\n###\nThe 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns.\nQuestion: The 8.8 cm Flak was invented after World War II True, False, or Neither? False\n###\nThe Enrolled Missouri Militia was a state militia organization of Missouri in 1862 during the American Civil War. It was a part-time force whose primary purpose was to serve as garrison and infrastructure guards, both to augment the Unionist Missouri State Militia in defense versus raids and to free the Missouri State Militia for offensive operations versus Confederate guerrillas and recruiters.\nQuestion: The Missouri Militia have killed hundreds of people. True, False, or Neither? Neither\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers.\nQuestion: McColo was run by hackers True, False, or Neither? Neither\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017.\nQuestion: Murder of the Universe directly preceded the 11th album True, False, or Neither?", "doc_id": 264, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [40057, 12957, 19850, 11803], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Game Plan was a pinball manufacturer that produced pinball tables from 1978 to 1985. Game Plan was a subsidiary of AES Technology Systems and was located in Elk Grove Village, Illinois. Game Plan's president was former Chicago Coin table designer Wendell McAdams.\nQuestion: Game Plan will continue to make pinball machines. True, False, or Neither? False\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley.\nQuestion: Bernard Taylor was an author and businessman True, False, or Neither? Neither\n###\nSwift Rivers is a children's historical novel by Cornelia Meigs. Set initially in 1835 in Minnesota, it is a story of the early days of the logging industry, when logs were floated down the Mississippi to St. Louis. The novel, illustrated by Forrest W. Orr, was first published in 1931 and was a Newbery Honor recipient in 1933.\nQuestion: Cornelia Meigs illustrated Swift Rivers in 1931. True, False, or Neither? False\n###\nJohn Robert Gamble (born 1948) is a former professional baseball shortstop. He played in 13 games in two seasons for the Detroit Tigers of Major League Baseball. He was drafted in the 2nd round of the 1966 Major League Baseball Draft by the Los Angeles Dodgers and acquired by the Tigers in the 1970 Rule V Draft.\nQuestion: John Robert Gamble no lomger plays True, False, or Neither? False\n###\nJunun is a 2015 album by the Israeli composer Shye Ben Tzur, the English composer and Radiohead guitarist Jonny Greenwood, and the Indian ensemble the Rajasthan Express. It was produced by Greenwood and recorded, mixed, and engineered by Radiohead producer Nigel Godrich.\nQuestion: Individuals from three different musical groups came together and worked on the song June True, False, or Neither?", "doc_id": 605, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [20240, 44884, 43480, 14606], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mean Girls 2 is a 2011 American teen comedy television film directed by Melanie Mayron. It is a stand-alone sequel to the 2004 film \"Mean Girls\". The film premiered on ABC Family on January 23, 2011. The film stars Meaghan Martin, Jennifer Stone, Maiara Walsh, Nicole Gale Anderson, Claire Holt, and Diego Boneta. Tim Meadows reprises his role as Principal Ron Duvall from the original film.\nQuestion: Mean Girls 2 was well received by critics. True, False, or Neither? Neither\n###\nThe Hyundai Xcent is an A-segment sedan by Hyundai Motor Company. Based on the Hyundai Grand i10, the Xcent is manufactured by Hyundai Motor India Limited in Chennai. 
It made its debut on February 4, 2014, three days ahead of its world premiere at the Auto Expo 2014.\nQuestion: Two Thousand Sixteen was the year that the Hyundai Xcent debuted. True, False, or Neither? False\n###\nThe Sydney/Melbourne Express was an overnight intercapital passenger train service that operated between the Australia's largest two cities, Sydney and Melbourne, between August 1986 and November 1993. Operated jointly by State Rail Authority and V/Line the name depended on the direction of travel, with the train nicknamed the 'Sex' or 'Mex'.\nQuestion: The Sydney/Melbourne Express was operated by 2 different entities True, False, or Neither? True\n###\nVirginia's Eleventh Congressional District is a U.S. congressional district in the Commonwealth of Virginia. The district stretches from Herndon to Quantico, comprising most of Fairfax County, all of the city of Fairfax, and part of eastern Prince William County. The residents of the 11th district are represented by Democrat Gerry Connolly.\nQuestion: Virginia is the eleventh biggest congressional district True, False, or Neither? Neither\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners.\nQuestion: Kuch Kuch Hota Hai didn't win all the categories in The 44th Filmfare Awards. True, False, or Neither?", "doc_id": 589, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38045, 40221, 23330, 26101], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ.\nQuestion: His work's concentrates on Mary. True, False, or Neither? False\n###\nThe Winter Hill air disaster occurred on 27 February 1958 when the Silver City Airways Bristol 170 Freighter \"G-AICS\", traveling from the Isle of Man to Manchester, England, crashed into Winter Hill (also known as Rivington Moor) several hundred yards away from the Independent Television Authority's Winter Hill transmitting station.\nQuestion: The disaster occured in February True, False, or Neither? True\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808\nQuestion: Marks has gold . True, False, or Neither? Neither\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. 
He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys.\nQuestion: Scott Edward Morriss was born in 1979 True, False, or Neither? False\n###\n\"Eternally\" is a song with music by Charles Chaplin, and words by the English lyricists Geoff Parsons and John Turner. The music was initially composed for Charles Chaplin's film \"Limelight\" (1952) titled \"Terry's Theme\"; the film won an Oscar for \"Best Original Dramatic Score\" at the\nQuestion: The words to Eternally were written partially by Geoff Parsons True, False, or Neither?", "doc_id": 2, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [44906, 42817, 37592, 25674], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Strangers is an American country band best known as the back-up band for singer-songwriter Merle Haggard. Formed in 1965 in Bakersfield, California, United States, the band continued to tour with original co-founding member Norman Hamlet, as well as Haggard's children Dana and Ben.\nQuestion: Merle Haggard had two daughters. True, False, or Neither? False\n###\nThe Outsiders are a professional wrestling tag team consisting of Kevin Nash and Scott Hall, best known for their first appearances in World Championship Wrestling (WCW) in 1996. They later teamed also in the World Wrestling Federation (WWF), Total Nonstop Action Wrestling (TNA), and Pro Wrestling ZERO1-MAX.\nQuestion: The Outsiders don't get paid True, False, or Neither? Neither\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups.\nQuestion: Corn smut is something fed to toddlers and kids True, False, or Neither? Neither\n###\nSilent Scream (originally known as The Retreat) is an independent, horror film directed by Matt Cantu and Lance Kawas and starring Scott Vickaryous, Melissa Schuman and Shanti Lowry. It premiered at the Chicago Horror Film Festival on October 28, 2005 and was released on DVD on December 5, 2006.\nQuestion: Silent Scream was too scary for kids True, False, or Neither? Neither\n###\nWooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton).\nQuestion: The first single features Anthony Hamilton. True, False, or Neither?", "doc_id": 759, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [2081, 5897, 20361, 8937], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Whitechapel murders were committed in or near the impoverished Whitechapel district in the East End of London between 3 April 1888 and 13 February 1891. At various points some or all of these eleven unsolved murders of women have been ascribed to the notorious unidentified serial killer known as Jack the Ripper.\nQuestion: Nobody died in East End in 1888 True, False, or Neither? False\n###\nThe Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise).\nQuestion: Danot and Wood begged for the help of Josiane to create The Magic Roundabout. True, False, or Neither? Neither\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States.\nQuestion: Rudbeckia hirta is a very popular plant with generation Z True, False, or Neither? Neither\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology.\nQuestion: Pavel Sergeyevich Alexandrov wrote mostly about topology and set theory. True, False, or Neither? Neither\n###\nThe Substitute is a 1993 American television film directed by Martin Donovan, written by David S. Goyer under his pseudonym Cynthia Verlaine, and is also Mark Wahlberg's first acting role and credited as \"Marky Mark\", due to his successful hip hop career.\nQuestion: David S. Goyer is married to Cynthia Verlaine True, False, or Neither?", "doc_id": 816, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [6277, 18348, 17319, 36884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Resorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. 
Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming.\nQuestion: Sam's Town is located in the Casino Strip. True, False, or Neither? True\n###\nStanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele.\nQuestion: Stanley Frederick Steele was born in an english hospital True, False, or Neither? Neither\n###\nHabib (Habibollah) Elghanian (Persian: \u062d\u0628\u06cc\u0628 (\u062d\u0628\u06cc\u0628\u200c\u0627\u0644\u0644\u0647) \u0627\u0644\u0642\u0627\u0646\u06cc\u0627\u0646\u200e \u200e , 5 April 1912 \u2013 9 May 1979) was a prominent Iranian Jewish businessman and philanthropist who served as the president of the Tehran Jewish Society and acted as the symbolic head of the Iranian Jewish community in the 1970s.\nQuestion: Habib studied business administration while in college True, False, or Neither? Neither\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain.\nQuestion: The Santa Cova Funicular is a very unpopular railway True, False, or Neither? Neither\n###\nLexington County is a county located in the U.S. state of South Carolina. As of the 2010 census, the population was 262,391, and the 2016 population estimate was 286,186. Its county seat and largest town is Lexington. The county was created in 1785. Its name commemorates the Battle of Lexington in the American Revolutionary War.\nQuestion: Lexington County's population grew between 2010 and 2016. True, False, or Neither?", "doc_id": 438, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14296, 35768, 18261, 35656], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Julia Goldani Telles (born March 18, 1995) is an American actress and ballet dancer. She is best known for her supporting role as Whitney Solloway on the Showtime original series \"The Affair\" and as Sasha Torres on the short-lived ABC Family series \"Bunheads\".\nQuestion: Julia Goldani Telles was not in the \"Bunheads\" for a long time. True, False, or Neither? True\n###\nLeventhorpe Academy is a mixed, 11-19 secondary school and sixth form in the historic market town of Sawbridgeworth, Hertfordshire. The school became a business and Enterprise Academy in August 2011. The intake at age 11 is drawn mainly from the pleasant and prosperous towns of Sawbridgeworth and Bishop's Stortford and from the surrounding villages.\nQuestion: Leventhorpe Academy is an 11-19 secondary school. True, False, or Neither? True\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. 
The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison.\nQuestion: Alix Bancourt doesn't have nicknames. True, False, or Neither? False\n###\nAnime Speed is a megamix compilation album of \"Dancemania\"'s \"Speed\" series, released by EMI Music Japan in 2005. The album features uptempo cover remixes of popular theme songs for various anime works such as \"Dragon Ball Z\", \"Slam Dunk\" and \"Neon Genesis Evangelion\". The successor, \"Anime Speed Newtype Edition\", was released in 2006.\nQuestion: Anime Speed and Anime Speed Newtype Edition are the only two albums to have featured anime music in 2005 and 2006. True, False, or Neither? Neither\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart.\nQuestion: Migos released a song on the eve of Halloween True, False, or Neither?", "doc_id": 763, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24130, 11680, 3279, 5838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Recently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct.\nQuestion: Zero mammals have gone extinct since the 1500s True, False, or Neither? False\n###\nCarl Filip Anton Forsberg (] ; born 13 August 1994) is a Swedish professional ice hockey player. He is an alternate captain for the Nashville Predators of the National Hockey League (NHL). Forsberg was selected by the Washington Capitals in the first round (11th overall) of the 2012 NHL Entry Draft.\nQuestion: Filip Forsberg is a Swedish professional ice hockey player that has been the best Swede player in the NHL. True, False, or Neither? Neither\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician.\nQuestion: Thal\u00eda received much praise for her fifth studio album. True, False, or Neither? True\n###\nThe History Boys is a 2006 British comedy-drama film adapted by Alan Bennett from his play of the same name, which won the 2005 Olivier Award for Best New Play and the 2006 Tony Award for Best Play. It was directed by Nicholas Hytner, who directed the original production at the Royal National Theatre in London, and features the original cast of the play.\nQuestion: The History Boys in a 2006 American comedy-drama. True, False, or Neither? 
False\n###\nUnlike a charitable foundation, a private foundation does not generally solicit funds from the public. And a private foundation does not have the legal requirements and reporting responsibilities of a registered, non-profit or charitable foundation. Not all foundations engage in philanthropy: some private foundations are used for estate planning purposes.\nQuestion: Foundations follow legal requirements. True, False, or Neither?", "doc_id": 444, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [34022, 22917, 4206, 22425], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis.\nQuestion: The garden was tributed in the memory of a botanist. True, False, or Neither? True\n###\n54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute.\nQuestion: The group writes disco songs True, False, or Neither? False\n###\nClub Deportivo D\u00e9nia is a Spanish football team based in D\u00e9nia, in the autonomous community of Valencia. Founded in 1927 it plays in Divisiones Regionales de F\u00fatbol in the Valencian Community, holding home games at \"Estadio Diego Mena Cuesta\", with a capacity of 3.000 seats.\nQuestion: The club was founded in Spain. True, False, or Neither? True\n###\nHundreds of ancient stone religious monuments lie on the island of Java. Known as \"candi\" in Indonesian, they date from the early classical period of Javanese civilisation, beginning in the first part of the 8th century CE and ending after 900 CE. The majority were built between 780 CE and 860 CE, even though the civilisation that created them existed for many centuries.\nQuestion: Hundreds of ancient stone religious monuments lie on the island of Java. There were too many of them. True, False, or Neither? Neither\n###\nMaastricht (] ; Limburgish : \"Mestreech\" ; French: \"Maestricht\" ; Spanish: \"Mastrique\" ) is a city and a municipality in the southeast of the Netherlands. It is the capital and largest city of the province of Limburg, as well as the largest city in the historical duchy of Limburg, that today spans the Netherlands and Belgium.\nQuestion: Maastricht is the capital city of the Netherlands. True, False, or Neither?", "doc_id": 134, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [31445, 43709, 15059, 16829], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Sunny Sundae Smile\" is a song by the alternative rock band My Bloody Valentine. It was released as a non-album single in February 1987 on Lazy Records. Recorded at Alaska Studios in London, \"Sunny Sundae Smile\" was the band's first release on Lazy Records and the final release to feature original vocalist David Conway.\nQuestion: The song Sunny Sundae Smile was not apart of an album True, False, or Neither? True\n###\nDobbs Ferry is a village in Westchester County, New York. The population was 11,093 at the 2016 census. The Village of Dobbs Ferry is located in, and is a part of, the town of Greenburgh. The village ZIP code is 10522. Most of the Village falls into the boundaries of the Dobbs Ferry Union Free School District.\nQuestion: Some of Dobbs Ferry is not in the boundaries of Dobbs Ferry Union Free School District True, False, or Neither? True\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress.\nQuestion: He has held the position of the Tibetan Youth Congress since September 2007. True, False, or Neither? True\n###\nOak Furniture Land is a privately owned British furniture retailer of fully assembled hardwood cabinetry furniture, sofas, beds and mattresses for bedrooms, living rooms, dining rooms, nurseries and small office/home offices. The company has 74 stores across the UK, and its headquarters in Swindon in Wiltshire, England.\nQuestion: Oak Furniture Land has 70 stores across the UK True, False, or Neither? False\n###\nRobert Cary Blanchard (November 5, 1968 \u2013 September 6, 2016) was an American football placekicker in the National Football League. He played eight years for five teams: the New York Jets for his first two years, the Indianapolis Colts after taking 1994 off, the Washington Redskins in 1998, the New York Giants in 1999, and the Arizona Cardinals in his final season.\nQuestion: Blanchard made field goals to multiple teams True, False, or Neither?", "doc_id": 667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [42666, 20501, 35409, 730], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Inferno (also released with the title, Operation Cobra) is a 1997 feature film directed by Fred Olen Ray starring Don Wilson, Deepti Bhatnagar and R. Madhavan. Evan Lurie, Michael Cavanaugh and Tan\u00e9 McClure appear in other pivotal roles. Wilson plays the role of Interpol agent Kyle Connors on a mission set in India.\nQuestion: The setting of Inferno takes place in India True, False, or Neither? 
True\n###\nGeorge White's Scandals is a 1934 American musical film directed by George White and written by Jack Yellen. The film stars Rudy Vall\u00e9e, Jimmy Durante, Alice Faye, Adrienne Ames, Gregory Ratoff, Cliff Edwards and Dixie Dunbar. The film was released on March 16, 1934, by Fox Film Corporation.\nQuestion: George White is a slow man True, False, or Neither? Neither\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life.\nQuestion: The WHO is a wealthy organization True, False, or Neither? Neither\n###\nThe Emperor: Owner of the Mask () is a South Korean television series starring Yoo Seung-ho, Kim So-hyun, Kim Myung-soo, Yoon So-hee, Heo Joon-ho and Park Chul-min. It aired on MBC every Wednesday and Thursday at 22:00 (KST) from May 10, 2017 for 40 episodes.\nQuestion: The Emperor: Owner of the Mask stars went on to other projects after the series was over. True, False, or Neither? Neither\n###\nYahy\u0101 ibn Kh\u0101lid (Arabic: \u064a\u062d\u064a\u0649 \u0628\u0646 \u062e\u0627\u0644\u062f\u200e \u200e ) (died 806\u00a0CE ) was a member of the powerful Persian Barmakids family, son of Khalid ibn Barmak. Around 765, he was appointed to Azerbaijan by the Caliph Al-Mansur. Yahya's son Fadl ibn Yahya was born at Ar-Reiy, at the same time as Caliph al-Mahdi's son Harun. Al-Mahdi entrusted Yahya in 778 with Harun's education.\nQuestion: Khalid was born in 730 CE. True, False, or Neither?", "doc_id": 338, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [16516, 185, 5412, 18337], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sky Television plc was a public limited company which operated a nine-channel satellite television service, launched by Rupert Murdoch's News International on 5 February 1989. Sky Television and its rival British Satellite Broadcasting suffered large financial losses and merged on 2 November 1990 to form British Sky Broadcasting (BSkyB). Programming merger took effect on 1 December 1990.\nQuestion: Sky Television was seen by Obama. True, False, or Neither? Neither\n###\nThe Little League World Series took place between August 22 and August 27 in Williamsport, Pennsylvania. Westbury American Little League of Houston, Texas defeated American Little League of West New York, New Jersey in the championship game of the 20th Little League World Series.\nQuestion: THe Little League World series takes place end of August. True, False, or Neither? True\n###\nSing A to Z is the tenth album by popular children's entertainers Sharon, Lois & Bram, originally released in 1990. This album, like many other Sharon, Lois & Bram albums has been re-released many times. 
It is rumored that the idea for this album came from Lois when she and Sharon were window shopping and came across an alphabet quilt on display.\nQuestion: Sing A to Z was released by Metallica True, False, or Neither? False\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th).\nQuestion: Alice Sue Claeys enjoys skiing True, False, or Neither? Neither\n###\nLeavitt Peak is located in the Emigrant Wilderness near Sonora Pass in the eastern Sierra Nevada range of California. Leavitt Peak is located on the Tuolumne County - Mono County line. The Pacific Crest Trail runs close to the east of Leavitt Peak, at an elevation of about 10800 ft elevation. The peak offers views south to Yosemite National Park and north towards South Lake Tahoe.\nQuestion: Leavitt Peak is not covered by vegetation True, False, or Neither?", "doc_id": 391, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [41492, 36632, 18423, 44194], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "International Cycling Classic, also known as the Point Premium Root Beer or simply SuperWeek, was a 17-race series over 17 days open to licensed amateur and professional cyclists. The series took place primarily in the area surrounding Milwaukee, Wisconsin.\nQuestion: All 17 days had one race take place. True, False, or Neither? True\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts.\nQuestion: In 1996 David Lee Murphy had 2 tracks that landed on the Top 5 hits in the US Billboard. True, False, or Neither? True\n###\nZale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990).\nQuestion: Zale Dalen is a Canadian television director True, False, or Neither? True\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521.\nQuestion: At the 2010 Census, Port Melbourne had a population of 14,521.\n True, False, or Neither? 
Neither\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race.\nQuestion: Tom\u00e1s Nistal Fern\u00e1ndez has won races. True, False, or Neither?", "doc_id": 543, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14292, 28059, 33778, 25156], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sound and the Fury is an American drama film directed by James Franco. It is the second film version of the novel of the same name by William Faulkner. The previous adaptation, directed by Martin Ritt, was released in 1959. The film was released in a limited release and through video on demand on October 23, 2015, by New Films International.\nQuestion: The film was available worldwide. True, False, or Neither? Neither\n###\nKnightriders, also known as George A. Romero's Knightriders, is a 1981 American drama film written and directed by George A. Romero and starring Ed Harris, Gary Lahti, Tom Savini, Amy Ingersoll, Patricia Tallman and Ken Foree. It was filmed entirely on location in the Pittsburgh metro area, with major scenes in suburban Fawn Township and Natrona.\nQuestion: Knightriders was filmed on location in Pittsburgh metro area and debuted in 1981. True, False, or Neither? True\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys.\nQuestion: Mark Morriss, Scott's older brother, was born before the year of nineteen hundred and seventy three. True, False, or Neither? True\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961.\nQuestion: The Cuban Embassy held a staff of 20 during its operations. True, False, or Neither? Neither\n###\nClearance Giddens is an African American Elvis impersonator from Melfa, Virginia, who has been billed as the \"Black Elvis\". He has appeared on the \"The Arsenio Hall Show\" and the \"Geraldo Show\", and in the film \"Honeymoon in Vegas\". In the early 1990s, he also sang on stage in a duet with Jimmy Buffett singing \"Jailhouse Rock\". He is listed in the book \"I Am Elvis: A Guide to Elvis Impersonators\".\nQuestion: Clearance Giddens has a q. 
True, False, or Neither?", "doc_id": 762, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [24192, 35558, 1578, 25869], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978.\nQuestion: Gary Sutherland has 2 sons who play professional sports True, False, or Neither? Neither\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer.\nQuestion: The group broke up when he left to go solo True, False, or Neither? Neither\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe.\nQuestion: The featured actor was not australian True, False, or Neither? False\n###\nThe 2015\u201316 Dartmouth Big Green men's basketball team represented Dartmouth College during the 2015\u201316 NCAA Division I men's basketball season. The Big Green, led by sixth-year head coach Paul Cormier, played their home games at Leede Arena in Hanover, New Hampshire and were members of the Ivy League. The Big Green finished the season 10\u201318, 4\u201310 in Ivy League play to finish in sixth place.\nQuestion: The players of The Big Green had terrible grades in college True, False, or Neither? Neither\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX.\nQuestion: Cartoon Network first started in nineteen hundred eighty five. True, False, or Neither?", "doc_id": 70, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [7314, 531, 18185, 23787], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds.\nQuestion: The Phu Quoc Ridgeback is a popular dog in Vietnam True, False, or Neither? Neither\n###\nThe 1902\u201303 Ottawa Hockey Club season was the club's 18th season of play. The club would win the CAHL championship in a playoff with the Montreal Victorias to win the Club's first Stanley Cup. For their win, the players would each be given a silver nugget. From that day forward, the club was nicknamed the \"Silver Seven.\"\nQuestion: Winners of the Stanley Cup in in 1903 were give a nickname related to a metal. True, False, or Neither? True\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign.\nQuestion: The 23rd Infantry Brigade was the only infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II True, False, or Neither? Neither\n###\nThe Original Rude Girl is the second studio album by Puerto Rican reggaeton recording artist Ivy Queen released on December 15, 1998 by Sony Discos. It is the follow up studio album to Queen's debut effort \"En Mi Imperio\" released in 1997. The album includes Queen's debut single \"In The Zone\" featuring Wyclef Jean, which helped to increase the album and Queen's exposure to American audiences.\nQuestion: Wyclef Jean will perform \"In The Zone\" with Ivy Queen in December of 2019. True, False, or Neither? Neither\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\".\nQuestion: Vishwasrao's first movie was in MArathi cinema True, False, or Neither?", "doc_id": 644, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "False", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [33427, 32346, 7793, 1092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. 
The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\".\nQuestion: Friendship translated to Turkish means Kulubu True, False, or Neither? False\n###\nUtamaro and His Five Women or Five Women Around Utamaro (Japanese: \u6b4c\u9ebf\u3092\u3081\u3050\u308b\u4e94\u4eba\u306e\u5973 , Hepburn: Utamaro o meguru gonin no onna ) is a 1946 Japanese film directed by Kenji Mizoguchi. It is based on the novel of the same title by Kanji Kunieda, itself a fictionalized account of the life of printmaker Kitagawa Utamaro. It was Mizoguchi's first film made under the American occupation.\nQuestion: Kanji Kunieda novel was released in 1940 True, False, or Neither? Neither\n###\nGloria Stavers (October 3, 1927 \u2013 April 1, 1983) was the editor in chief of \"16 Magazine\". Her personality gave this teen celebrity magazine its stamp for many years. Stavers is credited with being one of the first women rock and roll journalists, but male editors, detractors and those who scoffed at teen or celebrity magazines sometimes called her \"Mother Superior of the Inferior\".\nQuestion: Gloria Stavers had a bad time at work due to discrimination True, False, or Neither? Neither\n###\nNorth High Bridge Park is a 0.85 acre city park located on the east bank bluffs above the Mississippi River in Saint Paul, Minnesota, United States. The park is adjacent to the High Bridge and was created when the new High Bridge was finished in 1987. The park includes gardens, sculptures and an overlook of the Mississippi River.\nQuestion: North High Bridge Park was planned before the High Bridge was completed. True, False, or Neither? Neither\n###\nVincent Edward \"Bo\" Jackson (born November 30, 1962) is a former baseball and American football player. He is one of the few athletes to be named an All-Star in two major sports, and the only one to do so in both baseball and football. He is widely considered one of the greatest athletes of all time.\nQuestion: Many professional sports players have been named All-Star in separate sports. True, False, or Neither?", "doc_id": 594, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [38616, 34468, 22319, 14944], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Donaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville.\nQuestion: Donaldson Center Airport has an area available for private planes. True, False, or Neither? Neither\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). 
It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position.\nQuestion: Emily Kasparov was not the first woman elected to chair the CTA. True, False, or Neither? False\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later.\nQuestion: The man had serious mental problems. True, False, or Neither? Neither\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress.\nQuestion: Tsewang Rigzin held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013 True, False, or Neither? True\n###\nThe Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It is a concept album with a storyline telling the tale of a young couple who go to watch a puppet show in Budapest in the 1700s, and end up being turned into undead puppets by the Puppet Master and his wife.\nQuestion: King Diamond has never been to the circus. True, False, or Neither?", "doc_id": 654, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [27119, 25883, 13751, 36050], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The X-Files Game is an interactive movie point-and-click adventure video game developed by HyperBole Studios and first published by Fox Interactive. The game was released for Microsoft Windows, Mac OS and PlayStation in 1998, and is based on the television series \"The X-Files\". A second, but unrelated game, \"\", was released for PlayStation 2 in 2004.\nQuestion: A remake of both games will be produced soon. True, False, or Neither? Neither\n###\nWonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character.\nQuestion: Jeffrey Reiner's Wonder Woman was the first attempt at transitioning a DC comic to TV. True, False, or Neither? Neither\n###\nEldrid Nordb\u00f8 (born 12 August 1942) is a Norwegian politician for the Labour Party. She was personal secretary to the Minister of Social Affairs in 1971, state secretary to the prime minister (1986-89), and Minister of Trade and Shipping (1990-91). She is married to economist and politician Bj\u00f8rn Skogstad Aamo.\nQuestion: Eldrid Nordb\u00f8 was personal secretary to Bj\u00f8rn Skogstad Aamo. True, False, or Neither? 
False\n###\nHenry Pelham Fiennes Pelham-Clinton, 4th Duke of Newcastle-under-Lyne {'1': \", '2': \", '3': \", '4': \"} (31 January 1785 \u2013 12 January 1851) was a British nobleman and politician who played a leading part in British politics in the late 1820s and early 1830s. He was styled Lord Clinton from birth until 1794 and Earl of Lincoln between 1794 and 1795.\nQuestion: Henry Pelham Fiennes Pelham-Clinton died alone True, False, or Neither? Neither\n###\nValan is a small coastal village on the island of Mager\u00f8ya in Nordkapp Municipality in Finnmark county in far northern Norway. Honningsv\u00e5g Airport, the local airport for the town of Honningsv\u00e5g is located in Valan. The town lies a few kilometres south of Valan.\nQuestion: Valan is inaccessible by road True, False, or Neither?", "doc_id": 380, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [28253, 18783, 14610, 13398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In The Wet is a novel by Nevil Shute that was first published in the United Kingdom in 1953. It contains many of the typical elements of a hearty and adventurous Shute yarn such as flying, the future, mystic states, and ordinary people doing extraordinary things.\nQuestion: The Wet is about deep sea diving. True, False, or Neither? False\n###\nJohn Wellborn Root (January 10, 1850 \u2013 January 15, 1891) was an American architect who was based in Chicago with Daniel Burnham. He was one of the founders of the Chicago School style. Two of his buildings have been designated a National Historic Landmark; others have been designated Chicago landmarks and listed on the National Register of Historic Places. In 1958, he received the AIA Gold Medal.\nQuestion: John Wellborn Root was 27when he died. True, False, or Neither? False\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley.\nQuestion: The film was in English. True, False, or Neither? Neither\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award.\nQuestion: The author of The Path Between the Seas was an American historian True, False, or Neither? True\n###\nWCBC is an AM radio station that serves the greater area of Cumberland, Maryland. Founded in April, 1976, WCBC provides news coverage: locally, regionally, and nationally; weather forecasts; participation in major community events to promote the area and its organizations by way of remote broadcasts and community service announcements.\nQuestion: The radio station has many listeners. 
True, False, or Neither?", "doc_id": 180, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "False", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [26089, 32315, 17773, 904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event.\nQuestion: The 1997 Porsche Tennis Grand Prix took place in 1980 True, False, or Neither? False\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit.\nQuestion: Bridge Mountain is located in the United States. True, False, or Neither? True\n###\nAsana ( ) is a web and mobile application designed to help teams track their work. It was founded in 2008 by Facebook co-founder Dustin Moskovitz and ex-engineer Justin Rosenstein, who both worked on improving the productivity of employees at Facebook.\nQuestion: Asana was built in 2016. True, False, or Neither? False\n###\nWaking Up is Hard to Do is the second studio album by the American indie rock band Giant Drag, released on March 5, 2013 on Full Psycho Records, the band's own label. It is the band's first full-length release of original material since \"Hearts and Unicorns\" (2005) and was released as a digital download on Giant Drag's Bandcamp music store.\nQuestion: Giant Drag released 2 albums in 2013. True, False, or Neither? Neither\n###\nDavid Halberstam (April 10, 1934 \u2013 April 23, 2007) was an American journalist and historian, known for his work on the Vietnam War, politics, history, the Civil Rights Movement, business, media, American culture, and later, sports journalism. He won a Pulitzer Prize for International Reporting in 1964. In 2007, while doing research for a book, Halberstam was killed in a car crash.\nQuestion: The reporter was popular for his pieces on history True, False, or Neither?", "doc_id": 503, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [22662, 8957, 13474, 13497], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sherbino Mesa Wind Farm is located in Pecos County in west Texas. 
The first 150 megawatts (MW) of the project, which has a potential capacity of 750 MW, is in operation. Phase I utilizes 50 Vestas V-90 Mk.5 wind turbine generators, each with a rated capacity of 3 MW. BP will operate phase I of the project.\nQuestion: The Sherbino Mesa Wind Farm is located in a rural part of Texas True, False, or Neither? Neither\n###\nThe Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis.\nQuestion: The Copenhagen Consensus Center is a very effective conference. True, False, or Neither? Neither\n###\nGlobacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide.\nQuestion: GLO has had over a thousand employees. True, False, or Neither? True\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6.\nQuestion: Nydala Abbey was recobstructed in the 16th century. True, False, or Neither? False\n###\n\"Up All Night\" is an American television sitcom created by Emily Spivey that airs on NBC. The series stars Christina Applegate and Will Arnett as Regan and Chris Brinkley, a couple who struggle to balance their home lives (especially with their newborn child, Amy) and their work lives.\nQuestion: \"Up All Night\" is a sitcom from a country that was previously a British colony. True, False, or Neither?", "doc_id": 129, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "Neither", "target": "Neither", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [14646, 16233, 35206, 45322], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Virginia's Eleventh Congressional District is a U.S. congressional district in the Commonwealth of Virginia. The district stretches from Herndon to Quantico, comprising most of Fairfax County, all of the city of Fairfax, and part of eastern Prince William County. The residents of the 11th district are represented by Democrat Gerry Connolly.\nQuestion: All democrats in Virginia's Eleventh Congressional District voted for Gerry Connolly True, False, or Neither? Neither\n###\nRoss Dawson (born 1962) is an Australian author, futurist, entrepreneur and former stockbroker. 
Best known for his 2002 book 'Living Networks', Dawson founded the futures think tank Future Exploration Network and consults on digital futures to various big organisations such as Ernst & Young, Macquarie Bank, Microsoft and News Corp.\nQuestion: Ross Dawson ends with a N. True, False, or Neither? True\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke.\nQuestion: Elizabeth Sarnoff likes to be called Beth. True, False, or Neither? Neither\n###\nIn field hockey, a penalty stroke, sometimes known as a penalty flick, is the most severe penalty given. It is predominantly awarded when a foul has prevented a certain goal from being scored or for a deliberate infringement by a defender in the penalty circle.\nQuestion: There is a penalty box in field hockey. True, False, or Neither? False\n###\nIn ancient Roman religion, Antevorta was a goddess of the future, also known as Porrima. She and her sister Postverta (or Postvorta) were described as companions or siblings of the goddess Carmenta, sometimes referred to as \"the Carmentae\". They may have originally been two aspects of Carmenta, namely those of her knowledge of the future and the past (compare the two-faced Janus).\nQuestion: Antevorta and Postverta were sibling rivals. True, False, or Neither?", "doc_id": 755, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} +{"pred": "True", "target": "True", "answer_choices_list": ["True", "Neither", "False"], "fewshot_idx": [18475, 42371, 8995, 1998], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Conoclinium coelestinum, the blue mistflower, is a North American species of herbaceous perennial flowering plant in the sunflower family. It was formerly classified in the genus \"Eupatorium\", but phylogenetic analyses in the late 20th century research indicated that that genus should be split, and the species was reclassified in \"Conoclinium\".\nQuestion: Conoclinium coelestirum is a blue mistflower of North American species of hebaceous perennial flowering plant in the Daisy flower family. True, False, or Neither? False\n###\nMy Cat Is An Alien (MCIAA) is the name of the Italian musical duo and outsider audiovisual artists consisting of brothers Maurizio and Roberto Opalio, formed in Torino, Italy, in late 1997. They release avant garde /experimental music in a peculiar form of improvisation that MCIAA themselves define 'instantaneous composition'.\nQuestion: The brother worked on another musical after My Cat is an alien. True, False, or Neither? Neither\n###\nMars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". 
During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added.\nQuestion: Stereolab has released at least four albums. True, False, or Neither? Neither\n###\nSir Christopher Edward Wollaston MacKenzie Geidt {'1': \", '2': \", '3': \", '4': \"} (born 17 August 1961) was the private secretary to Queen Elizabeth II from September 2007 to 2017. As of July 2016, Geidt also serves as the Chairman of the Council of King's College London, succeeding the Duke of Wellington.\nQuestion: Christopher Edward Wollaston MacKenzie Geidt was born in the late summer. True, False, or Neither? True\n###\nThe Tampere Floral Festival is an annual summer festival held in Tampere, Southern Finland in July and/or August. During the festival the centre of the city is decorated by flower arrangements and about 150 events such as concerts, parades, a \"wine village\" and a children's day take place. The festival lasts approximately one week and attracts thousands of local residents and visitors.\nQuestion: Tampere, Finland is a festive place in late summer. True, False, or Neither?", "doc_id": 954, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "GPT-3 style", "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": ""} diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e0b84f31a5eb3eab3280ac4bd32f009bdbfad10 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8655412b9a42f8604bc975b2360d64f3b46a6129e38e9e556f686cdd8572250f +size 1203513 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62ab76f48dccd1ca611483f797163f67072518b8 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b40ef4d20978476b0cc2dec921e49c6d15acf40597e5af09d6b176820479528 +size 1749990 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d115460441232a8ceb68243f1983b23f324983fe --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f5c743acbe16e90c70ca0c1b83470dabffda10fdcc9e5bf6748d85db0a0878 +size 2294770 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8da3a385d48248a695133d25c98e21149e0b194b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9a9a526da553f27ec3c67e6f4c4fe00c954f722c389dbde248206689c8cf9972 +size 2835642 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b41b20c33ae68f7702ac8ac50c955ea813f0c513 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4841373ed0b1e0560bc7bb98f5ac6d1b5038f0061f45307dbe89aa12b48c4770 +size 1008501 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65481607ae65bd2be5f968caeb8ba036401d17e1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c9ed23378e5491f1cd7f9f4f5d759e0614ad1a89c9496f026685d37cd1a996 +size 1474062 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e7b1c5c9eb6a0dc4035cb6bf9c4cd7f27a12c73 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d2678dbb914723224e1648d2faef76e1a07cb7823a7812b02b2fe28ff44c84e +size 1939689 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d116f6c0d02e548f064af6fd25ddb0bd43a0ea1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd3edb1537a32f56ca28fc697f065f727f4db00b8c3a3dce7e24733fc2b429c +size 2401855 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d8cc2c7755d2e83375ad39cc31bd0faa7689657 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c8d0a8a7ce42dbac7a3366b7a3bcbc6444698682a043d26aa13de0b272215b +size 1161484 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02cc4d8e6a3692f51dae79a6e440318d6ddd35a6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3737da7224265c2db5b7bfa5c03a1719c6f94ad2d45312502d70995200e9386f +size 1663799 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..134718861e2dadb4e269a24db01a3b268dd0524e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8d0fefb879606851e82e21ba362940d4279fe94a20ba63f95871e3bb092238 +size 2166580 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe158ee705f652fa8bfdf2d9851a9b2703b1998a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a90edc549717adadf88ce3d6aac4236ab6e7245600d54c367b02587baed3ce9 +size 2665533 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88348ed921a555d5dfb833341c57df3ae08ea651 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79442da112d35f70caeeda74a9311a6ffad4b91302a0a6098ab18bf85f648fb7 +size 3164459 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..064aae01d30e90d5065de2b686fc47177b2867ca --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00e954323fd698b541664d78359d8f2f1b141174a9ff9149a715f0960571141e +size 1027175 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f76030cbb08672be53953a8c9c059cfca97c118 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004f5c9e1e0935694e9926b332ed98409da62d6ee1050d3073d13fab2964179f +size 1499031 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebb5ac0f02e09d0a288cc5bc2704377cdc1ee4f6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb01b7f287a3300ef75984ef53bcd347a57a0676d3c07922e92eb7de1fb1bdf +size 1970734 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..c0f09844f28b1d2249da9263934a1ba69c3a213c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f1f84cfde5b2a4198b9f33939cc7f684367b4ad9a92716dcfcec2e7ad1a590 +size 2438717 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d029d00a60a61a78738d66216fc89327be58af53 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,1000 @@ +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36011, 21367, 14233, 4569], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway. Are we justified in saying that \"It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1932\"? Yes, no, or maybe? No\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017. Are we justified in saying that \"Justin Tinucci plays a professional indoor skydiver in iCarly\"? Yes, no, or maybe? Maybe\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg. Are we justified in saying that \"Steven Spielberg directed in total 75 movies following the release of Duel.\"? Yes, no, or maybe? Maybe\n###\nShehzad Sheikh or Shahzad Sheikh is a Pakistani film and television actor and model, known for playing the lead role in the 2015 film \"Karachi Se Lahore\". He also starred in the series \"Annie Ki Ayegi Baraat\", \"Mi Raqsam\", and \"Mere Hamrahi\", and a TV film \"Main Kukkoo Aur woh\". He is the son of well-known actor Javed Sheikh. Are we justified in saying that \"Shahzad Sheikh is an Indian film actor\"? Yes, no, or maybe? No\n###\nThe Benetton B188 was a Formula One racing car designed by Rory Byrne and raced by Benetton team in the 1988 Formula One season and in the first half of the 1989 Formula One season. Dating back to when the team started as Toleman in , the B188 was the first car produced by the team not to be powered by a turbocharged engine. Are we justified in saying that \"The Benetton team was in charge of driving The Benetton B188.\"? 
Yes, no, or maybe?", "doc_id": 314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42940, 37308, 36577, 22843], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "American Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company. Are we justified in saying that \"The Hudson Motor Car Company ceased to exist in 1954\"? Yes, no, or maybe? Maybe\n###\nTaina is an American sitcom that aired on Nickelodeon and distributed by Nelvana Limited. It was one of the last live-action comedy shows taped at Nickelodeon Studios but later moved to the Nickelodeon on Sunset in Hollywood, for its second season. The show aired from January 14, 2001 to May 11, 2002. Are we justified in saying that \"Taina aired in 5 different countries, including America\"? Yes, no, or maybe? Maybe\n###\nLausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic. Are we justified in saying that \"Lausche is not the tallest peak of the Lusatian Mountains.\"? Yes, no, or maybe? No\n###\nNeilson Hubbard is an American singer-songwriter, musician and producer. His first band was called This Living Hand formed with Clay Jones. They signed to Adam Duritz's label, E Pluribus Unum. After the band split up, Hubbard went on to record three solo albums, \"The Slide Project\", \"Why Men Fail\" and \"Sing Into Me\". He also collaborated with Matthew Ryan to form the band Strays Don't Sleep. Are we justified in saying that \"Neilson Hubbard is known for his work in america\"? Yes, no, or maybe? Yes\n###\nFC Saturn-1991 Saint Petersburg (Russian: \u0424\u041a \u00ab\u0421\u0430\u0442\u0443\u0440\u043d\u20111991\u00bb \u0421\u0430\u043d\u043a\u0442\u2011\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433 ) was a Russian football team from Saint Petersburg. It played professionally from 1992 to 1995, including 3 seasons (1993\u20131995) in the second-highest Russian First Division. In 1996 it merged with FC Lokomotiv Saint Petersburg. Before 1995 it was called FC Smena-Saturn Saint Petersburg. Are we justified in saying that \"FC Saturn-1991 Saint Petersburg merged with FC Lokomotiv Saint Petersburg\"? Yes, no, or maybe?", "doc_id": 767, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13152, 16774, 32068, 1553], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2008 Emerald Bowl, part of the 2008-09 NCAA football bowl games season, was played on December 27, 2008, at AT&T Park, the home field of the Giants in San Francisco, California. The Miami Hurricanes of the ACC were matched against the California Golden Bears (based in nearby Berkeley, California) of the Pac-10, the first appearance by either team in the seven-year history of the Emerald Bowl. Are we justified in saying that \"The 2008 Emerald Bowl was played after christmas\"? Yes, no, or maybe? Yes\n###\nKinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California. Are we justified in saying that \"Kinsey Millhone is a real person\"? Yes, no, or maybe? No\n###\nWonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character. Are we justified in saying that \"Jeffery Reiner would've directed more episodes.\"? Yes, no, or maybe? Maybe\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe. Are we justified in saying that \"The show was created for network 11\"? Yes, no, or maybe? No\n###\nThe Perfect Gift is a 2009 spinoff of the 2005 Christian drama movie \"The Perfect Stranger\", and its first sequel, \"Another Perfect Stranger\". It stars Christina Fougnie, Amy Hess, Matt Wallace, and Jefferson Moore once again as Jesus Christ. It was filmed almost entirely in Kentucky, where the first two movies in the series were not. Are we justified in saying that \"The Perfect Stranger was filmed in Missouri.\"? Yes, no, or maybe?", "doc_id": 539, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9857, 40833, 57, 18621], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1924\u201325 WPI Engineers men's basketball team represented Worcester Polytechnic Institute during the 1924\u201325 NCAA men's basketball season. They were coached by Ivan Bigler. The Engineers played their home games at Alumni Gym in Worcester, Massachusetts. The team finished the season with 5 wins and 9 losses. Are we justified in saying that \"Ivan Bigler led the team to more losses than wins in 1924-25.\"? Yes, no, or maybe? 
Yes\n###\nSalli Elise Richardson (born November 23, 1967) is an American television and film actress and director. Richardson is known for her role as Angela on the 1994 hit comedy/action film \"A Low Down Dirty Shame\" and for her role as Dr. Allison Blake on the Syfy comedy-drama series \"Eureka\" (2006\u20132012). Are we justified in saying that \"Salli Elise Richardson starred in the Syfy comedy-drama series \"Eureka\" (2006\u20132012)\"? Yes, no, or maybe? Yes\n###\nThirteen Ghosts (also known as 13 Ghosts and stylized as THIR13EN Ghosts) is a 2001 Canadian-American supernatural horror film directed by Steve Beck. It is a remake of the 1960 film \"13 Ghosts\" by William Castle. It follows the remake of another one of Castle's films, \"House on Haunted Hill\", and was shot entirely around Lower Mainland, British Columbia. Are we justified in saying that \"Thirteen Ghosts was filmed primarily in Canada\"? Yes, no, or maybe? Yes\n###\nThe interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH). Are we justified in saying that \"Jacques Vall\u00e9e is not proud of the interdimensional hypothesis.\"? Yes, no, or maybe? Maybe\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"The Plain is to the south of Marston Road\"? Yes, no, or maybe?", "doc_id": 43, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15210, 6214, 43848, 32357], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi. Are we justified in saying that \"An Indian crime film inspired by \"The Godfather\" was re-released 5 years after its original release date, but dubbed in Tamil and had screen re-shots with different actors. \"? Yes, no, or maybe? Yes\n###\nHow Green Was My Valley is a BBC Television serial based on the novel by Richard Llewellyn, and features one of the last performances by Stanley Baker. It was first shown in the UK from 29 December 1975 in six weekly parts, while producer Martin Lisemore also cast Si\u00e2n Phillips in his next production, \"I Claudius\" (1976). Are we justified in saying that \"Si\u00e2n Phillips is the writer of How Green Was My Valley\"? Yes, no, or maybe? 
No\n###\nUpper Grosvenor Street is a historic street in Mayfair, London, United Kingdom. It runs from the Grosvenor House Hotel off Park Lane to the Embassy of the United States off Grosvenor Square. The Embassy of Monaco is located at number 7. Odey Asset Management, a hedge fund run by Crispin Odey, is located at number 12. Are we justified in saying that \"Grosvenor is not just the name of a street.\"? Yes, no, or maybe? Yes\n###\nSouthpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company. Are we justified in saying that \"When it was released in 2015, the movie Southpaw was expected to be the top grossing movie in the box office.\"? Yes, no, or maybe? Maybe\n###\nThe Tragedy of Julius Caesar is a tragedy by William Shakespeare, believed to have been written in 1599. It is one of several plays written by Shakespeare based on true events from Roman history, which also include \"Coriolanus\" and \"Antony and Cleopatra\". Are we justified in saying that \"The Tragedy of William Shakespeare is a tragedy by Julius Caesar, believed to have been written in 1599. \"? Yes, no, or maybe?", "doc_id": 558, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24442, 12585, 45164, 21773], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s. Are we justified in saying that \" Fred Wesley and the Horny Horns produced albums until the year 2000.\"? Yes, no, or maybe? Maybe\n###\nMenelik Watson (born December 22, 1988) is an English professional American football offensive tackle for the Denver Broncos of the National Football League (NFL). He was drafted by the Oakland Raiders in the second round of the 2013 NFL Draft. He played college football at Florida State. Are we justified in saying that \"Watson never completed his degree after being drafter into the NFL.\"? Yes, no, or maybe? Maybe\n###\nSt Kilda is an inner suburb (neighbourhood) of the metropolitan area of Melbourne, Victoria, Australia, 6 km south-east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, St Kilda had a population of 17,795. Are we justified in saying that \"According to the 2011 census, St Kilda has more than 20000 living there\"? Yes, no, or maybe? No\n###\nThe Original Rude Girl is the second studio album by Puerto Rican reggaeton recording artist Ivy Queen released on December 15, 1998 by Sony Discos. 
It is the follow up studio album to Queen's debut effort \"En Mi Imperio\" released in 1997. The album includes Queen's debut single \"In The Zone\" featuring Wyclef Jean, which helped to increase the album and Queen's exposure to American audiences. Are we justified in saying that \"The Lord of the Rings: The Fellowship of the Ring starts with a T.\"? Yes, no, or maybe? Yes\n###\n\"679\" is the second single by American rapper Fetty Wap from his self-titled debut album. The song features Remy Boyz member Monty and former Remy Boyz member P-Dice. \"679\" peaked at number 4 on the US \"Billboard\" Hot 100, becoming his second highest-charting single after \"Trap Queen\". The album version of the song omits P-Dice's verse, only featuring Monty. Are we justified in saying that \"679 is an area code.\"? Yes, no, or maybe?", "doc_id": 709, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32110, 19673, 16595, 35990], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County. Are we justified in saying that \"The population in year 2010 was 5 digits\"? Yes, no, or maybe? Yes\n###\nRecently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct. Are we justified in saying that \"At least 79 species have become extinct.\"? Yes, no, or maybe? Yes\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978. Are we justified in saying that \"Gary Sutherland played first base several times\"? Yes, no, or maybe? Maybe\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries. Are we justified in saying that \"The 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to me.\"? Yes, no, or maybe? No\n###\nWhite Fang 2: Myth of the White Wolf is a 1994 American Northern adventure film directed by Ken Olin. A sequel to the 1991 \"White Fang\", it stars Scott Bairstow, Alfred Molina, and Geoffrey Lewis. Filming took place in Aspen, Colorado and Vancouver, British Columbia. Walt Disney Home Video released this movie on VHS October 19, 1994. 
Are we justified in saying that \"White Fang 2: Myth of the White Wolf is a short film\"? Yes, no, or maybe?", "doc_id": 737, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38872, 39899, 25980, 28445], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Hyundai Xcent is an A-segment sedan by Hyundai Motor Company. Based on the Hyundai Grand i10, the Xcent is manufactured by Hyundai Motor India Limited in Chennai. It made its debut on February 4, 2014, three days ahead of its world premiere at the Auto Expo 2014. Are we justified in saying that \"The Xcent was first test driven in 2012.\"? Yes, no, or maybe? Maybe\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"Parsons and White were successful with The Byrds after they left Nashville West.\"? Yes, no, or maybe? Maybe\n###\n\"The Bear and the Maiden Fair\" is the seventh episode of the third season of HBO's fantasy television series \"Game of Thrones\", and the 27th episode of the series overall. The episode was written by George R. R. Martin, the author of the \"A Song of Ice and Fire\" novels on which the series is based, and was directed by Michelle MacLaren, her directorial debut for the series. Are we justified in saying that \"The Bear and the Maiden Fair was written after the book\"? Yes, no, or maybe? Maybe\n###\nThe Merdeka Palace (Indonesian: Istana Merdeka ; also known in Indonesian as Istana Gambir and during colonial times as Paleis te Koningsplein), is one of six presidential palaces in Indonesia. It is located on the north side of the Merdeka Square in Central Jakarta, Indonesia and is used as the official residence of the President of the Republic of Indonesia. Are we justified in saying that \"The Merdeka Palace was remodeled in 2006.\"? Yes, no, or maybe? Maybe\n###\nThe Harlem Globetrotters Popcorn Machine was a Saturday morning variety show featuring players from the basketball team the Harlem Globetrotters singing, dancing, and performing comedy sketches. Broadcast in 1974, it was produced by Funhouse Productions for Viacom Productions. Are we justified in saying that \"The Harlem Globetrotters Popcorn Machine was a series of tubes\"? Yes, no, or maybe?", "doc_id": 67, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42578, 38878, 44318, 15933], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Year 493 BC was a year of the pre-Julian Roman calendar. At the time, it was known as the Year of the Consulship of Auruncus and Viscellinus (or, less frequently, year 261 \"Ab urbe condita\"). The denomination 493 BC for this year has been used since the early medieval period, when the Anno Domini calendar era became the prevalent method in Europe for naming years. Are we justified in saying that \"Year 493 BC was very recent.\"? Yes, no, or maybe? No\n###\nKdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity. Are we justified in saying that \"Kdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with headquarters in both Irvine (US) and Changsha City (China).\"? Yes, no, or maybe? No\n###\nTango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center. Are we justified in saying that \"Tango is a dance inspired by classical music\"? Yes, no, or maybe? Maybe\n###\nDavid Krakauer (born September 22, 1956) is an American clarinetist raised and based in New York, NY. He is known for his work in klezmer music as well as classical music and avant-garde improvisation. He is also considered an accomplished jazz player. Are we justified in saying that \"David Krakauer is the only musician in his family.\"? Yes, no, or maybe? Maybe\n###\nDavid Gibb (born 1 July 1990) is a children's musician and songwriter from Belper, Derbyshire. He was a finalist of the BBC Radio 2 Young Folk Award 2011, as well as winning the 'Highly Commended' prize at the Young Storyteller of the Year Awards the same year. In 2013, Gibb featured alongside musical collaborator Elly lucas in the advertising campaign for Gola trainers. Are we justified in saying that \"David Gibb ends with a B.\"? Yes, no, or maybe?", "doc_id": 298, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27333, 24496, 447, 29923], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Innyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169. Are we justified in saying that \"Innyaly its famous for its food. \"? Yes, no, or maybe? Maybe\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle. Are we justified in saying that \"Boon Brewery is from Belgium \"? Yes, no, or maybe? Yes\n###\n\"679\" is the second single by American rapper Fetty Wap from his self-titled debut album. The song features Remy Boyz member Monty and former Remy Boyz member P-Dice. \"679\" peaked at number 4 on the US \"Billboard\" Hot 100, becoming his second highest-charting single after \"Trap Queen\". The album version of the song omits P-Dice's verse, only featuring Monty. Are we justified in saying that \"679 was Fetty Wap's highest charting song.\"? Yes, no, or maybe? No\n###\nDwight Yoakam is an American country music singer-songwriter. Since his debut single, \"Honky Tonk Man\" in 1986, he has released 46 singles, including two \"Billboard\" Hot Country Songs number one singles, as well as 4 number ones in Canada. In addition to having two number one singles in the United States, Yoakam also has thirteen Top 10 singles on the country chart. Are we justified in saying that \"Dwight Yoakam has traveled to and played country music in every US state.\"? Yes, no, or maybe? Maybe\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997. Are we justified in saying that \"\"Day In, Day Out\" is by an American Band\"? Yes, no, or maybe?", "doc_id": 240, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7274, 34279, 2696, 24182], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2015 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the first edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Happy Valley, Australia between 3\u201311 January 2015. 
Are we justified in saying that \"The 2015 City of Onkaparinga ATP Challenger had a ton of subsequent tournaments after this one was played.\"? Yes, no, or maybe? Maybe\n###\nMatthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season. Are we justified in saying that \"Matthew Mansfield was born more than one billion seconds ago.\"? Yes, no, or maybe? No\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician. Are we justified in saying that \"Thalia has at least four albums. \"? Yes, no, or maybe? Yes\n###\nStand-In is a 1937 American comedy film directed by Tay Garnett and starring Leslie Howard, Joan Blondell, and Humphrey Bogart. The picture was produced by the independent Walter Wanger, and released by United Artists. It is set in Hollywood and parodies many aspects of the film industry during the Classical Era. Are we justified in saying that \"Howard and Blondell had many scenes together.\"? Yes, no, or maybe? Maybe\n###\nThe Blackpool Gazette is an English evening newspaper based in Blackpool, Lancashire. Published every day except Sunday, it covers the towns and communities of the Fylde coast. It was founded as \"The West Lancashire Evening Gazette\" in 1929 before being renamed the \"Evening Gazette\", and then \"Blackpool Gazette\". The paper's history dates back to a weekly publication founded in 1873. Are we justified in saying that \"the paper is distributed to multiple towns\"? Yes, no, or maybe?", "doc_id": 231, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8191, 8687, 2247, 6795], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Beyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John LaZar, Michael Blodgett and David Gurian. The film was directed by Russ Meyer and co-written by Meyer and Roger Ebert. Are we justified in saying that \"Beyond the Valley of the Dolls is a 1980 satirical melodrama\"? Yes, no, or maybe? No\n###\nThe Communaut\u00e9 de communes des Trois Rivi\u00e8res (before January 2017: \"Communaut\u00e9 de communes du Pays des Trois Rivi\u00e8res\") is a federation of municipalities (\"communaut\u00e9 de communes\") in the Aisne \"d\u00e9partement\" and in the Hauts-de-France \"region\" of France. Are we justified in saying that \"The Communaut\u00e9 de communes des Trois Rivi\u00e8res is made up of mostly low class people\"? Yes, no, or maybe? 
Maybe\n###\nRear Admiral Kevin John Scarce {'1': \", '2': \", '3': \", '4': \"} (born 4 May 1952) is a retired Royal Australian Navy officer who was the 34th Governor of South Australia, serving from August 2007 to August 2014. He was succeeded by Hieu Van Le, who had previously been his lieutenant governor. Are we justified in saying that \"Kevin served over 20 years in government positons\"? Yes, no, or maybe? Maybe\n###\n\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child. Are we justified in saying that \"The short work actually began with a plotline\"? Yes, no, or maybe? Maybe\n###\n\"I'd Be Lost\" and \"Only One\" are two songs recorded by Australian singer-songwriter Sarah Blasko for her fifth studio album \"Eternal Return\". Both songs premiered on 13 September 2015 during Richard Kingsmill's new music segment on Triple J and were released as a double A-side on 18 September 2015. Are we justified in saying that \"\"I'd Be Lost\" and \"Only One\" are actually two names for the same song\"? Yes, no, or maybe?", "doc_id": 978, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15905, 44510, 26867, 23647], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Newcomers is a 2000 American family drama film directed by James Allen Bradley and starring Christopher McCoy, Kate Bosworth, Paul Dano and Chris Evans. Christopher McCoy plays Sam Docherty, a boy who moves to Vermont with his family, hoping to make a fresh start away from the city. It was filmed in Vermont, and released by Artist View Entertainment and MTI Home Video. Are we justified in saying that \"The Newcomers was a box office success.\"? Yes, no, or maybe? Maybe\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart. Are we justified in saying that \"Early Mac collaborated with Chance the rapper\"? Yes, no, or maybe? Maybe\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups. Are we justified in saying that \"this corn disease is actually put inside of food in certain countries\"? Yes, no, or maybe? 
Yes\n###\nKinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California. Are we justified in saying that \"Santa Teresa is in California. \"? Yes, no, or maybe? Yes\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Despite being born in France, Jo\u00e3o later on moved to the US\"? Yes, no, or maybe?", "doc_id": 729, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27663, 44926, 15394, 15173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "X X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars. Are we justified in saying that \"X X X X is something you drink\"? Yes, no, or maybe? Yes\n###\nThe Office is a British mockumentary sitcom, first broadcast in the United Kingdom on BBC Two on 9 July 2001. Created, written and directed by Ricky Gervais and Stephen Merchant, the programme is about the day-to-day lives of office employees in the Slough branch of the fictitious Wernham Hogg Paper Company. Gervais also stars in the series, playing the central character, David Brent. Are we justified in saying that \"Brent is not a fictitious character.\"? Yes, no, or maybe? No\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \"Will Smith wanted his own son to be in the movie. \"? Yes, no, or maybe? Maybe\n###\nDovyalis is a genus of shrubs and small trees. Recent genetic evidence has shown the genus to belong to the family Salicaceae; formerly it was classified in the family Flacourtiaceae. The 15 species are native to Africa (Ethiopia south to South Africa) and southern Asia (India, Sri Lanka). Some are cultivated for their fruit. Are we justified in saying that \"The Dovyalis genus includes fruit-bearing plants.\"? Yes, no, or maybe? Yes\n###\nI Love Hong Kong is a 2011 Hong Kong comedy film produced and directed by Eric Tsang. Film stars Tsang, Tony Leung Ka-fai, Sandra Ng and a star-studded cast of Hong Kong stars. It was released in Chinese New Year Day. 
The sequel movies are I Love Hong Kong 2012 and I Love Hong Kong 2013. Are we justified in saying that \"Eric Tsang's I Love Hong Kong sequels were released on Chinese New Year Day.\"? Yes, no, or maybe?", "doc_id": 354, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7353, 28461, 25988, 29299], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The foreign debt of the Socialist Republic of Romania were loans made by Socialist Republic of Romania under Nicolae Ceau\u0219escu from international creditors denominated in hard currencies. These loans were used to buy technology, equipment and raw materials needed for the industrialization of the country. Are we justified in saying that \"The Socialist Republic of Romania received loans from foreign creditors while Nicolae Ceau\u0219escu was in power\"? Yes, no, or maybe? Yes\n###\nThe Merdeka Palace (Indonesian: Istana Merdeka ; also known in Indonesian as Istana Gambir and during colonial times as Paleis te Koningsplein), is one of six presidential palaces in Indonesia. It is located on the north side of the Merdeka Square in Central Jakarta, Indonesia and is used as the official residence of the President of the Republic of Indonesia. Are we justified in saying that \"The Merdeka Palace has a pink roof.\"? Yes, no, or maybe? Maybe\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008. Are we justified in saying that \"He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2007.\"? Yes, no, or maybe? No\n###\nRiver Raid is a scrolling shooter video game designed and developed by Carol Shaw, and published by Activision in 1982 for the Atari 2600 video game console. Over a million game cartridges were sold. Activision later ported the title to the Atari 5200, ColecoVision, and Intellivision game consoles, as well as to the Commodore 64, IBM PCjr, MSX, ZX Spectrum, and Atari 8-bit family home computers. Are we justified in saying that \"River Raid was one of the hardest games ever.\"? Yes, no, or maybe? Maybe\n###\n\"Vanlose Stairway\" is a song written by Northern Irish singer-songwriter Van Morrison and included on his 1982 album, \"Beautiful Vision\". It has remained a popular concert performance throughout Morrison's career and has become one of his most played songs. Are we justified in saying that \"Vanlose Stairway is a Van Morrison Song and on an abum\"? Yes, no, or maybe?", "doc_id": 368, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39216, 33267, 32963, 22724], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge. Are we justified in saying that \"Kew Bridge should be a historical landmark.\"? Yes, no, or maybe? Maybe\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games. Are we justified in saying that \"The SuperSonics are not a basketball team. \"? Yes, no, or maybe? No\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway. Are we justified in saying that \"Princess Ragnhild was born in MCMXXXI\"? Yes, no, or maybe? Maybe\n###\nThe 8th Race of Champions was a non-Championship motor race, run to Formula One rules, held on 18 March 1973 at Brands Hatch circuit in Kent, UK. The race included several entrants in Formula 5000 cars and was won by Peter Gethin in a Chevron-Chevrolet B24 '72-05'. This was the only race other than the poorly-attended 1969 Madrid Grand Prix in which a Formula 5000 car beat a Formula One car. Are we justified in saying that \"The 8th Race of Champions was one of the worst races\"? Yes, no, or maybe? Maybe\n###\nMichael Cassio, or simply Cassio, is a fictional character in William Shakespeare's \"Othello\". The source of the character is the 1565 tale \"Un Capitano Moro\" by Cinthio; Cassio is unnamed in Cinthio but referred to as \"the squadron leader\". In the play, Cassio is a young and handsome lieutenant under Othello's command who becomes one of Iago's several victims in a plot to ruin Othello. Are we justified in saying that \"Shakespeare used Cinthio's squadron leader as a model for Cassio.\"? Yes, no, or maybe?", "doc_id": 914, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12389, 33946, 41076, 3126], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sound of Waves (\u6f6e\u9a12 , Shiosai ) is a 1954 novel by the Japanese author Yukio Mishima. 
It is a coming-of-age story of the protagonist Shinji and his romance with Hatsue, the beautiful daughter of the wealthy ship owner Terukichi. For this book Mishima was awarded the Shincho Prize from Shinchosha Publishing in 1954. It has been adapted for film five times. Are we justified in saying that \"The Sound of Waves is a 1960 novel by a Japanese person\"? Yes, no, or maybe? No\n###\nMax & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes. Are we justified in saying that \"Ben McMillan did not create Max & Shred by himself. \"? Yes, no, or maybe? Yes\n###\nThe second series of the British television sketch comedy premiered on BBC Two on 21 July 2005. This series included six episodes with the concluding episode broadcast on 25 August 2005. A Christmas Special followed the second series and was screened on BBC Two on 20 December 2005. Are we justified in saying that \"The British television sketch comedy premiered on BBC 2 on 21 July 2005 with a second series.\"? Yes, no, or maybe? Yes\n###\nCurzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South . Are we justified in saying that \"The manager of Curzon Ashton Ladies Football Club is a woman.\"? Yes, no, or maybe? Maybe\n###\nThe Green Goblin's Last Stand is a 1992 fan film by Dan Poole, based on the comic book story \"The Night Gwen Stacy Died\", published by Marvel Comics in \"The Amazing Spider-Man\" #121\u2013122. Poole is the director, producer, creative editor, screenwriter, and star of the film. The film and its attendant documentary received showings and accolades at several small film festivals. Are we justified in saying that \"The Green Goblin has been entered into contests\"? Yes, no, or maybe?", "doc_id": 335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40916, 9037, 36776, 37288], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fan and Mortar Geysers are two geysers in the Upper Geyser Basin in Yellowstone National Park. For the past several decades, they have erupted in concert with one another and are generally talked about together. The records detailing these geysers' known eruptive history shows that they have been infrequent and irregular performers. Are we justified in saying that \"Sanse plays home games at multiple stadiums.\"? Yes, no, or maybe? Maybe\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. 
The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"The home depot primarily supplies construction companies\"? Yes, no, or maybe? Maybe\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title. Are we justified in saying that \"The 2010 ASB Classic was a mans tournament\"? Yes, no, or maybe? No\n###\nThe 18th Critics' Choice Awards were presented on January 10, 2013 at the Barker Hangar at the Santa Monica Airport, honoring the finest achievements of 2012 filmmaking. The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2012. Are we justified in saying that \"The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2813.\"? Yes, no, or maybe? No\n###\nThe 1998 NCAA Men's Volleyball Tournament was the 29th annual tournament to determine the national champion of NCAA men's collegiate volleyball. The single elimination tournament was played at the Stan Sheriff Center in Honolulu, Hawai\u02bbi during May 1998. With a total tournament attendance of 18,901, this remains this best attended men's volleyball championship. Are we justified in saying that \"The teams were made up of students.\"? Yes, no, or maybe?", "doc_id": 883, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25570, 2174, 26910, 27522], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ron & Carol Cope Stadium at Foster Field, is a football stadium located in Kearney, Nebraska, on the University of Nebraska\u2013Kearney campus. In 2005, the university named the stadium after Ron & Carol Cope, who were long-time supporters of the University of Nebraska System. The field is named after Charlie Foster, a former coach and athletic director at Nebraska\u2013Kearney. Are we justified in saying that \"Foster Field is in need of repair.\"? Yes, no, or maybe? Maybe\n###\nThe Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States. Are we justified in saying that \"The Forum Shops at Caesars has the largest gross income.\"? Yes, no, or maybe? Yes\n###\nBeilin District () is one of nine districts of Xi'an, the capital of Shanxi province, China. The well-known Small Wild Goose Pagoda is also located in the district. The smallest, but most densely populated, of Xi'an's county-level divisions, it borders the districts of Xincheng to the northeast, Yanta to the south, and Lianhu to the northwest. 
Are we justified in saying that \"Beilin District is popular amongst people with hair\"? Yes, no, or maybe? Maybe\n###\nDMOZ (from \"directory.mozilla.org\", an earlier domain name) was a multilingual open-content directory of World Wide Web links. The site and community who maintained it were also known as the Open Directory Project (ODP). It was owned by AOL (now a part of Verizon's Oath Inc.) but constructed and maintained by a community of volunteer editors. Are we justified in saying that \"DMOZ is no longer maintained.\"? Yes, no, or maybe? Yes\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation. Are we justified in saying that \"The 89th medium tank battalion participated in no fewer than 10 campaigns but no more then 25.\"? Yes, no, or maybe?", "doc_id": 804, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22615, 39956, 6823, 28758], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Svensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord. Are we justified in saying that \"Linsborg, Kansas was ties with many of the festival familes.\"? Yes, no, or maybe? Maybe\n###\nDhanish Karthik (born 24 July 1989) is an Indian actor. He made his debut as Sanjeev Menon in the Malayalam film \"Ivide\" (2015) directed by Shyamaprasad. He recently finished filming for the Bollywood film Chef (2017 film) with Saif Ali Khan. The film, directed by Raja Krishna Menon, is slated to release in July 2017. This will be Karthik's debut in Bollywood. Are we justified in saying that \"Shyamaprasad directed Ivide.\"? Yes, no, or maybe? Yes\n###\n\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child. Are we justified in saying that \"Ursula intended the work to be a major hit\"? Yes, no, or maybe? Maybe\n###\nSonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m). 
Are we justified in saying that \"sonette is a community is east central powder river county\"? Yes, no, or maybe? No\n###\nJay Kahn is a Democratic member of the New Hampshire Senate representing the 10th district. The 10 district is located in the southwestern corner of the state and includes Alstead, Chesterfield, Gilsum, Harrisville, Hinsdale, Keene, Marlborough, Roxbury, Sullivan, Surry, Swanzey, Walpole, Westmoreland and Winchester, New Hampshire. Are we justified in saying that \"The 10th district includes 14 towns.\"? Yes, no, or maybe?", "doc_id": 888, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38969, 42876, 4547, 5606], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Colin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter. Are we justified in saying that \"Colin Francis Weeber Isaacs helps with the Gallon Newsletter.\"? Yes, no, or maybe? Yes\n###\nBremen ( ) is a small town in Lincoln County, Maine, United States. The population was 806 at the 2010 census. Located on Muscongus Bay and the Gulf of Maine, it includes the villages of Broad Cove, Turners Corner, Bremen, Medomak and Muscongus. Hog Island is a center and camp for the Maine chapter of the National Audubon Society. Are we justified in saying that \"Bremen ( ) is a small town in Lincoln County, Maine, United States. It has many villages in it.\"? Yes, no, or maybe? Yes\n###\nRastafari, sometimes termed Rastafarianism, is an Abrahamic religion. Classified as a new religious movement, it developed in Jamaica during the 1930s. It lacks any centralised authority and there is much heterogeneity among practitioners, who are known as Rastafari, Rastafarians, or Rastas. Are we justified in saying that \"Rastafari is the newest Abrahamic religion\"? Yes, no, or maybe? Maybe\n###\nIn the mathematical field of topology, the Alexandroff extension is a way to extend a noncompact topological space by adjoining a single point in such a way that the resulting space is compact. It is named for the Russian mathematician Pavel Alexandrov. Are we justified in saying that \"Alexandroff extensions make compact space\"? Yes, no, or maybe? Yes\n###\nWest Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records. Are we justified in saying that \"It started to dominate less than 100 years ago\"? 
Yes, no, or maybe?", "doc_id": 795, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40789, 37317, 44760, 45119], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The PLDT Home TVolution Power Attackers (women's) and the PLDT Home Telpad-Air Force Turbo Boosters (men's) were professional volleyball teams owned by PLDT that played in the Philippine Super Liga (PSL) from 2013 to 2014. The club was first known as PLDT myDSL Speed Boosters. Are we justified in saying that \"the club known as pldt speed boosters played in philipine league \"? Yes, no, or maybe? Yes\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro. Are we justified in saying that \"Nicola or Niccolo Massaro died in 1804\"? Yes, no, or maybe? No\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Krishan Kant voted for Sushil Kumar Shinde\"? Yes, no, or maybe? No\n###\nSophie Tucker (January 13, 1887 \u2013 February 9, 1966) was a Ukrainian-born American singer, comedian, actress, and radio personality. Known for her stentorian delivery of comical and risqu\u00e9 songs, she was one of the most popular entertainers in America during the first half of the 20th century. She was widely known by the nickname \"The Last of the Red Hot Mamas\". Are we justified in saying that \"The Last of the Red Hot Mamas was a nickname given to the american-born singer sophie tucker\"? Yes, no, or maybe? No\n###\nDiscover Financial Services, Inc. is an American financial services company, which issues the Discover Card and operates the Discover and Pulse networks, and owns Diners Club International. Discover Card is the third largest credit card brand in the United States, when measured by cards in force, with nearly 50 million cardholders. Are we justified in saying that \"More than 50 million people have Discover Card.\"? Yes, no, or maybe?", "doc_id": 973, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43489, 32139, 15780, 14722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2012 SEC Women\u2019s Basketball Tournament took place at the Bridgestone Arena in Nashville, Tennessee from March 1-4, 2012. The Tennessee Lady Volunteers won the tournament and received the SEC\u2019s automatic bid to the 2012 NCAA Women\u2019s Basketball Tournament by defeating the LSU Lady Tigers 70-58 in the championship game. Are we justified in saying that \"The 2012 SEC Women's Basketball tournament was won by 12 points\"? Yes, no, or maybe? Yes\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"Johan Martin Schr\u00f6der considers himself an entrepreneur.\"? Yes, no, or maybe? Maybe\n###\n\"Beez in the Trap\" is a song by rapper Nicki Minaj for her second studio album, \"\" (2012). It was written by Minaj, Maurice Jordan, and 2 Chainz, who contributed a guest verse to the song, while production was handled by Kenoe. The track was released as the album's third single on May 29, 2012 following \"Starships\" and \"Right by My Side\". Are we justified in saying that \"Nicki Minaj has released four albums since Beez in the Trap.\"? Yes, no, or maybe? Maybe\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards. Are we justified in saying that \"Deckard has won best actor in years other than 2008.\"? Yes, no, or maybe? Maybe\n###\nThe Raid on Le Havre was a two-day naval bombardment of the French port of Le Havre early in July 1759 by Royal Navy forces under Rear-Admiral George Rodney during the Seven Years' War, which succeeded in its aim of destroying many of the invasion barges being gathered there for the planned French invasion of Great Britain. Are we justified in saying that \"The Raid on Le Havre was known for the diseases which ravaged camps.\"? Yes, no, or maybe?", "doc_id": 661, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22800, 29736, 7800, 21163], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ann Rae Rule (n\u00e9e Stackhouse; October 22, 1931 \u2013 July 26, 2015) was an American true crime author of \"The Stranger Beside Me\", about serial killer, and Rule's co-worker, Ted Bundy. Rule was also known for her book \"Small Sacrifices\", about Oregon child murderer Diane Downs. Many of Rule's books center on murder cases that occurred in the Pacific Northwest and her adopted home state of Washington. 
Are we justified in saying that \"\"The Stranger Beside Me\" is a waste of time.\"? Yes, no, or maybe? Maybe\n###\nBela George Lugosi (born January 5, 1938 in Los Angeles, California), also known as Bela Lugosi Jr., is an American attorney and the son of actor B\u00e9la Lugosi. His legal actions in \"Lugosi v. Universal Pictures\" led to the creation of the California Celebrities Rights Act. Are we justified in saying that \"Bela Lugosi Jr was also a divorce lawyer.\"? Yes, no, or maybe? Maybe\n###\nSing A to Z is the tenth album by popular children's entertainers Sharon, Lois & Bram, originally released in 1990. This album, like many other Sharon, Lois & Bram albums has been re-released many times. It is rumored that the idea for this album came from Lois when she and Sharon were window shopping and came across an alphabet quilt on display. Are we justified in saying that \"Sharon, Lois & Bram have released a new album.\"? Yes, no, or maybe? Maybe\n###\nIsmail Merchant (25 December 1936\u00a0\u2013 25 May 2005) was an Indian-born film producer and director. He worked for many years in collaboration with Merchant Ivory Productions which included director (and Merchant's longtime professional and domestic partner) James Ivory as well as screenwriter Ruth Prawer Jhabvala. Their films won six Academy Awards. Are we justified in saying that \"Merchant is a homosexual.\"? Yes, no, or maybe? Maybe\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX. Are we justified in saying that \"It was high noon when the cartoon characters went for an adult swim.\"? Yes, no, or maybe?", "doc_id": 768, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36860, 33113, 18459, 32493], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staunton Mall has a lot of stores that sell jeans\"? Yes, no, or maybe? Maybe\n###\nMiss Europe 2002, the 57th Miss Europe pageant, was held at the Beirut International Exhibition & Leisure Center in Beirut, Lebanon on December 28, 2002. Svetlana Koroleva, Miss Russia, was crowned Miss Europe 2002 by outgoing titleholder Elodie Gossuin of France. Are we justified in saying that \"Svetlana Koroleva was born on December 29, 1981.\"? Yes, no, or maybe? Maybe\n###\nThe 2015 Auburn Tigers softball team is an American softball team, representing the Auburn University for the 2015 NCAA softball season. 
In 2014, the Auburn Tigers softball team went 42-19-1 during Clint Myers first season. The Auburn Tigers play their home games at Jane B. Moore Field. Are we justified in saying that \"The Tigers play their home games at Jane B. Moore Field.\n\"? Yes, no, or maybe? Yes\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city. Are we justified in saying that \"The population was the sum 800 + 53 in 2010\"? Yes, no, or maybe? Yes\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\". Are we justified in saying that \"kidsty pike fell and Haweswater Reservoir are in the same district in England\"? Yes, no, or maybe?", "doc_id": 878, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36872, 22127, 41005, 39566], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke. Are we justified in saying that \"\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost in Space\" and 117th episode overall.\"? Yes, no, or maybe? No\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th. Are we justified in saying that \"Italy won 3rd against Vasili Vyacheslavovich Blagov and Irina Cherniaeva in the 1972 Winter Olympics.\"? Yes, no, or maybe? Maybe\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Chris McKendry is not her original name\"? Yes, no, or maybe? 
Yes\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The gator bowl was played at the Gators home field.\"? Yes, no, or maybe? Maybe\n###\nThe City of Canada Bay is a local government area in the Inner West of Sydney, New South Wales, Australia. The city was formed on 1 December 2000, following the merger of Concord and Drummoyne councils. The city covers an area of 19.82 km2 and as at the 2016 census had a resident population of . The city is ultimately named after Canada Bay, a bay on the Parramatta River. Are we justified in saying that \"The City of Canada Bay covers more than 11 miles.\"? Yes, no, or maybe?", "doc_id": 910, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12217, 9286, 25834, 34483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hedera helix (common ivy, English ivy, European ivy, or just ivy) is a species of flowering plant in the family Araliaceae, native to most of Europe and western Asia. A rampant, clinging evergreen vine, it is a familiar sight in gardens, waste spaces, on house walls, tree trunks and in wild areas across its native habitat. Are we justified in saying that \"Hedera helix can found in the EU\"? Yes, no, or maybe? Yes\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. Are we justified in saying that \"Keystone has special status with the government.\"? Yes, no, or maybe? Maybe\n###\nThe following is a list of female cabinet ministers of Thailand. Thailand is a country located at the centre of the Indochina peninsula in Southeast Asia. It is bordered to the north by Burma and Laos, to the east by Laos and Cambodia, to the south by the Gulf of Thailand and Malaysia, and to the west by the Andaman Sea and the southern extremity of Burma. Are we justified in saying that \"Thailand has female cabinet members\"? Yes, no, or maybe? Yes\n###\nThe Exterminating Angel (Spanish: El \u00e1ngel exterminador ), is a 1962 surrealist film, written and directed by Luis Bu\u00f1uel, starring Silvia Pinal, and produced by her then-husband Gustavo Alatriste. Sharply satirical and allegorical, the film contains a view of human nature suggesting \"mankind harbors savage instincts and unspeakable secrets\". Are we justified in saying that \"The film contains an interesting view of human nature. \"? Yes, no, or maybe? 
Maybe\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games. Are we justified in saying that \"The SuperSonics lost to the Basketball team from Washington in the playoffs. \"? Yes, no, or maybe?", "doc_id": 226, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31426, 3674, 33223, 8742], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"The mall has stores in it\"? Yes, no, or maybe? Yes\n###\nGirls on Top is a British ITV sitcom, broadcast in 1985 and 1986, and made by Witzend for the ITV contractor Central Independent Television. It stars Dawn French, Jennifer Saunders, Ruby Wax and Tracey Ullman, and was written by French, Saunders, and Wax with additional material from Ullman. Despite a poor critical reception, the series was a ratings success. Are we justified in saying that \"Girls on Top received good ratings. \"? Yes, no, or maybe? Yes\n###\nGwendoline See-Hian Yeo (; born July 10, 1977) is a Singaporean-born American actress, voice actress and musician, best known for her recurring guest-star role as Xiao-Mei in the hit television series \"Desperate Housewives\", and as Dr. Kelly Lee in \"General Hospital\". Are we justified in saying that \"Gwendoline See-Hian Yea is a decent musician.\"? Yes, no, or maybe? Maybe\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"Coates became prominently known after the film's release.\"? Yes, no, or maybe? Maybe\n###\nHenry II (18 April 1503 \u2013 25 May 1555), nicknamed \"Sang\u00fcesino\" because he was born at Sang\u00fcesa, was the King of Navarre from 1517, although his kingdom had been reduced to a small territory north of the Pyrenees by the Spanish conquest of 1512. Henry succeeded his mother, Queen Catherine, upon her death. His father was her husband and co-ruler, King John III, who died in 1516. Are we justified in saying that \"Queen Catherine was King Of Navarre \"? 
Yes, no, or maybe?", "doc_id": 146, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42508, 20268, 31487, 35010], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Max & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes. Are we justified in saying that \"The series premiered on Nickelodeon in the UK on October 6, 2014. \"? Yes, no, or maybe? No\n###\nChristmas Eve is the day before Christmas Day, the festival commemorating the birth of Jesus of Nazareth. Christmas Day is observed around the world, and Christmas Eve is widely observed as a full or partial holiday in anticipation of Christmas Day. Together, both days are considered one of the most culturally significant celebrations in Christendom and Western society. Are we justified in saying that \"Christmas Eve and day are the most important holidays in Western Society.\"? Yes, no, or maybe? Maybe\n###\nBest of 4Minute is the first Japanese compilation album by the South Korean girl group 4Minute. It is composed of all the Japanese tracks released by the group since their debut in Japan. It was released on September 26, 2012 in three different editions: 2 limited CD+DVD (Type A with a live event and Type B with all Japanese music videos) and a Regular edition. Are we justified in saying that \"Best of 4Minute was released in 20th century.\"? Yes, no, or maybe? No\n###\nMoody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group. Are we justified in saying that \"James Moody only does Jazz music instrumental recordings for IPO Recordings.\"? Yes, no, or maybe? Maybe\n###\nThe Doberman Gang is a 1972 film about a talented animal trainer who uses a pack of Dobermans to commit a bank robbery. The six dogs were all named after famous bank robbers. Their names were Dillinger (John Dillinger), Bonnie (Bonnie Parker), Clyde (Clyde Barrow), Pretty Boy Floyd, Baby Face Nelson, and Ma Barker. Are we justified in saying that \"In the Doberman Gang movie, one of the dogs was named Clyde Parker.\"? Yes, no, or maybe?", "doc_id": 57, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22821, 38128, 42079, 38123], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Live at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. It has only been made available at live Van Morrison concerts and at the Van Morrison Official website. Are we justified in saying that \"It was very popular\"? Yes, no, or maybe? Maybe\n###\nElizabeth Berridge (born May 2, 1962) is an American film and theatre actress. She is known for playing Constanze Mozart in the Academy Award-winning 1984 film \"Amadeus\", for the role of Officer Eve Eggers on \"The John Larroquette Show\" (1993-1996), and for her performances in the theater. Are we justified in saying that \"Berridge won awards for her theater performances.\"? Yes, no, or maybe? Maybe\n###\nPrincess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer. Are we justified in saying that \"The princess was related to the pretender of the throne.\"? Yes, no, or maybe? Yes\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"Lloyd Cole was in the band called the Commodores.\"? Yes, no, or maybe? No\n###\nPeter Murray Kapetan was an American Broadway actor, singer and dancer notable for playing numerous roles during a thirty-year career. He was notable for performing in the musical \"The Wedding Singer\" as a Ronald Reagan impersonator. He appeared in \"Titanic\", \"Sunset Boulevard\", \"Joseph and the Amazing Technicolor Dreamcoat\", and \"Got Tu Go Disco\". Are we justified in saying that \"Peter Murray Kapetan had a long career, not quite a 40 year career, since he retired ten years before he would be in the show business for the full 40 years\"? Yes, no, or maybe?", "doc_id": 815, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23902, 21721, 9033, 22081], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. 
The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state. Are we justified in saying that \"Sidalcea oregana is found in Texas.\"? Yes, no, or maybe? No\n###\nMisty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue. Are we justified in saying that \"Misty Knight was read by George.\"? Yes, no, or maybe? Maybe\n###\n1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire. Are we justified in saying that \"The Australian version of the show is the second most popular version of the game show worldwide. \"? Yes, no, or maybe? Maybe\n###\nPenthouse is a 1933 American Pre-Code crime film starring Warner Baxter as a lawyer and Myrna Loy, as a call girl who helps him with a murder case. It was directed by W. S. Van Dyke and written by Frances Goodrich and Albert Hackett, based on a novel by Arthur Somers Roche. The film was later remade as the more sanitized \"Society Lawyer\" (1939), without the risqu\u00e9 pre-Code dialogue. Are we justified in saying that \"Penthouse is difficult to watch.\"? Yes, no, or maybe? Maybe\n###\nFranklin Martin Loew, DVM, PhD, (1939 in Syracuse, NY \u2013 2003 in Boston, MA) was president of Becker College, dean of the College of Veterinary Medicine at Cornell University and dean of Tufts University School of Veterinary Medicine (now Tufts Cummings School of Veterinary Medicine). Are we justified in saying that \"Franklin Martin Loew was born more than 1000 days ago.\"? Yes, no, or maybe?", "doc_id": 705, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39480, 31948, 13651, 33172], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis. Are we justified in saying that \"Jose is from Columbus, OH, USA.\"? Yes, no, or maybe? Maybe\n###\nOrphan X is a 2016 thriller novel written by Gregg Hurwitz. It is the first in a five-book series of the same name from publisher Minotaur Books with the film rights belonging to Warner Bros. Bradley Cooper is likely to produce and possibly star the movie. Are we justified in saying that \"Bradley Cooper had intentions to direct the Orphan X thriller movie, but has since chosen to be a voice actor for it.\"? Yes, no, or maybe? 
No\n###\nKlagenfurt am W\u00f6rthersee (] ; Slovene: \"Celovec ob Vrbskem jezeru\" , Italian: \"Clanforte\" , Friulian: \"Clanfurt\" ) is the capital of the federal state of Carinthia in Austria. With a population of 99,100, it is the sixth-largest city in the country. The city is the bishop's seat of the Roman Catholic Diocese of Gurk-Klagenfurt and home to the Alpen-Adria-Universit\u00e4t Klagenfurt. Are we justified in saying that \"Klagenfurt am W\u00f6rthersee has 99,100 tourists per year. \"? Yes, no, or maybe? Maybe\n###\nHaliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015. Are we justified in saying that \"Dantoro was not the first Emir\"? Yes, no, or maybe? Yes\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\". Are we justified in saying that \"The Book of Job: A Journey into Parenthood was written by husband-and-wife writing duo Greg Fried and Lisa Lazarus from South Africa. They also co wrote \"Paradise\" and \"When in Broad Daylight I Open my Eyes\"\"? Yes, no, or maybe?", "doc_id": 553, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28750, 21392, 5091, 45236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Doctor Neo Periwinkle Cortex (often referred to as Doctor Cortex, Neo Cortex, or simply Cortex) is a fictional character and the main antagonist of the \"Crash Bandicoot\" series. His name is a play on the term neocortex, an area of the brain. He has been the archenemy of Crash ever since his first appearance, in the game \"Crash Bandicoot\". Are we justified in saying that \"Doctor Neo Periwinkle Cortex will get his own video game on PC in 2020\"? Yes, no, or maybe? Maybe\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017. Are we justified in saying that \"Justin Tinucci was born 25 days before Christmas\"? Yes, no, or maybe? Maybe\n###\nAlrifai is a Lebanese multinational nut retailing company headquartered in Beirut, Lebanon, and a wholly owned subsidiary of Alrifai International Holding Ltd. It is the largest nut retailing chain in the Middle East and the company with the biggest market share in Lebanon. Are we justified in saying that \"The company is a publicly traded company\"? Yes, no, or maybe? 
Maybe\n###\nPrince Louis Ferdinand Oskar Christian of Prussia (German: \"Louis Ferdinand Oskar Christian Prinz von Preu\u00dfen\"; 25 August 1944 \u2013 11 July 1977), also called Louis Ferdinand II or Louis Ferdinand Jr., nicknamed \"Lulu\", was a member of the House of Hohenzollern and the fifth of seven children of Prince Louis Ferdinand of Prussia and his wife, Grand Duchess Kira of Russia. Are we justified in saying that \"Louis Ferdinand II had seven siblings. \"? Yes, no, or maybe? No\n###\nThe Tampa Bay Buccaneers season was the franchise's 39th season in the National Football League. It was also the first season under head coach Lovie Smith, replacing Greg Schiano, who was fired at the end of the 2013 season. It was also the first season under general manager Jason Licht, following the departure of Mark Dominik, after a disappointing 2013 season. Are we justified in saying that \"The Tampa Bay Buccaneers season has less than 30 seasons in the National Football League\"? Yes, no, or maybe?", "doc_id": 766, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11156, 35306, 32881, 25417], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "William Lang Denholm \"Bill\" McCue OBE (1934\u20131999) was a Scottish singer known for his performances in opera, musical theatre and traditional Scottish folk music. In 1982 he was awarded an OBE for his contribution to Scottish music. In 1999 he died aged 65. Are we justified in saying that \"William Lang Denholm \"Bill\" McCue died in Scotland.\"? Yes, no, or maybe? Maybe\n###\nBarbro Martinsson (born 16 August 1935) is a former Swedish cross country skier who competed during the 1960s. Born in Valbo, she won two silver medals in the 3 x 5 km at the 1964 Winter Olympics and the 1968 Winter Olympics. Martinsson finished 4th in the 1968 Winter Olympics in both 5 km and 10 km. Are we justified in saying that \"She is now an American citizen.\"? Yes, no, or maybe? Maybe\n###\nRAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family. Are we justified in saying that \"The station was renamed in 1928\"? Yes, no, or maybe? Yes\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden. Are we justified in saying that \"The Kyrkog\u00e5rden Runestones are a French\"? Yes, no, or maybe? No\n###\nMark Donovan (born 12 October 1968) is a Welsh character actor best known for his roles in productions such as \"Shaun of the Dead\", \"Black Books\", \"In Bruges\", and \"Murder Investigation Team\". 
He also played a brief scene of Hamlet in an episode of the David Renwick comedy-drama, \"Love Soup\". His stage roles include Gozark in \"Singin' in the Rain\" and Inspector Clay in \"Plan 9 from Outer Space\". Are we justified in saying that \"Donovan was born 20 years before he played Hamlet on stage.\"? Yes, no, or maybe?", "doc_id": 397, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19267, 40448, 7200, 30094], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Man in a Hurry (French: \"L'Homme press\u00e9\" , Italian: \"L'ultimo giorno d'amore\" , released in UK as The Hurried Man) is a 1977 French-Italian drama film directed by \u00c9douard Molinaro and starring Alain Delon and Mireille Darc. It is based on the novel \"The Man in a Hurry\" by Paul Morand. It recorded admissions of 730,581 in France. Are we justified in saying that \" Man in a Hurry is a French-Italian drama film directed by \u00c9douard Molinaro was released in the nineteen seventies. \"? Yes, no, or maybe? Yes\n###\nGeorge Joseph Maloof Jr. (born September 2, 1964) is an American entrepreneur and businessman. He is the former owner of the Sacramento Kings, the former owner of the now defunct Sacramento Monarchs, and minority owner of the Palms Casino Resort in Las Vegas with his brothers Gavin Maloof, Joe Maloof, Phil Maloof and sister Adrienne Maloof. He is part of the Maloof Family. Are we justified in saying that \"George Joseph Maloof Jr. was the head coach of the Sacramento Monarchs.\"? Yes, no, or maybe? No\n###\nBeyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John LaZar, Michael Blodgett and David Gurian. The film was directed by Russ Meyer and co-written by Meyer and Roger Ebert. Are we justified in saying that \"1970 is known for being the year that comes after 1969.\"? Yes, no, or maybe? No\n###\nSanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\"). Are we justified in saying that \"Sanation was created after the interwar period\"? Yes, no, or maybe? No\n###\nLucas Franchoys the Younger or Lucas Franchoys II (alternative spellings of name: Lucas Franchois, Lucas Fran\u00e7ois, Louis Franchoys) (28 June 1616 in Mechelen \u2013 3 April 1681 in Mechelen) was a Flemish Baroque painter from Mechelen, who painted numerous altarpieces and portraits in a style reminiscent of Anthony van Dyck. Are we justified in saying that \"Francois lived over 60 years.\"? 
Yes, no, or maybe?", "doc_id": 653, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20710, 22883, 23247, 30241], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph was born in 1821 and died in 1891\"? Yes, no, or maybe? Yes\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015. Are we justified in saying that \"The 1985 Nebraska Cornhuskers is a basketball team.\"? Yes, no, or maybe? No\n###\nThe Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat. Are we justified in saying that \"The Mast\u00edn Espa\u00f1ol is not a tiny dog.\"? Yes, no, or maybe? Yes\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"The San Nicolao Tunnel took 5 years to build\"? Yes, no, or maybe? Maybe\n###\nChandana Banerjee(born 1953) is an Indian actress, model and beauty queen. She was the winner of first edition of Femina Teen Princess. she represented India at International Teen Princess 1967 held in Chicago, Illinois on 1967 May 26 and was crowned 1st Runner Up there. After that she became a model in India. Prior to winning the pageant she was starred in Indian film \"Teen Kanya\". Are we justified in saying that \"Benerjee was 12 when she was at International Teen Princess.\"? Yes, no, or maybe?", "doc_id": 464, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [45418, 5560, 7370, 3198], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bertrand Piccard (born 1 March 1958) is a Swiss psychiatrist and balloonist. Along with Brian Jones, he was the first to complete a non-stop balloon flight around the globe, in a balloon named Breitling Orbiter 3. He was the initiator, chairman, and co-pilot, with Andr\u00e9 Borschberg, of Solar Impulse, the first successful round-the-world solar powered flight. Are we justified in saying that \"Bertrand Piccard was born more than 1959 years ago.\"? Yes, no, or maybe? No\n###\nGrowing Up is the first Korean-language studio album by South Korean singer-songwriter and actress IU. It was released on April 23, 2009, as a follow-up to her 2008 debut mini-album \"Lost and Found\". Two of the album's 16 tracks, \"Boo\" and \"You Know (\uc788\uc796\uc544) (Rock Ver.)\", were released as singles. Are we justified in saying that \"South Korean singer-songwriter and actress IU writes her own songs.\n\"? Yes, no, or maybe? Maybe\n###\nB&Q plc is a British multinational DIY and home improvement retailing company, headquartered in Eastleigh, England, United Kingdom and is a wholly owned subsidiary of Kingfisher plc. Founded by Richard Block and David Quayle in 1969 originally as Block & Quayle, the retail chain offers over 40,000 products across 300 stores and online. Are we justified in saying that \"B&Q plc is founded by Richard Block and Donald Trump\"? Yes, no, or maybe? No\n###\nIdris Sultan (born January 1993) is a Tanzanian Actor and comedian, actor and radio host who won the Big Brother Africa-Hotshots in 2014. He hosts the biggest comedy news show called SIO HABARI, he also hosts a radio show called MWB(Mji wa burudani) on ChoiceFm Tanzania. Are we justified in saying that \"Idris Sultan is an avid Big Brother fan.\"? Yes, no, or maybe? Maybe\n###\nJohn Wellborn Root (January 10, 1850 \u2013 January 15, 1891) was an American architect who was based in Chicago with Daniel Burnham. He was one of the founders of the Chicago School style. Two of his buildings have been designated a National Historic Landmark; others have been designated Chicago landmarks and listed on the National Register of Historic Places. In 1958, he received the AIA Gold Medal. Are we justified in saying that \"John Wellborn Root won a medal in 1957.\"? Yes, no, or maybe?", "doc_id": 778, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41741, 5960, 27544, 25422], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Barry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. 
The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions. Are we justified in saying that \"Barry and Stuart are both BAFFTA nominated magicians and comedians.\"? Yes, no, or maybe? Yes\n###\nColville Lake is the 20th largest lake in Canada's Northwest Territories. The lake is located 100\u00a0km (62\u00a0mi) northwest of Great Bear Lake in the Sahtu Region. The lake has a perimeter of 121\u00a0km (75\u00a0mi) and a net area of 416\u00a0km\u00b2 (161 sq mi) and a total area of 439\u00a0km\u00b2 (169 sq mi). Are we justified in saying that \"Colville Lake is a lake.\"? Yes, no, or maybe? Yes\n###\nThe 2017 Macanese general election took place on 17 September 2017 according to the provisions of the Basic Law of Macau. Out of a total of 33 seats, 14 were directly elected by universal suffrage under the highest averages method, while 12 were voted on from the Functional constituency, and 7 from nomination by the Chief Executive. Are we justified in saying that \"The 2017 Macanese general election was viewed as a success\"? Yes, no, or maybe? Maybe\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden. Are we justified in saying that \"The Kyrkog\u00e5rden Runestones are a Canadian monument.\"? Yes, no, or maybe? No\n###\nKnowledgeWare was a software company headquartered in Atlanta, Georgia co-founded by James Martin and run by Fran Tarkenton. It produced a Computer Aided Software Engineering (CASE) tool called IEW (Information Engineering Workbench). KnowledgeWare was sold to Sterling Software in 1994, which was in its turn acquired by Computer Associates. Are we justified in saying that \"KnowledgeWare is in the northern hemisphere\"? Yes, no, or maybe?", "doc_id": 817, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25747, 43601, 18794, 6924], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "NBA 2K9 is a basketball simulation video game developed by Visual Concepts and published by 2K Sports. It is the tenth installment in the \"NBA 2K\" franchise and the successor to \"NBA 2K8\". It was released in 2008 for PlayStation 2, PlayStation 3, Xbox 360, and Microsoft Windows. Kevin Garnett is the cover athlete of the game. \"NBA 2K9\" is the predecessor to \"NBA 2K10\" in the \"NBA 2K\" series. Are we justified in saying that \"You were able to play NBA 2K9 on pc\"? Yes, no, or maybe? Yes\n###\nMore of Tom Lehrer was the second studio album recorded by musical satirist Tom Lehrer. The LP contains the same songs (in the same sequence) as the live album \"An Evening Wasted with Tom Lehrer\", which was recorded and released earlier in the same year. 
The album was recorded and mixed in a single three-hour session at the RCA Studios in New York on July 8, 1959. Are we justified in saying that \"\"More of Tom Lehrer\" is the studio version of the live album, \"An evening wasted with Tom Lehrer\".\"? Yes, no, or maybe? Yes\n###\nThe Mercedes-Benz W221 is a chassis code of S-Class, the successor of the Mercedes-Benz S-Class (W220) and the predecessor of the Mercedes-Benz S-Class (W222). The S-Class are the flagship vehicles of Mercedes-Benz and each generation typically introduces a range of technical innovations and developments that over time will find their way into smaller cars. Are we justified in saying that \"The Mercedes-Benz is a very good car\"? Yes, no, or maybe? Maybe\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label. Are we justified in saying that \"Lance King was in many bands\"? Yes, no, or maybe? Yes\n###\nDaphniphyllum is the sole genus in the flowering plant family Daphniphyllaceae and was described as a genus in 1826. The genus includes evergreen shrubs and trees mainly native to east and southeast Asia, but also found in the Indian Subcontinent and New Guinea. Are we justified in saying that \"the plant is only is asia\"? Yes, no, or maybe?", "doc_id": 283, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29862, 32201, 26705, 28523], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "West Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records. Are we justified in saying that \"West Coast hip hop has been played by iron maiden\"? Yes, no, or maybe? Maybe\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"There are over 25 different mint species to yet be given a name and identified.\"? Yes, no, or maybe? Maybe\n###\nVarun Sharma is an Indian actor who made his debut in Farhan Akhtar's 2013 film production \"Fukrey\", which was a surprise hit in Bollywood. Since his appearance in \"Fukrey\", he has appeared in other comedy films, such as \"Kis Kisko Pyaar Karoon\" and \"Dilwale\" etc Are we justified in saying that \"Varun Sharma is from the subcontinent\"? Yes, no, or maybe? Yes\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. 
It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title. Are we justified in saying that \"A woman won the single title.\"? Yes, no, or maybe? Yes\n###\n\"I Never Picked Cotton\" is a song made famous by country music singer Roy Clark. Written by Bobby George and Charles Williams, the song was released in 1970 as the title track to the album released that same year. The song peaked at No. 5 on the \"Billboard magazine\" Hot Country Singles chart that summer. Are we justified in saying that \"Clark, George, and Williams are all responsible for the song having become such a success\"? Yes, no, or maybe?", "doc_id": 359, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44097, 33120, 36230, 30244], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dead to Rights II is a third-person action video game, developed by Widescreen Games, published by Namco, and released in 2005. Serving as a prequel to \"Dead to Rights\", it begins with the story of Jack Slate and Shadow before the events of the original game. A PSP prequel, \"\", released on June 28, 2005. Are we justified in saying that \"Dead to Rights II was released in the summer.\"? Yes, no, or maybe? Maybe\n###\nPillars of Eternity: The White March is a two-part expansion pack for the 2015 role-playing video game \"Pillars of Eternity\", developed by Obsidian Entertainment and published by Paradox Interactive. The first part was released on August 25, 2015, while the second was released on February 16, 2016. Are we justified in saying that \"There was less than a year between the Pillars of Eternity releases.\"? Yes, no, or maybe? Yes\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games. Are we justified in saying that \"In their 9th season, the Seattle SuperSonics did not finish in first place in the Western Conference.\"? Yes, no, or maybe? Yes\n###\nThe Castaways Hotel and Casino, formerly the Showboat Hotel and Casino was a hotel and casino located at the north end of the Boulder Strip in Las Vegas, Nevada. The hotel consisted of a 19 story tower containing 445 rooms, a casino and an adjacent RV park. The Castaways hotel was demolished on January 11, 2006 to make way for an unknown project. Are we justified in saying that \"The Castaways Hotel and Casino has been visited by Bush.\"? Yes, no, or maybe? Maybe\n###\nVan Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. 
Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor. Are we justified in saying that \"Van Cleef & Arpels was favoured by royalty\"? Yes, no, or maybe?", "doc_id": 940, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8351, 25093, 6314, 15646], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets. Are we justified in saying that \"Rodrequis La'Vant Stephens used to play baseball in high school.\"? Yes, no, or maybe? Maybe\n###\nNuestra Belleza Nuevo Le\u00f3n 2007, was held at Las Lomas Eventos in Monterrey, Nuevo Le\u00f3n on July 25, 2007. At the conclusion of the final night of competition, Anagabriela Espinoza of San Pedro Garza Garc\u00eda was crowned the winner. Espinoza was crowned by outgoing Nuestra Belleza Nuevo Le\u00f3n titleholder, Mariana Lombard. Eight contestants competed for the state title. Are we justified in saying that \"Nuestra Belleza Nuevo Le\u00f3n 2007 had 10 contestants\"? Yes, no, or maybe? No\n###\nAmandil is a fictional character from J.R.R. Tolkien's Middle-earth legendarium. Amandil was a Lord of And\u00fani\u00eb, succeeding his father N\u00famendil upon his death. Amandil is most noted for being the father of Elendil, founder of the N\u00famen\u00f3rean Realms in Exile. Are we justified in saying that \"Amandil is the father of Numendil.\"? Yes, no, or maybe? No\n###\nJesco White, also known as the \"Dancing Outlaw\" (born July 30, 1956) is an American folk dancer and entertainer. He is best known as the subject of three American documentary films that detail his desire to follow in his famous father's footsteps while dealing with depression, drug addiction, alcoholism, and the poverty that permeates much of rural Appalachia. Are we justified in saying that \"White's work is influenced by cowboy films.\"? Yes, no, or maybe? Maybe\n###\nJoseph Eppele (born August 12, 1987) is a professional Canadian football offensive lineman for the Ottawa Redblacks of the Canadian Football League. He was drafted second overall by the Toronto Argonauts in the 2010 CFL Draft, being the first offensive lineman taken while being ranked fifth overall by the CFL's Amateur Scouting Bureau. He played college football for the Washington State Cougars. Are we justified in saying that \"Joseph Eppele was born on a sunny day on august 12, 2010\"? Yes, no, or maybe?", "doc_id": 381, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4123, 39966, 11626, 19548], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Real Howard Spitz is a 1998 family comedy film directed by Vadim Jean, produced by Paul Brooks and written by Jurgen Wolff. Starring Kelsey Grammer, Amanda Donohoe and Genevieve Tessier, it is a Canadian and U.K co-production. A failed detective writer, Howard Spitz has hit rock bottom until an 8-year-old girl helps him write children's books. Are we justified in saying that \"Vadim Jean began directing films in 1998.\"? Yes, no, or maybe? Maybe\n###\n\"Uh Huh\" is the first single by R&B group B2K, from their self-titled debut album. The song was released in July 2001 and it peaked at number 37 on the \"Billboard\" Hot 100 and number 20 on the Hot R&B/Hip-Hop Songs. It also peaked at number 35 in the UK on its first entry and reached a new peak at number 31 on a re-release. Are we justified in saying that \"Released in 2001 the song Uh Huh was the only single to date released by B2K. \"? Yes, no, or maybe? Maybe\n###\nIslamic rule govenrned the southern part of the Iberian peninsula for seven hundred years. In medieval history, \"al-Andalus\" (Arabic: \u0627\u0644\u0623\u0646\u062f\u0644\u0633\u200e \u200e ) was the name given to the parts of the Iberian Peninsula and Septimania governed by Arab and North African Muslims (given the generic name of Moors), at various times in the period between 711 and 1492. Are we justified in saying that \"Islamic laws governed the southern portion of the Iberian peninsula for six hundred years.\"? Yes, no, or maybe? No\n###\nJustin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL). Are we justified in saying that \"Justin Smith was born in 1983\"? Yes, no, or maybe? No\n###\nHighly Illogical is an album which contains a collection of songs performed by \"Star Trek\" actor Leonard Nimoy. Most of the songs were originally recorded in the 1960s. The collection includes \"The Ballad of Bilbo Baggins\", which tells the story of J.R.R. Tolkien's book \"The Hobbit\", and has been immortalized by being included on various novelty compilations over the years. Are we justified in saying that \"The songs are from famous movies and books.\"? Yes, no, or maybe?", "doc_id": 669, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8560, 26019, 452, 18648], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. 
It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Stillwater Cove Regional Park is maintained by a parks department.\"? Yes, no, or maybe? Yes\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\" Are we justified in saying that \"People really like using Flatbush Avenue\n\"? Yes, no, or maybe? Maybe\n###\nH\u00e9ctor Canziani was an Argentine poet, screenwriter and film director who worked in Argentine cinema in the 1940s and 1950s. Although his work was most abundant in screenwriting and poetry after his brief film career, he is best known for his directorship and production of the 1950 tango dancing film Al Comp\u00e1s de tu Mentira based on a play by Oscar Wilde. Are we justified in saying that \"Canziani was of Argentine descent.\"? Yes, no, or maybe? Yes\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day. Are we justified in saying that \"Tunnel Vision is a debut novel.\"? Yes, no, or maybe? Yes\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar). Are we justified in saying that \"Mike D, MCA, and Ad-Rock were the only founders of the Beastie Boys.\"? Yes, no, or maybe?", "doc_id": 627, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28008, 25945, 41195, 6193], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"Juan Cruz was 22 when he appeared at the 2017 US open.\"? Yes, no, or maybe? Yes\n###\nFriday: The Animated Series was a short-lived animated television series based on the \"Friday\" film series. The show is directed by Kevin Lofton and is co-produced and co-distributed by New Line Television, a subsidiary of New Line Cinema (the distributors of the \"Friday\" movies), MTV2, and Ice Cube's Cubevision. The series only lasted for 8 episodes. Are we justified in saying that \"The series did not achieve hundreds of episodes.\"? Yes, no, or maybe? 
Yes\n###\nThe Diawling National Park lies in south west Mauritania around the Senegal River delta. During the rainy season, much of the park consists of large lakes. It is known for having over 220 species of identified birds, including pelicans, black storks, and flamingos, and also for its fish. Are we justified in saying that \"Large lakes cover the park for the majority of the seasons.\"? Yes, no, or maybe? Maybe\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners. Are we justified in saying that \"Karan Johar's was in mumbai on february 20 1999\"? Yes, no, or maybe? Maybe\n###\nBen Barzman (October 12, 1910 \u2013 December 15, 1989) was a Canadian journalist, screenwriter, and novelist, blacklisted during the McCarthy Era and best known for his screenplays for the films \"Back to Bataan\" (1945), \"El Cid\" (1961), and \"The Blue Max\" (1966). Are we justified in saying that \"Ben Barzman was born more than 1989 years ago.\"? Yes, no, or maybe?", "doc_id": 848, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44575, 9070, 44098, 31914], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode. Are we justified in saying that \"\"Fight or Flight\" starred Hayden Panettiere \"? Yes, no, or maybe? Maybe\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central. Are we justified in saying that \"The Daily Show is a massive hit.\"? Yes, no, or maybe? Maybe\n###\nO'Donnell High School is a 1A high school located in O'Donnell, Texas (USA). It is part of the O'Donnell Independent School District located in southeast Lynn County. In 2011, the school was rated \"Academically Acceptable\" by the Texas Education Agency. Are we justified in saying that \"O'Donnell Independent School District was rated \"Academically Acceptable\" in 2011\"? Yes, no, or maybe? Maybe\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949. Are we justified in saying that \"CUHK is a research university\"? Yes, no, or maybe? 
Yes\n###\nThe Brandon Learning Centre is the first school in Hong Kong to offer public speaking classes based around English Speaking Board assessments. The English Speaking Board was founded in 1954 and the qualifications are regulated by the UK Office of Qualifications and Examinations Regulation Are we justified in saying that \"The Brandon Learning Centre wanted to institute Taco Tuesday at the center, but it was vetoed by the founders\"? Yes, no, or maybe?", "doc_id": 710, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18085, 7920, 15516, 32566], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state. Are we justified in saying that \"Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a rare plant that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, US. The plant is on the Endangered species list and is the rarest known plant in Washington state.\"? Yes, no, or maybe? Yes\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Destiny was not nominated for an Oscar award.\"? Yes, no, or maybe? Yes\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer. Are we justified in saying that \"Gotz was a German diplomat so he was well-liked.\"? Yes, no, or maybe? Maybe\n###\nResil B. Mojares is a Filipino ambassador, historian, and critic of Philippine literature. He has a Ph.D. in Literature from the University of the Philippines, Diliman. A retired Professor at the University of San Carlos (USC) in Cebu City, he was a founding director (1975\u201396) of USC's Cebuano Studies Center, a pioneering local studies center in the Philippines. Are we justified in saying that \"Resil B. Mojares will run for President in 2020\"? Yes, no, or maybe? Maybe\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. 
Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock. Are we justified in saying that \"Power chords are played in more than amplified guitars \"? Yes, no, or maybe?", "doc_id": 507, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26619, 23127, 6452, 6096], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming. Are we justified in saying that \"Cape Vakop was chartered over 36 years ago\"? Yes, no, or maybe? Yes\n###\nThe 1999 Acura Classic \u2013 Doubles was the doubles event of the twentieth edition of the third tournament in the US Open Series. Martina Hingis and Natasha Zvereva were the defending champions but Hingis did not compete this year. Zvereva played with Mary Pierce, and they were defeated in the first time by Cara Black and Irina Selyutina. Are we justified in saying that \"Some participants cheated in the event\"? Yes, no, or maybe? Maybe\n###\nCon Stough is Professor of Cognitive Neuroscience and Psychology at Swinburne University of Technology, Australia, director of the Swinburne Centre for Neuropsychology and director of the newly formed National Institute of Complementary Medicine (NICM) Collaborative Centre for the study of herbal and natural medicines for neurocognition. Are we justified in saying that \"Stough is not a professor.\"? Yes, no, or maybe? No\n###\nMaya & Marty was an American television variety show that premiered on NBC on May 31, 2016 and lasted one season. The series was co-hosted by comedians Maya Rudolph and Martin Short, and produced by Lorne Michaels. The show features various comedy sketches, musical performances, and celebrity guests. Are we justified in saying that \"The show was produced by Lorne Michaels.\"? Yes, no, or maybe? Yes\n###\nDicksonia youngiae, common name bristly tree fern, is a fern that comes from cool, sheltered rainforests in New South Wales and Queensland, Australia. It is found north of the Bellinger River, in New South Wales, and can be seen in the wild at Nightcap National Park. Are we justified in saying that \"Dicksonia youngiae can only be seen in the wild at Nightcap National Park.\"? Yes, no, or maybe?", "doc_id": 615, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14815, 20575, 38067, 43096], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Leberecht Maass (or Maa\u00df) (24 November 1863 \u2013 28 August 1914) was the \"Konteradmiral\" who commanded the German naval forces at the first Battle of Heligoland Bight. He lost his life when his flagship, the light cruiser SMS \"C\u00f6ln\" , was sunk by British battlecruisers commanded by Vice Admiral David Beatty. Are we justified in saying that \"Leberecht Maass passed away in the winter of 1914.\"? Yes, no, or maybe? No\n###\nShould the World Fail to Fall Apart is the first album by the British solo artist Peter Murphy, formerly of the gothic rock band Bauhaus. The album contains Murphy's covers of Magazine's \"The Light Pours Out of Me\" and Pere Ubu's \"Final Solution.\" It was released in 1986. Are we justified in saying that \"Peter Murphy covered some songs by Magazine.\"? Yes, no, or maybe? Yes\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"Nashville West made girls go crazy.\"? Yes, no, or maybe? Maybe\n###\nBrookpark is a station on the RTA Red Line located on the borders of Brook Park and Cleveland, Ohio, USA. It is located along Brookpark Road (Ohio State Route 17), west of the intersection of Henry Ford Boulevard (Ohio State Route 291) and east of the intersection of the Berea Freeway (Ohio State Route 237). Are we justified in saying that \"Brookpark is in Cleveland\"? Yes, no, or maybe? Yes\n###\nSan Francisco Bay Ferry is a passenger ferry service on the San Francisco Bay, administered by the San Francisco Bay Area Water Emergency Transportation Authority (WETA). San Francisco Bay Ferry is not affiliated with Golden Gate Ferry, which provides passenger ferry service to Marin County. Are we justified in saying that \"Newlyweds: Nick and Jessica ended because Nick and Jessica got divorced.\"? Yes, no, or maybe?", "doc_id": 985, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9629, 5286, 15338, 8299], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968. Are we justified in saying that \"The Mets had a winning season every year between 1968 and 1974.\"? Yes, no, or maybe? No\n###\nAlexander Vincent LoScialpo (born April 29, 1981) is an American actor. 
He is known for his role as Andy Barclay in the \"Child's Play\" franchise. He has appeared in \"Child's Play\" (1988), \"Child's Play 2\" (1990), \"Curse of Chucky\" (2013), and \"Cult of Chucky\" (2017). Are we justified in saying that \"Alexander Vincent LoScialpo is an actor.\"? Yes, no, or maybe? Yes\n###\nCoraz\u00f3n Valiente (\"Fearless Heart\"), originally known as \"Ca\u00eddas del Cielo\", is a Spanish-language telenovela produced by United States-based television network Telemundo Studios, Miami, featuring an ensemble cast. Adriana Fonseca, Ximena Duque, Jos\u00e9 Luis Res\u00e9ndez and Fabi\u00e1n R\u00edos starred as the main protagonists, with Aylin Mujica and Manuel Landeta starred as the main antagonists. Are we justified in saying that \"Coraz\u00f3n Valiente is a top ten show\"? Yes, no, or maybe? Maybe\n###\nThree Preludes is a ballet made for Mikhail Baryshnikov by Mark Morris to eponymous music by George Gershwin for his own company and presented as a piece d'occasion by the New York City Ballet. The performance took place June 16, 1992, at the New York State Theater, Lincoln Center. Are we justified in saying that \"Three Preludes was performed at other theaters\"? Yes, no, or maybe? Maybe\n###\nLaura Ellen Ziskin (March 3, 1950 \u2013 June 12, 2011) was an American film producer, known as the executive producer of the 1990 romantic comedy \"Pretty Woman\", and as the first woman to produce the Academy Awards telecast alone, producing the 74th Academy Awards in 2002 and the 79th Academy Awards in 2007. Are we justified in saying that \"Laura Ellen Ziskin is the zodiac killer. \"? Yes, no, or maybe?", "doc_id": 161, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6395, 44006, 1424, 5730], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "State Route 360 (SR 360) is a state highway in the southern portion of Mineral County, Nevada, United States. The route connects the former town of Basalt to the rest of Mineral County. A road has been in the place of SR 360 since 1919, and became State Route 10 by 1929. Are we justified in saying that \"SR 10 was widened in the 1920's\"? Yes, no, or maybe? Maybe\n###\nThe 1986\u201387 St. John's Redmen basketball team represented St. John's University during the 1986\u201387 NCAA Division I men's basketball season. The team was coached by Lou Carnesecca in his nineteenth year at the school. St. John's home games are played at Alumni Hall and Madison Square Garden and the team is a member of the Big East Conference. Are we justified in saying that \" Lou Carnesecc coached in the 60s\"? Yes, no, or maybe? Yes\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006. Are we justified in saying that \"the 71st episode of \"The Sopranos\" was the last episode to be written by Terence Winter.\"? Yes, no, or maybe? 
Maybe\n###\nThe Communaut\u00e9 de communes des Trois Rivi\u00e8res (before January 2017: \"Communaut\u00e9 de communes du Pays des Trois Rivi\u00e8res\") is a federation of municipalities (\"communaut\u00e9 de communes\") in the Aisne \"d\u00e9partement\" and in the Hauts-de-France \"region\" of France. Are we justified in saying that \"Hauts de France is a small area in France\"? Yes, no, or maybe? Maybe\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr. Are we justified in saying that \"Frank Harvey Jnr. wrote Things Happen at Night .\"? Yes, no, or maybe?", "doc_id": 4, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2502, 32136, 25239, 30195], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Giovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg. Are we justified in saying that \"Giovanni Ferrero's net worth is greater than that of Bill Gates.\"? Yes, no, or maybe? Maybe\n###\nStudies in Mutualist Political Economy is a book on political economy published on 2007 by American mutualist anarchist Kevin Carson. In its preface Carson describes this work as \"an attempt to revive individualist anarchist political economy, to incorporate the useful developments of the last hundred years, and to make it relevant to the problems of the twenty-first century.\" Are we justified in saying that \"Carson also relates this to the successes of the 21st century.\"? Yes, no, or maybe? Maybe\n###\nInferno (also released with the title, Operation Cobra) is a 1997 feature film directed by Fred Olen Ray starring Don Wilson, Deepti Bhatnagar and R. Madhavan. Evan Lurie, Michael Cavanaugh and Tan\u00e9 McClure appear in other pivotal roles. Wilson plays the role of Interpol agent Kyle Connors on a mission set in India. Are we justified in saying that \"Inferno was released before 1990\"? Yes, no, or maybe? No\n###\nSavoy Brown, originally known as the Savoy Brown Blues Band, are an English blues rock band formed in Battersea, south west London in 1965. Part of the late 1960s blues rock movement, Savoy Brown primarily achieved success in the United States, where they promoted their albums with non-stop touring. Are we justified in saying that \"Savoy Brown was created in the 1950s\"? Yes, no, or maybe? No\n###\nA Day with Wilbur Robinson is a 1990 children's picture book (slightly expanded for a 2006 reissue) written and illustrated by William Joyce. A film adaptation called \"Meet the Robinsons\" was released by Walt Disney Pictures in 2007 in the United States. Are we justified in saying that \"Walt Disney Pictures releases children's movies \"? 
Yes, no, or maybe?", "doc_id": 299, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9799, 43278, 24516, 5272], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robert Mills Delaney, sometimes incorrectly spelled Delany (1903-1956) was an American composer. He studied with Nadia Boulanger and Arthur Honegger in Paris, and was best known for his 1928 choral symphony, John Brown's Song, based on Stephen Benet's Pulitzer Prize winning poem \"John Brown's Body\". Are we justified in saying that \"Robert Delaney studied in Rome.\"? Yes, no, or maybe? No\n###\nThe first season of Survival Audition K-pop Star (Korean: \uc11c\ubc14\uc774\ubc8c \uc624\ub514\uc158 K\ud31d \uc2a4\ud0c0 ) premiered on December 4, 2011, airing every Sunday evening at 6:30 pm under the \"Good Sunday\" programming block on SBS, until April 29, 2012. The first winner was Park Ji-min, who chose to sign with JYP Entertainment. Are we justified in saying that \"JYP Entertainment did not have any albums released by Park Ji-min in 2011.\"? Yes, no, or maybe? Yes\n###\nThe Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s. Are we justified in saying that \"The Final Blow won many awards.\"? Yes, no, or maybe? Maybe\n###\nAdrienne Maloof (born September 4, 1961) is an American businesswoman, television personality, shoe designer and co-owner of the various business holdings of Maloof Companies, which include a 2% stake in the Palms Casino Resort in Las Vegas, Nevada; Maloof Productions, Maloof Music and the annual Maloof Money Cup skateboarding event. Are we justified in saying that \"Maloof is an American.\"? Yes, no, or maybe? Yes\n###\nClub Deportivo Utiel is a football team based in Utiel in the autonomous community of Valencian Community. Founded in 1945, the team plays in Tercera Divisi\u00f3n \u2013 Group 6. The club's home ground is \"La Celadilla\", which has a capacity of 1,500 spectators. Are we justified in saying that \"A Tercera Divisi\u00f3n team has 1,500 spectators.\"? Yes, no, or maybe?", "doc_id": 149, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15063, 20300, 26029, 6367], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. 
He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel. Are we justified in saying that \"West Texas Wildcatters were established May 20, 1988.\"? Yes, no, or maybe? Maybe\n###\nCaroline Quentin (born Caroline Jones; 11 July 1960) is an English actress. Quentin became known for her television appearances: portraying Dorothy in \"Men Behaving Badly\" (1992\u20131998), Maddie Magellan in \"Jonathan Creek\" (1997\u20132000), and DCI Janine Lewis in \"Blue Murder\" (2003\u20132009). Are we justified in saying that \"Caroline Quentin lived at least 49 years.\"? Yes, no, or maybe? Yes\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\" Are we justified in saying that \"People really like using Flatbush Avenue to get out of queens\n\"? Yes, no, or maybe? Maybe\n###\nSwinburne Online is the online arm of Swinburne University of Technology which is an Australian university based in Melbourne, Victoria. Swinburne Online was founded in 2011 after a 50-50 joint venture between Swinburne University of Technology and SEEK Learning seeking to capitalise on increasing demand for off-campus education. Are we justified in saying that \"Swineburne Online was not created in the USA.\"? Yes, no, or maybe? Yes\n###\nTight is the debut album by the American rock band Mindless Self Indulgence. The album was originally released on April 20, 1999 through Uppity Cracker Recording Group. After having been out of print for many years, the album was reissued as Tighter on April 26, 2011 through The End Records. The reissue features updated artwork and packaging, 12 previously unreleased tracks, and a bonus DVD. Are we justified in saying that \"Tight was originally released more than 1999 months ago.\"? Yes, no, or maybe?", "doc_id": 701, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21197, 42832, 116, 37133], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Justin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL). Are we justified in saying that \"Justin was always a utility player\"? Yes, no, or maybe? Maybe\n###\nThe Vorontsov Lighthouse (Ukrainian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u044c\u043a\u0438\u0439 \u043c\u0430\u044f\u043a , Russian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u043a\u0438\u0439 \u043c\u0430\u044f\u043a ) is a famous red-and-white, 27.2 metre landmark in the Black Sea port of Odessa, Ukraine. 
It is named after Prince Mikhail Semyonovich Vorontsov, one of the governors-general of the Odessa region. Are we justified in saying that \"Sailors on the Black Sea needed a light house to guide them.\"? Yes, no, or maybe? Maybe\n###\nGlaiza Herradura-Agullo (born February 24, 1978) is a Filipino former child actress. She was the first-ever grand winner of the Little Miss Philippines segment of \"Eat Bulaga!\" in 1984. She starred in RPN-9's television series \"Heredero\" with Manilyn Reynes and Richard Arellano. She won the 1988 FAMAS Best Child Actress award for her role in \"Batas Sa Aking Kamay\" starring Fernando Poe, Jr.. Are we justified in saying that \"Herradura-Agullo was born in the 80's\"? Yes, no, or maybe? No\n###\nThe following details notable events from the year 2005 in Northern Ireland. Northern Ireland is a part of the United Kingdom in the north-east of the island of Ireland. It is variously described as a country, province or region of the UK, amongst other terms. Northern Ireland shares a border with the Republic of Ireland to the south and west. Are we justified in saying that \"Northern Ireland is a great country were Derry Girsl is set\"? Yes, no, or maybe? Maybe\n###\nMichael Shane Hollis (born May 22, 1972) is a former professional American football placekicker in the National Football League. He spent most of his nine-year professional career with the Jacksonville Jaguars, kicking for the team from 1995\u20132001 and setting several team records. He then played for the Buffalo Bills and New York Giants before retiring after an injury in 2003. Are we justified in saying that \"Michael Shane Hollis was born with a completely different name.\"? Yes, no, or maybe?", "doc_id": 58, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38520, 25980, 18663, 454], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic. Are we justified in saying that \"Lausche experiences a lot of snow on the mountain which makes it hard to climb.\"? Yes, no, or maybe? Maybe\n###\n\"The Bear and the Maiden Fair\" is the seventh episode of the third season of HBO's fantasy television series \"Game of Thrones\", and the 27th episode of the series overall. The episode was written by George R. R. Martin, the author of the \"A Song of Ice and Fire\" novels on which the series is based, and was directed by Michelle MacLaren, her directorial debut for the series. Are we justified in saying that \"The Bear and the Maiden Fair was written after the book\"? Yes, no, or maybe? Maybe\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. 
Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day. Are we justified in saying that \"Tunnel Vision is an debut american novel by author Keith Lowe from the 21st century.\"? Yes, no, or maybe? Yes\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart. Are we justified in saying that \"Look at my Dab was higher than 90 on the Billboard Top 100\"? Yes, no, or maybe? Yes\n###\nThe Prague Skate (sometimes titled Golden Skate; from 1994: Czech Skate) is an international figure skating competition. It was a senior event from the 1960s to 1997, usually held in November or December in Prague. Medals were awarded in the disciplines of men's singles, ladies' singles, and pair skating. Since 1999, it is organized in some years as part of the ISU Junior Grand Prix series. Are we justified in saying that \"The Prague Skate was a senior event for fifty years.\"? Yes, no, or maybe?", "doc_id": 466, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28791, 28047, 9985, 41561], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The cosmolabe was an ancient astronomical instrument resembling the astrolabe, formerly used for measuring the angles between heavenly bodies. It is also called pantacosm. Jacques Besson also uses this name, or universal instrument, for his invention described in \"Le cosmolabe\" (1567), which could be used for astrometry, cartography, navigation, and surveying. Are we justified in saying that \"Le cosmolabe was published more than 100 years ago.\"? Yes, no, or maybe? Yes\n###\nGreivis Josu\u00e9 V\u00e1squez Rodr\u00edguez (born January 16, 1987) is a Venezuelan professional basketball player who last played for the Brooklyn Nets of the National Basketball Association (NBA). He was drafted in 2010 after a U.S. college career with the University of Maryland men's basketball team. V\u00e1squez finished second on the Terrapins' all-time scoring list, with 2,171 career points. Are we justified in saying that \"Greivis only stays in the northern U.S..\"? Yes, no, or maybe? Maybe\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide. Are we justified in saying that \"Mutual friends is a tv series that explores the lives of a group of friends dealing with bereavement. \"? Yes, no, or maybe? 
Yes\n###\nSt Mary Magdalene's Church is a Roman Catholic Parish church in Bexhill-on-Sea, East Sussex, England. It was founded in 1893 and built in 1907 in the Gothic Revival style. It is situated on the corner of Sea Road and Magdalen Road opposite Station Road and Bexhill railway station in the centre of the town. It was designed by Arthur Young and is a Grade II listed building. Are we justified in saying that \"It was built over 10 years after being founded\"? Yes, no, or maybe? Yes\n###\nKeith Martin (1969 or 1970 \u2013 December 5, 2014), one of the world heaviest lived people, was famous for being at one point the UK\u2019s heaviest man, weighing approximately 980 lbs at his peak. Keith Martin was given a gastric bypass operation by the NHS, and had lost over 50% of his body weight. Are we justified in saying that \"He was 500 pounds in 2002\"? Yes, no, or maybe?", "doc_id": 743, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34969, 27826, 42453, 13181], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\". Are we justified in saying that \"Dostluk Spor Kul\u00fcb\u00fc is funded by the government\"? Yes, no, or maybe? Maybe\n###\nThe Kingfisher Ultra Indian Derby, or simply the Indian Derby, is an Indian annual Thoroughbred horse race. It is a 2,400-metre race held on the first Sunday of February on the Mahalaxmi Racecourse in Mumbai and is one of the premier sporting activities in the city. Are we justified in saying that \"Over two dozen horses race in The Kingfisher Ultra Indian Derby\"? Yes, no, or maybe? Maybe\n###\nLaura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture. Are we justified in saying that \"Laura Warholic is also called Sex for Dummies\"? Yes, no, or maybe? Maybe\n###\nDonald Clark \"Donny\" Osmond (born December 9, 1957) is an American singer, actor, radio personality, and former teen idol. Osmond has also been a talk and game show host, record producer and author. In the mid-1960s, he and four of his elder brothers gained fame as the Osmonds. Osmond went solo in the early 1970s, covering such hits as \"Go Away Little Girl\" and \"Puppy Love\". Are we justified in saying that \"Donny's highest paying job was when he was a singer\"? Yes, no, or maybe? Maybe\n###\nAmerican Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. 
This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company. Are we justified in saying that \"American Motors Incorporated (AMI) is a canadian company\"? Yes, no, or maybe?", "doc_id": 730, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17011, 32957, 28919, 2064], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy. Are we justified in saying that \"The Toronto FC made the post season in 2012\"? Yes, no, or maybe? No\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate. Are we justified in saying that \"The bar will no longer be made.\"? Yes, no, or maybe? Maybe\n###\nFrederick Wiseman (born January 1, 1930) is an American filmmaker, documentarian, and theatre director. His work is \"devoted primarily to exploring American institutions\". He has been called \"one of the most important and original filmmakers working today\". Are we justified in saying that \"Frederick Wiseman started documeting before theatre\"? Yes, no, or maybe? Maybe\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird. Are we justified in saying that \"The largest living bird is found in Australia.\"? Yes, no, or maybe? No\n###\nPolarbr\u00f6d is a Swedish bread company. Their head office is in \u00c4lvsbyn in northern Sweden. Polarbr\u00f6d is Sweden's third-largest bread company. Its typical product is a soft compact bread formed into round, flat shapes. It is also noted for ready-made sandwiches produced from such bread and reindeer meat, which was introduced as a product in the 1960s under the name \"renkl\u00e4mma\". Are we justified in saying that \" It is also noted for ready-made sandwiches produced from such meat and reindeer bread\"? Yes, no, or maybe?", "doc_id": 843, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4349, 23980, 24756, 1993], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"The 2002 Indian vice presidential election was held in 2002\"? Yes, no, or maybe? Yes\n###\n...In Black and White is the 12th studio album by American country artist Barbara Mandrell. The album was released in April 1982 on MCA Records and was produced by Tom Collins. It was Barbara Mandrell's first studio album in two years since the release of \"Love Is Fair\". Are we justified in saying that \"All of Barbara Mandrell's albums are on MCA records\"? Yes, no, or maybe? Maybe\n###\nClaus Biederstaedt (born 28 June 1928 in Stargard, today Poland) is a German actor and voice actor. He studied in Hamburg and began his career working with Joseph Offenbach. Among the actors for whom he has dubbed have been Yves Montand, Peter Falk, Marlon Brando, Vittorio Gassman, and James Garner. Are we justified in saying that \"Claus Biederstaedt was born in a country that underwent a name change.\"? Yes, no, or maybe? Yes\n###\nCurzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South . Are we justified in saying that \"The club has always been known as Oldham Curzon Ladies Football Club.\"? Yes, no, or maybe? No\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Nathan MacKinnon was only a forward\"? Yes, no, or maybe?", "doc_id": 274, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17110, 22293, 26, 4812], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 198th Infantry Brigade, was first formed as part of the United States Army Reserve's 99th Division. It was active from 1967 through 1971 and has been active since 2007 as an Infantry Training Brigade as part of the US Army Infantry School at Fort Benning, Georgia. Are we justified in saying that \"The 198th Infantry Brigade is made up of mostly young people.\"? Yes, no, or maybe? Maybe\n###\nThe Puerto Rico Baseball Academy and High School (PRBAHS) is a non-profit organization combining academics and sports programs into one curriculum. 
Its goal is to prepare its students for higher education, competitive college scholarship opportunities, and the Major League Baseball Draft. The PRBAHS is the only high school in Puerto Rico or the United States with this type of learning environment. Are we justified in saying that \"The PRBAHS helps prepare students for competitive college scholarship opportunities.\"? Yes, no, or maybe? Yes\n###\nCherry Tomato () is a 2008 South Korean film starring Shin Goo and Kim Hyang-gi. The family drama, a directorial debut by Jung Young-bae, depicts the poverty-stricken life of an old man and his granddaughter that evokes a strong sense of sympathy and helplessness. It was screened at the Busan Children\u2019s Film Festival in 2008. Are we justified in saying that \"Cherry Tomato starred mostly Korean actors\"? Yes, no, or maybe? Maybe\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances. Are we justified in saying that \"Spaceballs was the first comedy directed by Mel Brooks.\"? Yes, no, or maybe? Maybe\n###\nCon Stough is Professor of Cognitive Neuroscience and Psychology at Swinburne University of Technology, Australia, director of the Swinburne Centre for Neuropsychology and director of the newly formed National Institute of Complementary Medicine (NICM) Collaborative Centre for the study of herbal and natural medicines for neurocognition. Are we justified in saying that \"Stough was a professor in Austria.\"? Yes, no, or maybe?", "doc_id": 946, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7002, 33462, 1787, 30347], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mosiula Mea'alofa \"Lofa\" Tatupu (born November 15, 1982) is a former American football linebacker who played six seasons in the National Football League (NFL). He was an assistant linebackers coach with the Seattle Seahawks. He played college football for the University of Southern California (USC). Tatupu was drafted by the Seattle Seahawks in the second round of the 2005 NFL Draft. Are we justified in saying that \"Tatupu played 6 consecutive years in the NFL.\"? Yes, no, or maybe? Maybe\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams. Are we justified in saying that \"The Gaming Control Act was passed at least 100 days ago\"? Yes, no, or maybe? Yes\n###\nHidden City Entertainment was a game publisher founded in 2004 (as Hidden City Games, Inc.) 
by Jesper Myrfors and Paul Peterson to develop and market the chip-throwing game, \"Clout Fantasy.\" After Clout was developed the company recruited Peter Adkison as CEO. Are we justified in saying that \"Hidden City Entertainment makes games.\"? Yes, no, or maybe? Yes\n###\nWonders of the Universe is a 2011 book by the theoretical physicists Brian Cox and Andrew Cohen. The book is about cosmology and the universe, and is explained in a way that is accessible to a general reader. The book is based on a series with the same name \"Wonders of the Universe\". Are we justified in saying that \"The book is about space\"? Yes, no, or maybe? Yes\n###\nDatong () is a prefecture-level city in northern Shanxi province, People's Republic of China, located in a basin at an elevation of 1040 m and bordering Inner Mongolia to the north and west and Hebei to the east. It had a population of 3,318,057 at the 2010 census of whom 1,629,035 lived in the built up area made of 3 out of 4 urban districts, namely Chengqu, Kuangqu and Nanjiao District. Are we justified in saying that \"The fourth district of Datong contains a population of 3,149,029 people at the time of the 2010 census. \"? Yes, no, or maybe?", "doc_id": 929, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36485, 39117, 37152, 19001], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lucas Franchoys the Younger or Lucas Franchoys II (alternative spellings of name: Lucas Franchois, Lucas Fran\u00e7ois, Louis Franchoys) (28 June 1616 in Mechelen \u2013 3 April 1681 in Mechelen) was a Flemish Baroque painter from Mechelen, who painted numerous altarpieces and portraits in a style reminiscent of Anthony van Dyck. Are we justified in saying that \"Lucas Franchoys likes to watch late night movies\"? Yes, no, or maybe? Maybe\n###\n\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione. Are we justified in saying that \"The Flirts are still a trio.\"? Yes, no, or maybe? Maybe\n###\nSteve Koren is an Emmy Award winning writer/producer and screenwriter. Most notably he\u2019s written for \"Saturday Night Live\", \"Seinfeld\", and \"Veep\". He also wrote or co-wrote the movies \"Bruce Almighty\", \"Click\", \"A Night at the Roxbury\" and \"Superstar\". Are we justified in saying that \"Steve Koren has written at least 7 shows and movies.\"? Yes, no, or maybe? Yes\n###\nThe Prime Minister's XI or PM's XI (formerly Australian Prime Minister's Invitation XI) is an invitational cricket team picked by the Prime Minister of Australia for an annual match held at the Manuka Oval in Canberra against an overseas touring team. The Australian team usually includes up and coming players. Are we justified in saying that \"Canberra is known for it's great stadiums.\"? 
Yes, no, or maybe? Maybe\n###\nJason Ian Drucker (born \u20092005 ) is an American child actor. He starred as Greg Heffley in the 2017 film \"\". He also played Tommy Miller, the youngest of the Miller Family, in Nickelodeon's \"Every Witch Way\". In 2018, he will co-star in the \"Transformers\" spin-off \"Bumblebee\". Are we justified in saying that \"Bumblebee is the only by- product from The Transformers.\"? Yes, no, or maybe?", "doc_id": 806, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3085, 26093, 24718, 31706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States. Are we justified in saying that \"The Forum Shops features several coffee shops.\"? Yes, no, or maybe? Maybe\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar). Are we justified in saying that \"There was one guy called Adam in the Beastie Boys\"? Yes, no, or maybe? No\n###\nKnightriders, also known as George A. Romero's Knightriders, is a 1981 American drama film written and directed by George A. Romero and starring Ed Harris, Gary Lahti, Tom Savini, Amy Ingersoll, Patricia Tallman and Ken Foree. It was filmed entirely on location in the Pittsburgh metro area, with major scenes in suburban Fawn Township and Natrona. Are we justified in saying that \"There is a sequal planned for Knightriders\"? Yes, no, or maybe? Maybe\n###\nFat Mattress were an English folk rock band that formed in Folkestone in 1968. Founded by guitarist and vocalist Noel Redding, during his time as bassist for The Jimi Hendrix Experience, and vocalist Neil Landon, the band was completed by multi-instrumentalist Jim Leverton and drummer Eric Dillon. The band released two albums \u2013 \"Fat Mattress\" and \"Fat Mattress II\" \u2013 before splitting up in 1970. Are we justified in saying that \"The album \"Fat Mattress\" was very popular.\"? Yes, no, or maybe? Maybe\n###\nDavid Thomas Bush (born November 9, 1979) is an American former professional baseball pitcher. He played in Major League Baseball (MLB) for the Toronto Blue Jays, Milwaukee Brewers, Texas Rangers, and Philadelphia Phillies. Bush also played for the SK Wyverns of the KBO League. Are we justified in saying that \"David Thomas Bush played in the MLB before the KBO League.\"? Yes, no, or maybe?", "doc_id": 635, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40888, 17355, 10039, 24635], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "This is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists. Are we justified in saying that \"notable editorial cartoonists created wikipedia\"? Yes, no, or maybe? Maybe\n###\nMargarita la tornera (Margarita the Gatekeeper) is an opera in three acts composed by Ruperto Chap\u00ed to a libretto by Carlos Fern\u00e1ndez Shaw, based on a dramatic poem by Jos\u00e9 Zorrilla. It premiered on February 24, 1909 at the Teatro Real in Madrid in a performance conducted by the composer. An acclaimed recording of the opera came out in 1999 with Pl\u00e1cido Domingo and Elisabete Matos. Are we justified in saying that \"Margarita la tornera premiered in Spain in the early 1900s\"? Yes, no, or maybe? Yes\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer. Are we justified in saying that \"Gotz Freiherr von Houwald died on 7/16/2001\"? Yes, no, or maybe? No\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"\"I'm So Sorry\" deals with a taboo societal topic.\"? Yes, no, or maybe? Maybe\n###\nNational Bingo Night is an American game show hosted by Ed Sanders which premiered on ABC on May 18, 2007, with a six-episode order. Sanders is known for his work on another ABC show, \"\". The show was cancelled by ABC and was repackaged as \"Bingo America\" on GSN, first hosted by Patrick Duffy, and in October 2008 by Richard Karn. Are we justified in saying that \"National Bingo Night was hosted by Patrick Duffy.\"? Yes, no, or maybe?", "doc_id": 971, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2228, 31184, 5539, 28804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). 
Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996). Are we justified in saying that \"Kimberly Beck died in 1997.\"? Yes, no, or maybe? Maybe\n###\nLaura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture. Are we justified in saying that \"Laura Warholic; or, The Sexual Intellectual is a 2007 novel by Eugene Eyestones.\"? Yes, no, or maybe? No\n###\nPhacelia coerulea is a species of phacelia known by the common name skyblue phacelia. It is native to the California and the Southwestern United States and northern Mexico, where it grows in desert and plateau habitat types, such as scrub and woodland. Are we justified in saying that \"Skyblue phacelia has become endangered. \"? Yes, no, or maybe? Maybe\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012. Are we justified in saying that \"A nine year old could have theoretically competed in the Melodi Grand Prix Junior 2012.\"? Yes, no, or maybe? Yes\n###\n\"Snakes on a Plane (Bring It)\", also referred to as \"Bring It (Snakes on a Plane)\", is the debut single by Cobra Starship, released in 2006 from the soundtrack album \"\". The song features William Beckett of The Academy Is..., Travie McCoy of Gym Class Heroes, and Maja Ivarsson of The Sounds. Are we justified in saying that \"It was released the year after 2002\"? Yes, no, or maybe?", "doc_id": 906, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22496, 27373, 33544, 22928], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld (Coburg, 23 September 1781 \u2013 Elfenau, near Bern, Switzerland, 15 August 1860), also known as Grand Duchess Anna Feodorovna of Russia (Russian: \u0410\u043d\u043d\u0430 \u0424\u0451\u0434\u043e\u0440\u043e\u0432\u043d\u0430 ), was a German princess of the ducal house of Saxe-Coburg-Saalfeld (after 1826, the house of Saxe-Coburg-Gotha) who became the wife of Grand Duke Konstantin Pavlovich of Russia. Are we justified in saying that \"Princess Juliane Henriette Ulrike is female.\"? Yes, no, or maybe? Yes\n###\nWJMF-LP is a low-power television station in Jackson, Mississippi. The station, which currently operates on Channel 6, is owned by Rainey Radio. 
The station currently acts as a radio station broadcasting a Oldies & Classic Hits format as \"EZ 87.7\", taking advantage of that station's audio signal on 87.75 MHz FM. Are we justified in saying that \"The radio station only broadcasts at night.\"? Yes, no, or maybe? Maybe\n###\nMargaret Munnerlyn Mitchell (November 8, 1900 \u2013 August 16, 1949) was an American author and journalist. One novel by Mitchell was published during her lifetime, the American Civil War-era novel, \"Gone with the Wind\", for which she won the National Book Award for Most Distinguished Novel of 1936 Are we justified in saying that \"Margaret Munnerlyn Mitchell was not born in 1949\"? Yes, no, or maybe? No\n###\nCloverdale Depot is a bus station and future intermodal station in Cloverdale, California. It is served by Amtrak Thruway and Sonoma County Transit buses. Additional service to Sonoma County Airport station is provided by Sonoma County Transit under contract by Sonoma\u2013Marin Area Rail Transit. Are we justified in saying that \"Cloverdale Depot has buses.\"? Yes, no, or maybe? Yes\n###\nThe St. Louis Cardinals 1984 season was the team's 103rd season in St. Louis, Missouri and the 93rd season in the National League. The Cardinals went 84-78 during the season and finished 3rd in the National League East, 12\u00bd games behind their arch-rivals, the Chicago Cubs. It was also the final season of the Columbia blue road uniforms for the Cardinals. Are we justified in saying that \"after 1984 the cardinals changed uniforms\"? Yes, no, or maybe?", "doc_id": 799, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38642, 14987, 2180, 44804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. Are we justified in saying that \"Joseph Smith was not an orphan\"? Yes, no, or maybe? Yes\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer. Are we justified in saying that \"Loui Jover was in the Australian army during his childhood\"? Yes, no, or maybe? No\n###\nRear Admiral Kevin John Scarce {'1': \", '2': \", '3': \", '4': \"} (born 4 May 1952) is a retired Royal Australian Navy officer who was the 34th Governor of South Australia, serving from August 2007 to August 2014. He was succeeded by Hieu Van Le, who had previously been his lieutenant governor. Are we justified in saying that \"Kevin was well respected while he was in service.\"? Yes, no, or maybe? 
Maybe\n###\nMarcin Gortat (] ; born February 17, 1984) is a Polish professional basketball player for the Washington Wizards of the National Basketball Association (NBA). The 6\u00a0ft 11 in, 240-pound center is the son of boxer Janusz Gortat. He was a second-round draft choice of the Phoenix Suns in the 2005 NBA draft and has also played for the Orlando Magic. Are we justified in saying that \"Marcin Gortat was born less than 5556 days ago.\"? Yes, no, or maybe? No\n###\nStormRider was a simulator ride at Tokyo DisneySea. It simulated going into a weather storm in a futuristic airplane (a \"StormRider\") to dissipate the storm. The attraction opened on September 4, 2001, in the Port Discovery land of Tokyo DisneySea. The attraction closed on May 17, 2016 and replaced by a new Finding Nemo/Finding Dory simulator ride called Nemo & Friends SeaRider. Are we justified in saying that \"Tokyo DisneySea opened in 1999.\"? Yes, no, or maybe?", "doc_id": 894, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1175, 18214, 7468, 25986], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"Bosch was produced by Netflix.\"? Yes, no, or maybe? No\n###\nNosopsyllus fasciatus, the northern rat flea, is a species of flea found on domestic rats and house mice. Northern rat fleas are external parasites, living by hematophagy off the blood of rodents. It is the most widely spread of its genus, having originated in Europe, but has been transported to temperate regions all over the world. Are we justified in saying that \"Northern rat fleas are not external parasites always but it's hard to find these species.\"? Yes, no, or maybe? Maybe\n###\nMartin John Christopher Freeman (born 8 September 1971) is an English actor, who became known for portraying Tim Canterbury in the original UK version of sitcom mockumentary \"The Office\", Dr. John Watson in the British crime drama \"Sherlock\", Bilbo Baggins in Peter Jackson's \"The Hobbit\" film trilogy, and Lester Nygaard in the dark comedy-crime drama TV series \"Fargo\". Are we justified in saying that \"Freeman was born to English royalty.\"? Yes, no, or maybe? Maybe\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008. Are we justified in saying that \"He recorded, and solely mixed \"Random Access Memories\" by Daft Punk in 2013\"? Yes, no, or maybe? 
No\n###\nTodd Wash (born July 19, 1968) is an American football coach who is the defensive coordinator for the Jacksonville Jaguars of the National Football League (NFL). From 2013 to 2015 he was the defensive line coach and run game coordinator for the Jacksonville Jaguars. Are we justified in saying that \"Todd Wash was in his 60's when he was the defensive line coach for the Jacksonville Jaguars.\"? Yes, no, or maybe?", "doc_id": 945, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [526, 14958, 30979, 25537], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Homicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run. Are we justified in saying that \"Homicide: The Movie featured the same characters as the TV show.\"? Yes, no, or maybe? Maybe\n###\nUS Organization, or Organization Us, is a Black nationalist group in the United States founded in 1965. It was established as a community organization by Maulana Karenga. It was a complementary organization of the Black Panther Party in California. One of the early slogans was, \"Wherever US is, We are.\" US stands for us Black people vs 'them' the oppressors. Are we justified in saying that \"US Organization was founded by a black man.\"? Yes, no, or maybe? Maybe\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams. Are we justified in saying that \"The act was very hated\"? Yes, no, or maybe? Maybe\n###\nVanessa Alessandra Teixeira Porto (born March 16, 1984) is a Brazilian mixed martial artist and amateur boxer who competes in the Invicta Fighting Championships flyweight division. She is currently the #2-ranked 125-pound female fighter in the world according to the Unified Women's MMA Rankings. Are we justified in saying that \"Vanessa Alessandra Teixeira Porto was born over 25 years ago.\"? Yes, no, or maybe? Yes\n###\nEastland Mall is an enclosed shopping mall in Columbus, Ohio. Opened in 1968, it no longer has any open anchor stores. Its four vacant anchors were originally occupied by Lazarus, Kaufmann's (later Macy's), Sears, and JC Penney. The mall is managed by Woodmont Management. Are we justified in saying that \"The last anchor store to close at Eastland Mall was Macy's.\"? 
Yes, no, or maybe?", "doc_id": 797, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23561, 41144, 16562, 18565], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California. Are we justified in saying that \"Michelle Do has millions of fans.\"? Yes, no, or maybe? Maybe\n###\nWilliam Lewis Moody Jr. (January 25, 1865 \u2013 July 21, 1954) was an American financier and entrepreneur from Galveston, Texas, who founded a private bank, an insurance company, and one of the largest charitable foundations in the United States. Moody was active in the day-to-day operations of his companies until two days before his death. Are we justified in saying that \"Moody lived in Texas.\"? Yes, no, or maybe? Yes\n###\nStand-In is a 1937 American comedy film directed by Tay Garnett and starring Leslie Howard, Joan Blondell, and Humphrey Bogart. The picture was produced by the independent Walter Wanger, and released by United Artists. It is set in Hollywood and parodies many aspects of the film industry during the Classical Era. Are we justified in saying that \"Stand-In is a Classical Era American comedy film.\"? Yes, no, or maybe? No\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s. Are we justified in saying that \"The New Ulm Oil Company Service Station is where people used to get gas from\"? Yes, no, or maybe? Yes\n###\n\"Beez in the Trap\" is a song by rapper Nicki Minaj for her second studio album, \"\" (2012). It was written by Minaj, Maurice Jordan, and 2 Chainz, who contributed a guest verse to the song, while production was handled by Kenoe. The track was released as the album's third single on May 29, 2012 following \"Starships\" and \"Right by My Side\". Are we justified in saying that \"The song was released on the last day of May, 2012\"? Yes, no, or maybe?", "doc_id": 6, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39438, 7316, 32587, 7444], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tadpoles is the third album by the Bonzo Dog Band. 
It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\". Are we justified in saying that \"The Bonzo Dog Band has at least two other albums. \"? Yes, no, or maybe? Yes\n###\nChristelyn Karazin is an American writer, columnist, and blogger on the subject of interracial dating, particularly black women dating outside their race. She hosts the blog \"Beyond Black & White\" and has written for \"Woman's Day\", \"Ebony\", \"Jet\", and Reuters. Karazin attended Loyola Marymount University, where she wrote for \"The Los Angeles Loyolan\". Are we justified in saying that \"Christelyn Karazin has made money from writing.\"? Yes, no, or maybe? Yes\n###\nThe Leslie Motor Car company was a motor car company located in Detroit, Michigan in 1916. This automobile company was most likely named for the city of Leslie, Michigan. It was in operation for only one year and produced an unknown number of cars. Most cars of this era, were sold or given by their owners for scrap metal drives during World War II. Are we justified in saying that \"The Leslie Motor Car company was a motor car company that was made by a family.\"? Yes, no, or maybe? Maybe\n###\nMartin John Christopher Freeman (born 8 September 1971) is an English actor, who became known for portraying Tim Canterbury in the original UK version of sitcom mockumentary \"The Office\", Dr. John Watson in the British crime drama \"Sherlock\", Bilbo Baggins in Peter Jackson's \"The Hobbit\" film trilogy, and Lester Nygaard in the dark comedy-crime drama TV series \"Fargo\". Are we justified in saying that \"Freeman was the lead actor in Fargo.\"? Yes, no, or maybe? Maybe\n###\n\"A Leela of Her Own\" is the sixteenth episode in the third season of the animated series \"Futurama\". The episode is an homage to \"A League of Their Own\". It originally aired on the Fox network in the United States on April 7, 2002. Bob Uecker provided the voice of himself, Tom Kenny provided the voice of Abner Doubledeal, and Hank Aaron guest starred as himself and Hank Aaron XXIV. Are we justified in saying that \"A Leela of Her Own is the 16th episode of the 3rd season\"? Yes, no, or maybe?", "doc_id": 546, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31546, 23089, 25487, 6760], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Albert Levitt (March 14, 1887 \u2013 June 18, 1968) was a judge, law professor, attorney, and candidate for political office. While he was a memorable teacher at Washington and Lee University, and as judge of the United States District Court for the Virgin Islands ordered that woman voters must be registered, he later came to hold what some thought were eccentric views on religion. Are we justified in saying that \"Albert Levitt was not alive on March 14, 1807\"? Yes, no, or maybe? 
Yes\n###\nBaoquan () is a town in Kedong County, western Heilongjiang province, Northeast China, located on a tributary of the Nonni River more than 190 km east-northeast of the city of Qiqihar. China National Highway 202 (G202) passes through the town, which is down the road from the city of Bei'an and the county seat, which lies some 13 km to the south. Are we justified in saying that \"Baoquan is heavily populated.\"? Yes, no, or maybe? Maybe\n###\nUNI Air () is an airline based in Zhongshan, Taipei, Taiwan. It is a domestic and regional subsidiary of EVA Air. It was known as Makung Airlines (\u99ac\u516c\u822a\u7a7a) until 1996, when EVA Air took a majority share of the airline. In 1998, the airline merged with Great China Airlines (\u5927\u83ef\u822a\u7a7a) and Taiwan Airways (\u81fa\u7063\u822a\u7a7a), which EVA Air also had interests in, to form UNI Airways (UNI Air). Are we justified in saying that \"UNI Air has a terrible CEO\"? Yes, no, or maybe? Maybe\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners. Are we justified in saying that \"Kuch Kuch Hota Hai dominated the 44th filmfare awards.\"? Yes, no, or maybe? Yes\n###\nAlong the Shadow is the third studio album by American rock band Saosin, released on May 20, 2016 through Epitaph Records. The album marks the end of a three-and-a-half-year hiatus for the group with the return of original lead vocalist Anthony Green. It also marks the subsequent departure of lead guitarist Justin Shekoski. Are we justified in saying that \"Anthony Green is a licensed funeral director. \"? Yes, no, or maybe?", "doc_id": 981, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35588, 21553, 2132, 41768], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elizabeth Berridge (born May 2, 1962) is an American film and theatre actress. She is known for playing Constanze Mozart in the Academy Award-winning 1984 film \"Amadeus\", for the role of Officer Eve Eggers on \"The John Larroquette Show\" (1993-1996), and for her performances in the theater. Are we justified in saying that \"Elizabeth Berridge is not her real name\"? Yes, no, or maybe? Maybe\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex. Are we justified in saying that \"Croton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is a dull looking red plant.\"? Yes, no, or maybe? 
Maybe\n###\nWake Up, Ron Burgundy: The Lost Movie (also known as Anchorman: The Adventure Continues) is the 2004 counterpart film to the film \"\", which was also released in the same year. Directed by Adam McKay and written by McKay and Will Ferrell, it stars Ferrell, Christina Applegate, David Koechner, Steve Carell, and Paul Rudd. Are we justified in saying that \"Adam McKay chose Christina Applegate because she was blond\"? Yes, no, or maybe? Maybe\n###\n54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute. Are we justified in saying that \"54-40 will win a Grammy in 2019\"? Yes, no, or maybe? Maybe\n###\nRonald Reagan is a bronze sculpture depicting the American politician of the same name by Chas Fagan, installed at the United States Capitol's rotunda, in Washington, D.C., as part of the National Statuary Hall Collection. The statue was donated by the U.S. state of California in 2009, and replaced one depicting Thomas Starr King, which the state had gifted in 1931. Are we justified in saying that \"The new sculpture replaces one that stood for eighty years.\"? Yes, no, or maybe?", "doc_id": 800, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4168, 9758, 24540, 6919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Leader of the Opposition of Singapore is usually the leader of the second largest political party represented in the Parliament of Singapore. During the 1955 Legislative Assembly election, the late Lee Kuan Yew was the \"de facto\" Leader of the Opposition, as the People's Action Party was then the second largest political party represented in the Legislative Assembly. Are we justified in saying that \"Lee Kuan Yew was a leader in a Opposition party in Singapore. \"? Yes, no, or maybe? Yes\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress. Are we justified in saying that \"Tsewang Rigzin was president of the Tibetan Youth Congress in 2006\"? Yes, no, or maybe? No\n###\nJ\u00fcrgen Melzer (born 22 May 1981 in Vienna) is an Austrian tennis player. He reached a career-high singles ranking of world No. 8 in April 2011, and a doubles ranking of world No. 6 in September 2010. He is a left-handed tennis player, but is right-handed in everyday life. He has a younger brother, Gerald Melzer, with whom he has played doubles in several tournaments. Are we justified in saying that \"J\u00fcrgen Melzer decided during his lifetime that he wanted to live in the United States.\"? Yes, no, or maybe? Maybe\n###\nPeter L. N. 
Padfield (born 1932) is a British author, biographer, historian, and journalist who specializes in naval history and in the Second World War period. His early journalism appeared under the name P. L. N. Padfield. As well as his non-fiction work, he has also published four novels. Are we justified in saying that \"Padfield was born after 1932.\"? Yes, no, or maybe? No\n###\nEnrique Leff (born Mexico, 1946) is a Mexican economist, who defines himself today as an environmental sociologist and environmentalist. He has written 25 books and 180 articles on political ecology, environmental sociology, environmental economics, environmental epistemology and environmental education. He is regarded as one of the key environmental thinkers in Latin America. Are we justified in saying that \"Enrique Leff has published less than 200 articles.\"? Yes, no, or maybe?", "doc_id": 418, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1139, 1361, 2359, 41073], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Prema Thapassu is a 1991 Telugu romance drama film, produced by Sri Sai Madhavi Productions and directed by Dr. N. Siva Prasad. The film stars Rajendra Prasad and Roja in the lead roles and music also composed by Rajendra Prasad . The film is first debut to actress Roja into film industry. The film was a \"flop\" at the box office. Are we justified in saying that \"Prema Thapassu had a sequel.\"? Yes, no, or maybe? Maybe\n###\nHakea preissii, commonly known as the Needle tree, Needle bush and Christmas hakea, is a shrub or tree of the genus \"Hakea\" native to an area in the Pilbara, Wheatbelt, Mid West and Goldfields-Esperance regions of Western Australia. The Noongar name for the plant is Tanjinn. Are we justified in saying that \"Christmas hakea would make a great alternative Christmas tree at holiday time! \"? Yes, no, or maybe? Maybe\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg. Are we justified in saying that \"Marie Hedwig Auguste von Sulzbach was a Countess Palatine of Sulzbach by birth and by marriage who died of influenza in 1681.\"? Yes, no, or maybe? Maybe\n###\nIn ancient Roman religion, Antevorta was a goddess of the future, also known as Porrima. She and her sister Postverta (or Postvorta) were described as companions or siblings of the goddess Carmenta, sometimes referred to as \"the Carmentae\". They may have originally been two aspects of Carmenta, namely those of her knowledge of the future and the past (compare the two-faced Janus). Are we justified in saying that \"Antevorta was a goddess of fate.\"? Yes, no, or maybe? No\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. 
For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar). Are we justified in saying that \"Members of the Beastie Boys generally stayed together most of their careers\"? Yes, no, or maybe?", "doc_id": 694, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40289, 9618, 33352, 12705], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Seton Abele (born January 28, 1967) is an American businessman and Democratic Party politician. He is the current Milwaukee County Executive. Abele is the son of American businessman John Abele, the co-founder of Boston Scientific. Abele serves as a trustee of the Argosy Foundation, a charitable trust established with an endowment from his father. Are we justified in saying that \"Christopher Abele was in business for his father to earn the money to start Argosy Foundation. \"? Yes, no, or maybe? Maybe\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968. Are we justified in saying that \"The New York Mets placed fifth in the National League West in 1974.\"? Yes, no, or maybe? No\n###\nThe 1941 Cabo San Lucas hurricane is considered one of the worst tropical cyclones on record to affect Cabo San Lucas. The hurricane was first reported on September\u00a08 off the coast of Mexico. It slowly moved northwestward while intensifying. After peaking in intensity, it entered the Gulf of California, and weakened rapidly. It dissipated on September\u00a013. Are we justified in saying that \"The 1941 Cabo San Lucas hurricane was downgraded to a topical cyclone on September 12\"? Yes, no, or maybe? Maybe\n###\nAndrea M\u00f3nica Montenegro DeFreitas, known as Andrea Montenegro (born 4 March 1969 in Lima, Peru), is a Peruvian actress and model well known for her participation in various telenovelas such as Zorro, la Espada y la Rosa, Latin Lover (2001), La viuda de la Mafia (2004) and currently in Telemundo's El Clon. She has a daughter Muriel and a son Amaru. Are we justified in saying that \"Andrea has four children.\"? Yes, no, or maybe? No\n###\nThe American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada. Are we justified in saying that \"The La Serie ACT was formerly known as Serie ACT Castrol Tour.\"? 
Yes, no, or maybe?", "doc_id": 211, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19330, 31750, 8035, 33229], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Spy for a Day is a 1940 British comedy thriller film directed by Mario Zampi and starring Douglas Wakefield, Paddy Browne and Jack Allen. During the First World War a British farmer is abducted by the Germans to take the place of a spy about to be executed whom he closely resembles. Are we justified in saying that \"Spy for a Day is a British comedy film about a British farmer who is abducted by Germans due to his resemblance to a a spy they are about to execute and need a replacement for, and they execute him instead.\"? Yes, no, or maybe? Maybe\n###\nEthan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films. Are we justified in saying that \"Suplee prefers acting on tv rather than movies.\"? Yes, no, or maybe? Maybe\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Stillwater Cove Regional Park is in CA\"? Yes, no, or maybe? Yes\n###\nMarcin Gortat (] ; born February 17, 1984) is a Polish professional basketball player for the Washington Wizards of the National Basketball Association (NBA). The 6\u00a0ft 11 in, 240-pound center is the son of boxer Janusz Gortat. He was a second-round draft choice of the Phoenix Suns in the 2005 NBA draft and has also played for the Orlando Magic. Are we justified in saying that \"Polish people make excellent athletes. \"? Yes, no, or maybe? Maybe\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008. Are we justified in saying that \"Huizar was born the year after 1983.\"? Yes, no, or maybe?", "doc_id": 170, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19058, 15666, 30832, 29020], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zambian Breweries Plc is a Zambian brewing and beverage company listed on the Lusaka Stock Exchange. Its brews mainly pale lagers and a clear sorghum lager. It is also a major bottler of Coca-Cola. It has two breweries and three bottling plants. As of 2017 international brewing giant SABMiller owned 87% of Zambrew. Market capitalization was ZMW3,385,200,000 or about USD 372,000,000. Are we justified in saying that \"As of 2013 international brewing giant SABMiller owned 87% of Zambrew. Market capitalization was ZMW3,385,200,000 or about USD 372,000,000.\n\"? Yes, no, or maybe? No\n###\nGlobacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide. Are we justified in saying that \"GLO will create a secondary headquarters in East Africa, to offer services there.\"? Yes, no, or maybe? Maybe\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films. Are we justified in saying that \"Ileana Carusio was not a popular pornstar\"? Yes, no, or maybe? Maybe\n###\nRBG Resources was a British public-limited firm based in London that was allegedly involved in a serious fraud worth close to \u00a3300 million (US$600 million). RBG Resources made $1.1 billion in sales in 2000. It was an affiliate of the United States based Allied Deals Inc., which was also involved in the fraud, and resulted in 14 people convicted or pleading guilty to related crimes. Are we justified in saying that \"RBG Resources has never been in trouble with the law.\"? Yes, no, or maybe? No\n###\nNelson is an American rock band founded by singer/songwriters Matthew and Gunnar Nelson (twin sons of Ricky Nelson and Kristin Nelson). The band achieved success during the early 1990s with their double platinum debut album \"After the Rain\", which featured the number-one hit \"(Can't Live Without Your) Love and Affection\". Are we justified in saying that \"Nelson has an E.\"? Yes, no, or maybe?", "doc_id": 249, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41019, 9308, 32144, 39271], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "What Happens Next is the eighth studio album by English post-punk band Gang of Four. 
It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band. Are we justified in saying that \"The band formed in 1976.\"? Yes, no, or maybe? Maybe\n###\nOrange, Red, Yellow is a 1961 Color Field painting by Mark Rothko. It sold at Christie's for $86.882.500 on May 8, 2012. The seller was the estate of David Pincus and the sale price represents a record nominal price for Post-War / contemporary art at public auction and for Rothko works in general. Are we justified in saying that \"Orange, Red, Yellow sold for more than $86,882,499.\"? Yes, no, or maybe? Yes\n###\nJunoon (Hindi: \u091c\u0941\u0928\u0942\u0928, translation: \"The Obsession\") is a 1978 Indian Hindi language film produced by Shashi Kapoor and directed by Shyam Benegal. The film is based on Ruskin Bond's fictional novella, \"A Flight of Pigeons\", set around the Indian Rebellion of 1857. The film's soundtrac was composed by Vanraj Bhatia, and cinematography by Govind Nihalani. Are we justified in saying that \"the soundtrack doesn't feature any lyrics\"? Yes, no, or maybe? Maybe\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996. Are we justified in saying that \"I am a tall man.\"? Yes, no, or maybe? Maybe\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"End of the Past was published in the year two thousand and sixteen. \"? Yes, no, or maybe?", "doc_id": 575, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32136, 34446, 18257, 7308], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Studies in Mutualist Political Economy is a book on political economy published on 2007 by American mutualist anarchist Kevin Carson. In its preface Carson describes this work as \"an attempt to revive individualist anarchist political economy, to incorporate the useful developments of the last hundred years, and to make it relevant to the problems of the twenty-first century.\" Are we justified in saying that \"Carson also relates this to the successes of the 21st century.\"? Yes, no, or maybe? 
Maybe\n###\nEthan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films. Are we justified in saying that \"Ethan Suplee played Randy in My Name is Earl. \"? Yes, no, or maybe? Yes\n###\nThe Carrier Sekani Tribal Council (familiarly known as CSTC) is a tribal council representing eight First Nations in the Central Interior of British Columbia. It was originally known as the \"Lakes District Tribal Council\". The CSTC was incorporated in 1979 and is a registered non-profit society. Are we justified in saying that \"The Carrier Sekani Tribal Council is also known as the CSTC\"? Yes, no, or maybe? Yes\n###\nThe iHeartRadio Much Music Video Awards (also known as the MMVAs, and originally known as the Canadian Music Video Awards until 1995, and formerly and commonly known as the MuchMusic Video Awards) are annual awards presented by the Canadian television channel Much to honour the year's best music videos. Are we justified in saying that \"The iHeartRadio Much Music Video Awards once went by a different name.\"? Yes, no, or maybe? Yes\n###\nOur Lady of Confidence, also known as La Madonna della Fiducia or Our Lady of Trust, is a venerated image depicting the Blessed Virgin Mary enshrined at the Lateran Basilica. The feast of Our Lady of Confidence falls on the last Saturday prior to Lent. Are we justified in saying that \"Our Lady of Confidence occurs on more than one day each year\"? Yes, no, or maybe?", "doc_id": 628, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2137, 6464, 7744, 24950], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Natasha Choufani is a Lebanese actress. Born and raised in the UAE, she grew up in a multi-cultural society. Her ability to act in different dialects and languages had helped open many doors to playing diverse characters in theater, film and TV at home and abroad. Are we justified in saying that \"Natasha Choufani was not an actress. \"? Yes, no, or maybe? No\n###\nThe Ghost and Mrs. Muir (1947) is a romantic-fantasy film starring Gene Tierney and Rex Harrison. It was directed by Joseph L. Mankiewicz, and is based on a 1945 novel written by Josephine Leslie under the pseudonym of R. A. Dick. In 1945, 20th Century Fox bought the film rights to the novel, which had been published only in the United Kingdom at that time. It was shot entirely in California. Are we justified in saying that \"The Ghost and Mrs. Muir film closely follows the plot of a book.\"? Yes, no, or maybe? Maybe\n###\nPeter Andreas Thiel ( ; born October 11, 1967) is an American entrepreneur, venture capitalist, philanthropist, political activist, and author. He was ranked No. 4 on the \"Forbes\" Midas List of 2014, with a net worth of $2.2 billion, and No. 246 on the \"Forbes\" 400 in 2016, with a net worth of $2.7 billion. 
Are we justified in saying that \"Thiel donates money to animal shelters.\"? Yes, no, or maybe? Maybe\n###\nCarol Hernandez is an American journalist from Miami Florida. She won a 1996 Goldsmith Prize for Investigative Reporting. She won the 1996 Pulitzer Prize for National Reporting. She currently resides in Long Island with her husband, and three children, (the oldest being the best and most funny and creative). Are we justified in saying that \"Carol Hernandez has the same birthday as her husband.\"? Yes, no, or maybe? Maybe\n###\nJoona Veteli (born 21 April 1995) is a Finnish football player currently playing for Norwegian OBOS-ligaen side Fredrikstad. Veteli plays in the position of centre midfielder but can also operate as an attacking midfielder, defensive midfielder, right-back and winger. Are we justified in saying that \"Joona Veteli is paid to play sports\"? Yes, no, or maybe?", "doc_id": 747, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18736, 20289, 44392, 24090], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "End of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"Nadeem F. Paracha speaks English.\"? Yes, no, or maybe? Maybe\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"The Beatles were lazy and didn't want to release a new album with original material.\"? Yes, no, or maybe? Maybe\n###\nForest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier. Are we justified in saying that \"Timoon Animation was the sole company created by Philippe Mounier\"? Yes, no, or maybe? Maybe\n###\nThe Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue. Are we justified in saying that \"The Drake Hotel has at least two places where you can get food.\"? Yes, no, or maybe? Yes\n###\nThe Louvin Brothers were an American musical duo composed of brothers Ira Lonnie Loudermilk (1924\u20131965) and Charlie Elzer Loudermilk (1927\u20132011), better known as Ira and Charlie Louvin. 
The brothers are cousins to John D. Loudermilk, a Nashville Songwriters Hall of Fame member. Are we justified in saying that \"Ira and Charlie Louvin are cousins.\"? Yes, no, or maybe?", "doc_id": 909, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20453, 9872, 28298, 28335], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Going My Way is an American comedy-drama series starring dancer and actor Gene Kelly. Based on the 1944 film of the same name starring Bing Crosby, the series aired on ABC with new episodes from October 3, 1962 to April 24, 1963. The program was Kelly's first and only attempt at a weekly television series. The series was canceled after one season of thirty episodes. Are we justified in saying that \"Going my Way released new Episodes after the release of the original film.\"? Yes, no, or maybe? Yes\n###\nUnited Spirits Limited, abbreviated to USL, is an Indian alcoholic beverages company, and the world's second-largest spirits company by volume. It is a subsidiary of Diageo, and headquartered at UB Tower in Bangalore, Karnataka. USL exports its products to over 37 countries. Are we justified in saying that \"The spirits that ESL makes tastes awful.\"? Yes, no, or maybe? Maybe\n###\nSongbook is an acoustic live album by American musician and Soundgarden vocalist Chris Cornell, released on November 21, 2011. The live album features songs recorded during Cornell's Songbook Tour, an acoustic solo tour which took place during March\u2013May 2011 in the US, and is his first live album as a solo artist. Are we justified in saying that \"Chris Cornell released his live album with his band.\"? Yes, no, or maybe? No\n###\nThe 2007 Hertsmere Borough Council election took place on 3 May 2007 to elect members of Hertsmere Borough Council in Hertfordshire, England. One third of the council was up for election and the Conservative party stayed in overall control of the council. Are we justified in saying that \"The Conservative party had a large party to celebrate.\"? Yes, no, or maybe? Maybe\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Despite being a French-Egyptian film, Destiny was filmed in neither France or Egypt.\"? Yes, no, or maybe?", "doc_id": 85, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41992, 25643, 15190, 34021], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grant Taylor (Born October 30,1991) is an American professional skateboarder. He is the son of former professional skateboarder Thomas Taylor and won Thrasher Magazine's \"Skater of The Year\" in 2011. Grant\u2019s style of skateboarding is known to be fast and powerful. He is recognized for his unique versatile skateboarding. Are we justified in saying that \"Grant Taylor won Thrasher Magazine's \"Skater of the Year\" in 2011 because his style is fast and powerful.\"? Yes, no, or maybe? Yes\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions. Are we justified in saying that \"Project Gasbuggy did not go as planned.\"? Yes, no, or maybe? Maybe\n###\nEdward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis. Are we justified in saying that \"E. Annis aka T. Hart wrestled for AAA.\"? Yes, no, or maybe? Yes\n###\nDavid Gregory \"Dave\" Smith (born 24 July 1955) is a retired male race walker from Australia, who represented his native country at two consecutive Summer Olympics, starting in 1980 (Moscow). His best Olympic result was finishing in tenth place in the men's 20\u00a0km race at the 1984 Summer Olympics. Are we justified in saying that \"Dave Smith is a retired female race walker\"? Yes, no, or maybe? No\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898. Are we justified in saying that \"Grimsby Town Football Club was a bottom tier football team\"? Yes, no, or maybe?", "doc_id": 301, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20992, 44343, 23399, 11723], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. 
The film received mixed reviews from critics and grossed $67 million worldwide. Are we justified in saying that \"the film did not make more than 5 million dollars\"? Yes, no, or maybe? No\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene. Are we justified in saying that \"\"Something from Nothing\" was inspired by pop music\"? Yes, no, or maybe? Maybe\n###\nMount Willey is a mountain located in Grafton County, New Hampshire. The mountain is named after Samuel Willey, Jr. (1766\u20131826) and his family, who in 1825 moved into a house in Crawford Notch. The family was killed a year later in August 1826 during a landslide. Are we justified in saying that \"Samuel Willey, Jr. was born less than 10000 days ago.\"? Yes, no, or maybe? No\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley. Are we justified in saying that \"Colorz of Rage was the first 1999 film to have a female lead.\"? Yes, no, or maybe? Maybe\n###\nDerek Ervin Smith (November 1, 1961 \u2013 August 9, 1996) was an American professional basketball player. He won a national championship with the Louisville Cardinals in 1980, and spent nine years in the NBA in a career shortened by a knee injury. He would later become an assistant coach for the Washington Bullets from 1994 until his death. Are we justified in saying that \"Derek Ervin Smith officially retired before his death.\"? Yes, no, or maybe?", "doc_id": 388, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14655, 23974, 36937, 34982], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Superman's Dead\" is a song by Canadian alternative rock group Our Lady Peace. It was released in December 1996 as the lead single from their second album \"Clumsy\". This has become one of Our Lady Peace's most popular songs in both Canada and the U.S., as well as many other parts of the world. Are we justified in saying that \"Superman's Dead was the third single. \"? Yes, no, or maybe? No\n###\nGunfighters of Casa Grande (Spanish: \"Los Pistoleros de Casa Grande\" ) is a 1964 Eurowestern film, co-produced by American and Spanish producers. Based on a story by Borden and Patricia Chase, it was later developed into a screenplay with the assistance of screenwriter Clark Reynolds and directed by Roy Rowland, the last film he made for Metro-Goldwyn-Mayer. Are we justified in saying that \"Gunfighters of Casa Grande took place in Western Europe.\"? Yes, no, or maybe? Maybe\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. 
The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label. Are we justified in saying that \"Bad Company has many hit songs.\"? Yes, no, or maybe? Maybe\n###\nG.I. Joe: Ninja Battles is a film that was released on DVD in late 2004, as part of the Ninja Battles set of figures. In it, the history of the Arashikage Clan, as well as the history of Snake Eyes and Storm Shadow's rivalry, are examined through a series of trials. Scenes from both \"\" and \"\" are used, with a brief period of new animation at the end of the movie. Are we justified in saying that \"G.I. Joe: Ninja Battles was a failure\"? Yes, no, or maybe? Maybe\n###\nMultnomah University (MU) is a non-denominational Christian university in Portland, Oregon, United States. Multnomah consists of a college, graduate school, seminary and Degree Completion Program, and the university offers bachelor's, master's and doctorate degrees in a wide range of fields. Are we justified in saying that \"Multnomah University is located on the West coast of the U.S.\"? Yes, no, or maybe?", "doc_id": 523, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36850, 7821, 33970, 40526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University. Are we justified in saying that \"Shannon Kelley wants to become the head coach.\"? Yes, no, or maybe? Maybe\n###\nCorey Gibson, known professionally as Corey Chorus, is an American songwriter, record producer, vocal producer, sound engineer and publisher, known for having written songs such as Cheers (Drink to That) of Rihanna, Chica Bomb by Dan Balan, Made in the USA by Demi Lovato. Are we justified in saying that \"Corey Gibson is a famous singer.\"? Yes, no, or maybe? No\n###\nJohns Creek is a city located in Fulton County in the U.S. state of Georgia. According to the 2010 U.S. Census, the population was 76,728. The city is an affluent northeastern suburb of Atlanta. In 2017 Johns Creek ranked third on the \"USA TODAY\" list of \"50 best cities to live in.\" Are we justified in saying that \"The population was more than 80,000\"? Yes, no, or maybe? No\n###\nNogiBingo! ( stylized as NOGIBINGO!) is a Japanese television variety show starring Japanese idol girl group Nogizaka46. Ijily Okada, who is known for many AKB48 related show such as \"AKB48 Nem\u014dsu TV\", hosted the program. The show firstly aired on July 3, 2013, as part of the variety show \"Nogizaka46 x HKT48 Kanbangumi Battle!\", and it became an independent show from the second season. Are we justified in saying that \"Ijily Okada knows of many Japanese variety shows.\"? Yes, no, or maybe? 
Yes\n###\nFather Xmas is a 2001 short film from director Marie Rose and the American Film Institute's Directing Workshop for Women starring Dakota Fanning as six-year-old Clairee who learns from her older brother (Stephen Fanning) that Santa Claus is not real and that their father is fighting in the Vietnam War. Are we justified in saying that \"Father Xmas is a 2001 short film from director Marie Rose and the American Movie Institute's Directing Workshop for Women\"? Yes, no, or maybe?", "doc_id": 620, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18308, 23165, 38200, 37216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Franklin Martin Loew, DVM, PhD, (1939 in Syracuse, NY \u2013 2003 in Boston, MA) was president of Becker College, dean of the College of Veterinary Medicine at Cornell University and dean of Tufts University School of Veterinary Medicine (now Tufts Cummings School of Veterinary Medicine). Are we justified in saying that \"Tufts University School of Veterinary medicine had its name changed to Tufts Cummings School of Veterinary Medicine.\"? Yes, no, or maybe? Yes\n###\nIlse von Glatz (August 21, 1958 \u2013 May 2, 2014) was a Canadian actress who played an Advocate in the 1988 science fiction TV series \"War of the Worlds\". She also worked in \"The Mind of Simon Foster\" (episode of \"the 1985 version of The Twilight Zone\"). She also appeared in at least one episode of \"\" in 1989. Are we justified in saying that \"The show was science fiction\"? Yes, no, or maybe? Yes\n###\nForest Hill Vineyard (also referred to as Forest Hill Wines) is an Australian winery business based in the Great Southern wine region of Western Australia. Its vineyard is west of Mount Barker, and its winery and cellar door are further south, at Denmark. Are we justified in saying that \"Forest Hill Vineyard (also referred to as Forest Hill Wines) is a successful Australian winery business based in the Great Southern wine region of Western Australia.\"? Yes, no, or maybe? Maybe\n###\nDostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\". Are we justified in saying that \"stluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1971 as a en's football club in Istanbul, Turkey.\"? Yes, no, or maybe? No\n###\nJacques Tourneur (] ; November 12, 1904 \u2013 December 19, 1977) was a French film director known for the classic film noir \"Out of the Past\" and a series of low-budget horror films he made for RKO Studios, including \"Cat People\", \"I Walked with a Zombie\" and \"The Leopard Man\". He is also known for directing \"Night of the Demon\", that was released by Columbia Pictures. Are we justified in saying that \"Jacques Tourneur began his career as a film director in 1904.\"? 
Yes, no, or maybe?", "doc_id": 32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21426, 20028, 28096, 22967], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Finniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla. Are we justified in saying that \"Kangaroo Island, the Fleurieu Peninsula, Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla are all within an area that is less than 8750 km. \"? Yes, no, or maybe? Yes\n###\nDan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler. Are we justified in saying that \"Dan Deacon is a small man\"? Yes, no, or maybe? Maybe\n###\nHoodlum is a 1997 American crime drama film that gives a fictionalized account of the gang war between the Italian/Jewish mafia alliance and the Black gangsters of Harlem that took place in the late 1920s and early 1930s. The film concentrated on Ellsworth \"Bumpy\" Johnson (Laurence Fishburne), Dutch Schultz (Tim Roth), and Lucky Luciano (Andy Garc\u00eda). Are we justified in saying that \"Laurence Fishburne and Andy Garc\u00eda are 2 of the stars of Hoodlum\"? Yes, no, or maybe? Yes\n###\nMisty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue. Are we justified in saying that \"Misty Knight has been read by Trump.\"? Yes, no, or maybe? Maybe\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex. Are we justified in saying that \"Muccan Station is on the radio\"? Yes, no, or maybe?", "doc_id": 493, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6711, 16084, 24903, 32032], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alexander Grinberg (\u0410\u043b\u0435\u043a\u0441\u0430\u043d\u0434\u0440 \u0414\u0430\u043d\u0438\u043b\u043e\u0432\u0438\u0447 \u0413\u0440\u0438\u043d\u0431\u0435\u0440\u0433, Aleksandr Danilovich Grinberg) (1885\u20131979) was a Russian and Soviet photographer. n 1908 he was awarded the silver medal in the all-Russian photo exhibition in Moscow and the gold medal in the international photo-exhibition in Dresden. Are we justified in saying that \" Alexander Grinberg was a homosexual.\"? Yes, no, or maybe? Maybe\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. Gene Mayer won the singles title. Are we justified in saying that \"The 1982 Bavarian Tennis Championships was held in Cologne, Germany\"? Yes, no, or maybe? No\n###\nPassion Play is a 2010 American drama film written and directed by Mitch Glazer, executive produced by Rebecca Wang and starring Mickey Rourke, Megan Fox, Rhys Ifans and Bill Murray. Filming for the production began in December 2009 and is presented by Rebecca Wang Entertainment. It premiered at the 2010 Toronto International Film Festival. Are we justified in saying that \"Passion Play had Mickey Rourke as the lead role\"? Yes, no, or maybe? Maybe\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"Johan Martin Schr\u00f6der has a large extended family.\"? Yes, no, or maybe? Maybe\n###\nAtiha Sen Gupta (born 1988) is a British playwright and screenwriter. She is writer-in-residence for 2016-2017 at Theatre Royal Stratford East in London, where her play \"Counting Stars\" was produced in 2016. In the same year she won the International Achievement Recognition Awards (IARA) Award for Best Playwright. Are we justified in saying that \"Atiha won IARA Award for Counting Stars.\"? Yes, no, or maybe?", "doc_id": 303, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25184, 21202, 35474, 11996], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts. 
Are we justified in saying that \"David Lee Murphy likes driving\"? Yes, no, or maybe? Maybe\n###\nBullitt East High School is a high school located at 11450 Highway 44 East in the city of Mount Washington, Kentucky. It is part of the Bullitt County Public Schools district. Sports teams include: Archery, Swimming, Football, Soccer, Tennis, Track and Field, Baseball, Softball, Wrestling, Basketball, Volleyball and Cheerleading. Are we justified in saying that \"Bullitt High School has more female students than male students.\"? Yes, no, or maybe? Maybe\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km . Are we justified in saying that \"Jiaozhou Bay Bridge has a police station\"? Yes, no, or maybe? Maybe\n###\n\"Beyond This Earthly Realm\" is the eleventh episode of the fourth season of the American animated television series \"Adventure Time\". The episode was written and storyboarded by Ako Castuera and Jesse Moynihan, from a story by Patrick McHale, Kent Osborne, and Pendleton Ward. It originally aired on Cartoon Network on June 11, 2012. Are we justified in saying that \"Beyond This Earthly Realm is a cartoon episode.\"? Yes, no, or maybe? Yes\n###\nGood is a 2008 drama film based on the stage play of the same name by C. P. Taylor. It stars Viggo Mortensen, Jason Isaacs, and Jodie Whittaker, and was directed by Vicente Amorim. The film premiered at the Toronto International Film Festival on 8 September 2008. Are we justified in saying that \"Good premiered more than 10 years ago\"? Yes, no, or maybe?", "doc_id": 234, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21082, 9365, 44651, 2580], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\". Are we justified in saying that \"Dungeon Master's Guide has at least 4 editions.\"? Yes, no, or maybe? Yes\n###\nSimon Corbell (born 21 November 1970) is a former Australian politician and Deputy Chief Minister of the Australian Capital Territory. He was also Attorney-General, Minister for Health, Minister for the Environment and Minister for the Capital Metro. Are we justified in saying that \"Simon Corbell was born less than 5000 hours ago.\"? Yes, no, or maybe? No\n###\nNeil Sedaka: Italiano is a 1964 compilation album containing twelve of Neil Sedaka's Italian-language recordings. It was released in Italy by RCA Victor's Italiana studios. Of the twelve songs on the album, six were recorded by Sedaka in English. 
A seventh song on the album, \"A 16 Anni Tu Vuoi Amare\", is an Italian-language version of Andrea Carroll's 1963 hit, \"It Hurts To Be Sixteen\". Are we justified in saying that \"The compilation album Neil Sedaka: Italiano was released in nineteen hundred sixty five.\"? Yes, no, or maybe? No\n###\nBrash Young Turks is a 2016 coming-of-age British crime film directed by Naeem Mahmood and co-directed by his brother Ash Mahmood that tells a fast paced struggle love, crime and power, against all odds. The film stars Melissa Latouche, Paul Chiedozie, Tom Bott, Richard Shelton and Julian Glover among a large ensemble cast. Are we justified in saying that \"Brash Young Turks is not a Russian film.\"? Yes, no, or maybe? Yes\n###\nThe Program in Creative Writing, more commonly known as the Iowa Writers' Workshop, at the University of Iowa in Iowa City, Iowa, is a much-celebrated graduate-level creative writing program in the United States. Writer Lan Samantha Chang is its director. Graduates earn a Master of Fine Arts (MFA) degree in Creative Writing. Are we justified in saying that \"Students in this program must have a bachelor's degree to participate.\"? Yes, no, or maybe?", "doc_id": 82, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18896, 15747, 45297, 22319], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House. Are we justified in saying that \"The film Punjabi was the winner of film awards. \"? Yes, no, or maybe? Maybe\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel. Are we justified in saying that \"Kasey Peters threw for over 10,000 yards in his career.\"? Yes, no, or maybe? Maybe\n###\nDame Nicola Mary Brewer DCMG is a British diplomat and university administrator. In May 2014 she was appointed Vice-Provost (International) at University College London. She is a non-executive director of Aggreko. Brewer was British High Commissioner to South Africa from 2009 to 2013. Are we justified in saying that \"Dame Nicola Mary Brewer is a British diplomat and university administrator who was appointed Vice-Provost at the University College London while at the same time also working as the High commissioner to South Africa.\"? Yes, no, or maybe? 
No\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The man had serious mental problems.\"? Yes, no, or maybe? Maybe\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"University of Bologna is a former Roman Catholic Church\"? Yes, no, or maybe?", "doc_id": 974, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40306, 39333, 43108, 21780], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The History Boys is a 2006 British comedy-drama film adapted by Alan Bennett from his play of the same name, which won the 2005 Olivier Award for Best New Play and the 2006 Tony Award for Best Play. It was directed by Nicholas Hytner, who directed the original production at the Royal National Theatre in London, and features the original cast of the play. Are we justified in saying that \"The History Boys was filmed in 2005\"? Yes, no, or maybe? Maybe\n###\nThe East\u2013West Shrine Game is an annual postseason college football all-star game played each January since 1925. The game is sponsored by the fraternal group Shriners International, and the net proceeds are earmarked to some of the Shrine's charitable works, most notably the Shriners Hospitals for Children. The game's slogan is \"Strong Legs Run That Weak Legs May Walk\". Are we justified in saying that \"East-West Shrine Game is played every year.\"? Yes, no, or maybe? Yes\n###\nSusan Lynch (born 5 June 1971) is a Northern Irish actress. A three-time IFTA Award winner, she also won the British Independent Film Award for Best Supporting Actress for the 2003 film, \"16 Years of Alcohol\". Her other film appearances include \"Waking Ned\" (1998), \"Nora\" (2000), \"Beautiful Creatures\" (2000), and \"From Hell\" (2001). Are we justified in saying that \"Susan Lynch has an A.\"? Yes, no, or maybe? Yes\n###\nThe Timber Mountain Log Ride is a log flume water ride at Knott's Berry Farm in Buena Park, California, United States. The ride is one of the oldest log flumes in the United States and is the most popular ride at Knott's Berry Farm. The ride is one of the few log flumes that is themed in the world. Are we justified in saying that \"The Timber Mountain Log Ride has seen millions of rides.\"? Yes, no, or maybe? Maybe\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. 
Since 1937, the town is the home of the New England Music Camp. Are we justified in saying that \"The town had a census in the 21st century.\"? Yes, no, or maybe?", "doc_id": 62, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31843, 39511, 35202, 14716], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Golden Fetter is a 1917 American romance silent film directed by Edward LeSaint and written by Charles Tenney Jackson and Charles Maigne. The film stars Wallace Reid, Anita King, Tully Marshall, Guy Oliver, Walter Long and Mrs. Lewis McCord. The film was released on January 25, 1917, by Paramount Pictures. Are we justified in saying that \"The Golden Fetter is a romance film.\"? Yes, no, or maybe? Yes\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class. Are we justified in saying that \"Youth like football.\"? Yes, no, or maybe? Maybe\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter. Are we justified in saying that \"Colin Francis Weeber Isaacs was a journalist\"? Yes, no, or maybe? Yes\n###\nSpeedway Field was the original name for the airfield that was to evolve into Minneapolis-St. Paul International Airport, the twelfth busiest airport in the United States; it was also the largest hub for Northwest Airlines and the third largest hub for Delta Air Lines, Northwest's successor. Are we justified in saying that \"Minneapolis-St. Paul International Airport is larger than Speedway Field.\"? Yes, no, or maybe? Yes\n###\nThe 1980 British Grand Prix (formally the XXXIII Marlboro British Grand Prix) was a Formula One motor race held at Brands Hatch on 13 July 1980. It was the eighth round of the 1980 Formula One season. The race was held over 76 laps of the 4.207-km (2.614-mile) circuit for a total race distance of 319.73 km (198.67 miles). Are we justified in saying that \"The 1979 British Grand Prix took place on 13 July 1979\"? Yes, no, or maybe?", "doc_id": 118, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20570, 5503, 20913, 23502], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Should the World Fail to Fall Apart is the first album by the British solo artist Peter Murphy, formerly of the gothic rock band Bauhaus. The album contains Murphy's covers of Magazine's \"The Light Pours Out of Me\" and Pere Ubu's \"Final Solution.\" It was released in 1986. Are we justified in saying that \"Bauhaus is a gothic rock band.\"? Yes, no, or maybe? Yes\n###\nThe European Democrat Union (EDU) is one of the three European wings of the International Democrat Union, along with the European People's Party (EPP) and the Alliance of European Conservatives and Reformists (AECR). Its members include Christian democratic, liberal conservative, and conservative political parties. It is only a nominal sub-entity of the IDU, since it ceased its activities in 2002. Are we justified in saying that \"The europea democrat union members does not include liberal conservatives.\"? Yes, no, or maybe? No\n###\nThe S-99 (Russian: \u0421-99 ) experimental submarine was the only ship of the Soviet Project 617 submarine class (NATO reporting name: Whale class) that the Soviet Union built during the early Cold War and the only Soviet submarine which had a Walter engine fuelled by high test peroxide (HTP). Are we justified in saying that \"the cold war saw the most inventions by russia\"? Yes, no, or maybe? Maybe\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy. Are we justified in saying that \"the owner changed out players after the 6th loss\"? Yes, no, or maybe? Maybe\n###\n\"We Really Shouldn't Be Doing This\" is a song written by Jim Lauderdale, and recorded by American country music artist George Strait. It was released in September 1998 as the third and final single from his album \"One Step at a Time\". It peaked at number 4 in the United States, and number 2 in Canada. Are we justified in saying that \"Jim Lauderdale is an American country song writer.\"? Yes, no, or maybe?", "doc_id": 297, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4739, 26165, 36054, 18483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Uni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats. Are we justified in saying that \"Uni\u00f3n Deportiva Vall de Ux\u00f3 is a baseball team.\"? Yes, no, or maybe? 
No\n###\nweRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo. Are we justified in saying that \"weRead has more than two million but less than four million followers across multiple social media platforms.\"? Yes, no, or maybe? Yes\n###\nHenry Pelham Fiennes Pelham-Clinton, 4th Duke of Newcastle-under-Lyne {'1': \", '2': \", '3': \", '4': \"} (31 January 1785 \u2013 12 January 1851) was a British nobleman and politician who played a leading part in British politics in the late 1820s and early 1830s. He was styled Lord Clinton from birth until 1794 and Earl of Lincoln between 1794 and 1795. Are we justified in saying that \"Henry Pelham Fiennes Pelham-Clinton was well educated\"? Yes, no, or maybe? Maybe\n###\nManila Calling is a 1942 American black-and-white World War II propaganda war film drama from 20th Century Fox, produced by Sol M. Wurtzel, directed by Herbert I. Leeds, that stars Lloyd Nolan, Carole Landis, Cornel Wilde, James Gleason, Lester Matthews, Louis Jean Heydt, and Ted North. Are we justified in saying that \"There was no color in Manila Calling.\"? Yes, no, or maybe? Yes\n###\nGerard A. \"Gerry\" Salton (8 March 1927 in Nuremberg \u2013 28 August 1995), was a Professor of Computer Science at Cornell University. Salton was perhaps the leading computer scientist working in the field of information retrieval during his time, and \"the father of information retrieval\". His group at Cornell developed the SMART Information Retrieval System, which he initiated when he was at Harvard. Are we justified in saying that \"Gerry Salton was born in the late 1920s\"? Yes, no, or maybe?", "doc_id": 898, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [738, 25509, 27992, 43567], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Attorney () is a 2013 South Korean courtroom drama film directed and co-written by Yang Woo-suk, in his directorial debut (Yang was previously a film producer and webtoon author). With 11,375,954 tickets sold and a revenue of \u20a982.9 billion , \"The Attorney\" became the 8th best-selling Korean film of all time, and the second highest-grossing Korean film of 2013. Are we justified in saying that \"The Attorney is based off of greek mythology.\"? Yes, no, or maybe? Maybe\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"Mentha diemenica is popular in Asian cuisines \"? Yes, no, or maybe? Maybe\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. 
Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996. Are we justified in saying that \"Trainspotting was Kelly Macdonalds first movie.\"? Yes, no, or maybe? Yes\n###\nMark Miravalle (born 1959) is a professor of theology at Franciscan University of Steubenville, specializing in Mariology. He is president of \"Vox Populi Mariae Mediatrici\", a Catholic movement promoting the concepts of the Blessed Virgin Mary as Mediatrix and Co-Redemptrix. Are we justified in saying that \"Mark Miravalle was born within the last 100 years.\"? Yes, no, or maybe? Yes\n###\nPhil Lloyd is an Australian actor and scriptwriter and partner in the production company Jungleboys. He is best known for his acting role as Myles Barlow in the Australian TV series, \"Review with Myles Barlow\" and the comedy series \"At Home with Julia\", where he played Tim Mathieson, the partner of prime minister Julia Gillard. Are we justified in saying that \"\"At Home with Julia\" starred prime minister Julia Gillard\"? Yes, no, or maybe?", "doc_id": 80, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2587, 39796, 37112, 10486], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Curzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South . Are we justified in saying that \"The Club will go through another name change.\"? Yes, no, or maybe? Maybe\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"Every member of Nashville West was happy the band broke up.\"? Yes, no, or maybe? Maybe\n###\nMaurice Anthony Foley (9 October 1925 \u2013 8 February 2002) was a British Labour Party politician. He was elected as Member of Parliament (MP) for West Bromwich at a by-election in 1963, and represented the constituency until his resignation from the House of Commons in 1973. His successor in the resulting by-election was the future Speaker of the British House of Commons, Betty Boothroyd. Are we justified in saying that \"Foley was born in West Bromwich.\"? Yes, no, or maybe? Maybe\n###\nA semi-automatic pistol is a type of pistol that is semiautomatic, meaning it uses the energy of the fired cartridge to cycle the action of the firearm and advance the next available cartridge into position for firing. One cartridge is fired each time the trigger of a semi-automatic pistol is pulled; the pistol's \"disconnector\" ensures this behavior. Are we justified in saying that \"The \"disconnector\" is not always efficient which leads to jamming \"? Yes, no, or maybe? 
Maybe\n###\nStansted Mountfitchet is an English village and civil parish in Uttlesford district, Essex, near the Hertfordshire border, 35 mi north of London. According to the 2001 census it had a population of 5,533, increasing to 6,011 at the 2011 census. The village is served by Stansted Mountfitchet railway station. Are we justified in saying that \"Stansted Mountfitchet is located in mainland europe\"? Yes, no, or maybe?", "doc_id": 434, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3180, 35833, 8350, 8979], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898. Are we justified in saying that \"The club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later after some debate and moved to its current stadium, Blundell Park, in 1898.\"? Yes, no, or maybe? Maybe\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997. Are we justified in saying that \"The song is more than 3 years old\"? Yes, no, or maybe? Yes\n###\nGordon Hendrick (February 16, 1949) is a former Republican member of the Montana Legislature. He was elected to House District 14 which represents the Superior area. Due to Montana's term limits, he was ineligible to run for re-election in 2012. He was succeeded by Republican candidate Nicholas Schwaderer for the 2013 legislature cycle. Are we justified in saying that \"Republicans have the majority in the Montana Legislature\"? Yes, no, or maybe? Maybe\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"facts about film true-but it maynot be best film ever about a ketch\"? Yes, no, or maybe? Maybe\n###\nForest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier. Are we justified in saying that \"New episodes of Forest Friends do not air on TiJi.\"? 
Yes, no, or maybe?", "doc_id": 95, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26524, 39258, 4226, 6531], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Smithereens was produced in 1998.\"? Yes, no, or maybe? Yes\n###\nThe ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season. Are we justified in saying that \"The MVP is the best in the world\"? Yes, no, or maybe? Maybe\n###\nUdinese Calcio sensationally finished third in Serie A, much due to Oliver Bierhoff being in the form of his life, scoring 27 goals in a league season consisting of just 34 matches. Bierhoff, coach Alberto Zaccheroni and winger Thomas Helveg all left for Milan at the end of the season, ensuring Udinese had lots of work to do to stay at the level it was. Are we justified in saying that \"Bierhoff set a record for goals scored.\"? Yes, no, or maybe? Maybe\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label. Are we justified in saying that \"Lance King died in 1962\"? Yes, no, or maybe? No\n###\nAodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\". Are we justified in saying that \"Aodh Mac Cathmhaoil was born in 1570.\"? Yes, no, or maybe?", "doc_id": 26, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20616, 19042, 11919, 11885], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Achilles Club is a track and field club formed in 1920 by and for past and present representatives of Oxford and Cambridge Universities. Members have won 19 Olympic Gold Medals (most recently Steph Cook in the pentathlon), and held 38 World Records. One of its founding members was Evelyn Aubrey Montague, who is immortalized in the 1981 film \"Chariots of Fire\". Are we justified in saying that \"Members of the Achilles Club have won 19 Olympic Gold Medals but no silver medals.\"? Yes, no, or maybe? Maybe\n###\nPrincess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld (Coburg, 23 September 1781 \u2013 Elfenau, near Bern, Switzerland, 15 August 1860), also known as Grand Duchess Anna Feodorovna of Russia (Russian: \u0410\u043d\u043d\u0430 \u0424\u0451\u0434\u043e\u0440\u043e\u0432\u043d\u0430 ), was a German princess of the ducal house of Saxe-Coburg-Saalfeld (after 1826, the house of Saxe-Coburg-Gotha) who became the wife of Grand Duke Konstantin Pavlovich of Russia. Are we justified in saying that \"Princess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld never ever loved her husband Grand Duke Konstantin Pavlovich of Russia.\"? Yes, no, or maybe? Maybe\n###\nMichael Shane Hollis (born May 22, 1972) is a former professional American football placekicker in the National Football League. He spent most of his nine-year professional career with the Jacksonville Jaguars, kicking for the team from 1995\u20132001 and setting several team records. He then played for the Buffalo Bills and New York Giants before retiring after an injury in 2003. Are we justified in saying that \"Michael Shane Hollis is a woman.\"? Yes, no, or maybe? No\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award. Are we justified in saying that \"Rachel Brosnahan has never talked.\"? Yes, no, or maybe? No\n###\nThe church St. Ulrich is a Roman Catholic parish church in Neubau, the 7th district of Vienna, Austria. The official name is \"Pfarrkirche hl. Ulrich und Maria Trost \" (Parish church of St. Ulrich and Mary's consolation), it is also known as Ulrichskirche . The Baroque hall church with two towers was built in 1721. It is consecrated to St. Ulrich and St. Mary. Are we justified in saying that \"Pfarrkirche hl. Ulrich und Maria Trost was built in the 18 century.\"? Yes, no, or maybe?", "doc_id": 772, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41580, 29490, 1031, 32185], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler. Are we justified in saying that \"Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. He has released many albums under a lot of different labels.\"? Yes, no, or maybe? Yes\n###\nA Bhumka is the term for a traditional herbal healer in the valley of Patalkot, India. The valley is mainly home to members of the Bharia and Gond tribes, with 2,000 residents scattered between various villages and hamlets. Tribes people traditionally use herbal medicine, under the auspices of a herbal expert and holy man known as a Bhumka. Are we justified in saying that \"The people do not trust the medicine\"? Yes, no, or maybe? Maybe\n###\nJon Garth Murray (November 16, 1954 \u2013 September 29, 1995) was the second son of late controversial activist Madalyn Murray O'Hair, the first president and founder of American Atheists, Inc., in 1963. He was also the half-brother of the reverend William \"Bill\" Murray. Are we justified in saying that \"Murray's biological brother was much older than him.\"? Yes, no, or maybe? Maybe\n###\nHelvering v. Horst, 311 U.S. 112 (1940) , is an opinion of the United States Supreme Court which further developed the \u201cfruit-and-tree\u201d metaphor established in \"Lucas v. Earl\", 281 U.S. 111 (1930) . \"Horst\" is the leading case that applies the assignment of income doctrine to income from property. Are we justified in saying that \"\"Horst\" has to do with properties.\"? Yes, no, or maybe? Yes\n###\nHakea gibbosa, commonly known as hairy hakea or rock hakea, is a shrub of the family Proteaceae native to southeastern Australia. It has become an environmental weed in South Africa and New Zealand, where it had been introduced for use as a hedge plant. Are we justified in saying that \"People in America use Hakea gibbosa as a hedge plant.\"? Yes, no, or maybe?", "doc_id": 552, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2075, 12940, 2870, 35128], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bernardo Provenzano (] ; 31 January 1933 \u2013 13 July 2016) was a member of the Sicilian Mafia (\"Cosa Nostra\") and was suspected of having been the head of the Corleonesi, a Mafia faction that originated in the town of Corleone, and de facto \"capo di tutti capi\" (boss of all bosses) of the entire Sicilian Mafia until his arrest in 2006. 
Are we justified in saying that \"Bernardo Provenzano lived for more than a century.\"? Yes, no, or maybe? No\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley. Are we justified in saying that \"Bernard Taylor majored in English while in college\"? Yes, no, or maybe? Maybe\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart. Are we justified in saying that \"In 2011 Earlly Mac released an album\"? Yes, no, or maybe? Maybe\n###\nFinsbury Park TMD was a railway Traction Maintenance Depot situated in London, England. It was the first purpose built main line diesel locomotive depot opened in England and it was fully commissioned in April 1960. Finsbury Park was a steam shed under British Railways with the depot code 34G; the depot code of the diesel depot under BR was FP. The nearest railway station is Finsbury Park. Are we justified in saying that \"Finsbury Park TMD was a popular railway in London\"? Yes, no, or maybe? Maybe\n###\nNew Orleans Square is a themed land found at Disneyland Park in Anaheim, California. Based on 19th-century New Orleans, Louisiana, the roughly three-acre area was the first land to be added to Disneyland after the park's opening, at a cost of $18 million. It is exclusive to Disneyland, although a similarly themed area can be found within Adventureland at Tokyo Disneyland. Are we justified in saying that \"New Orleans Square has been closed for many years\"? Yes, no, or maybe?", "doc_id": 502, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3014, 6303, 42953, 25216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Four Cs of 21st century learning, also known as the Four Cs or 4 Cs, are four skills that have been identified by the United States-based Partnership for 21st Century Skills (P21) as the most important skills required for 21st century education: critical thinking, communication, collaboration, and creativity. Are we justified in saying that \"Communication is the most important of the Four C's.\"? Yes, no, or maybe? Maybe\n###\nClub Deportivo D\u00e9nia is a Spanish football team based in D\u00e9nia, in the autonomous community of Valencia. Founded in 1927 it plays in Divisiones Regionales de F\u00fatbol in the Valencian Community, holding home games at \"Estadio Diego Mena Cuesta\", with a capacity of 3.000 seats. Are we justified in saying that \"Valenica has more football players than Madrid.\"? Yes, no, or maybe? 
Maybe\n###\nWanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001. Are we justified in saying that \"Wanker Records writes music other than punk rock.\"? Yes, no, or maybe? No\n###\n\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\". Are we justified in saying that \"After the song's release, Murray thought about going disco.\"? Yes, no, or maybe? Maybe\n###\nBig Bad Voodoo Daddy is a contemporary swing revival band from Southern California. Their notable singles include \"Go Daddy-O\", \"You & Me & the Bottle Makes 3 Tonight (Baby)\", and \"Mr. Pinstripe Suit\". The band played at the Super Bowl XXXIII half-time show in 1999. Are we justified in saying that \"Big Bad Voodoo Daddy played at the 33rd Super Bowl.\"? Yes, no, or maybe?", "doc_id": 40, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9608, 16828, 17073, 30874], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Coldwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep. Are we justified in saying that \"Coldwater fish live shorter lives than tropical fish.\"? Yes, no, or maybe? No\n###\n\"Loose Talk\" was a 1954 song written by Freddie Hart (who also recorded it on Capitol, but didn't chart) and recorded by Carl Smith and was his last number one. It was at the top spot of the \"Billboard\" country and western chart for seven weeks and had a total of thirty-two weeks listed there. The B-side was \"More Than Anything Else in the World\": it peaked at number five in the same chart. Are we justified in saying that \"Loose Talk has been sung by Reagan\"? Yes, no, or maybe? Maybe\n###\nJara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa. Are we justified in saying that \"Less than 40000 people spoke Jara in the year 2000\"? Yes, no, or maybe? No\n###\nThe Vermont State Police (VSP) is the state police agency for the US state of Vermont. The force has jurisdiction throughout the entire state. 
The Vermont Public Safety Commission directs policy and selects the commander. The commander is Colonel Matthew Birmingham. The Vermont Public Safety Commissioner is Keith W. Flynn. There are 327 sworn state troopers. Are we justified in saying that \"The force does not have jurisdiction throughout the state\"? Yes, no, or maybe? No\n###\nDavid Scott \"Dave\" Foley (born January 4, 1963) is a Canadian actor, stand-up comedian, director, producer and writer. He is known as a co-founder of the comedy group \"The Kids in the Hall\", responsible for their eponymous sketch show and the feature-length film \"\". He played Dave Nelson in the sitcom \"NewsRadio\", voiced Flik in \"A Bug's Life\" and hosted the game show \"Celebrity Poker Showdown\". Are we justified in saying that \"David Scott probably started his acting career before January 4,1963.\"? Yes, no, or maybe?", "doc_id": 218, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20227, 24137, 40106, 906], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Finniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla. Are we justified in saying that \"Finnies is a district that includes at least eight towns.\"? Yes, no, or maybe? Yes\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer. Are we justified in saying that \"Joseph Maurice Ravel is famous in France and America.\"? Yes, no, or maybe? Maybe\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee Are we justified in saying that \"Mystery won the 1994 WFA.\"? Yes, no, or maybe? No\n###\nHenry Gabriel Murphy (1903\u20132001) was an American businessman, sportsman and Major League Baseball club owner. From June 1950 through April 1984, he was a minority stockholder in the Washington Senators/Minnesota Twins franchise of the American League. Are we justified in saying that \"Murphy was a Major League Baseball club player from 1950-1984.\"? Yes, no, or maybe? No\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. 
The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state. Are we justified in saying that \"Sidalcea oregana\" var. \"calva shows up in Spain.\"? Yes, no, or maybe?", "doc_id": 481, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39630, 13284, 29439, 40509], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Foals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"Foals have been covered by hole.\"? Yes, no, or maybe? Maybe\n###\nSeven Little Monsters is a children's picture book by American author and illustrator Maurice Sendak. \"Seven Little Monsters\" was published by Harper & Row in 1977 and served as the basis for the Canadian-Chinese television production of the same name (2000-2007). Are we justified in saying that \"Before Harper & Row published the book in 1977, it had been overlooked by other publishing companies.\"? Yes, no, or maybe? Maybe\n###\nJosef Jan\u00ed\u010dek (born 28 December 1947 in Prague, Czechoslovakia, now Czech Republic) is a Czech rock keyboardist, singer, accordion and guitar player. He was a former guitarist of The Primitives Group; from 1969 he played with The Plastic People of the Universe. He was also a member of Milan Hlavsa's band called \"P\u016flnoc\". Since 1990, he is a member of The Velvet Underground Revival Band. Are we justified in saying that \"Josef Jan\u00ed\u010dek has no arms.\"? Yes, no, or maybe? No\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland. Are we justified in saying that \"Phacelia pedicellata means the plant that lives\"? Yes, no, or maybe? Maybe\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant. Are we justified in saying that \"View from the Top was filmed in 2001\"? Yes, no, or maybe?", "doc_id": 156, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11099, 7652, 23358, 40856], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mahalakshmi (Tamil: \u0bae\u0b95\u0bbe\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf ) is an 2017 Indian-Tamil Language Family soap opera starring Kavya Shastry, Vallab, Anjali Rav and Lokesh. It replaced Nijangal and it broadcast on Sun TV on Monday to Saturday from 6 March 2017 at 12:30PM (IST). It was produced by Vision Time India Pvt Ltd and directed by Shan Karthik and M.K.Arunthavaraja. Are we justified in saying that \"Lokesh is an actor.\"? Yes, no, or maybe? Yes\n###\nVinar\u00f2s Club de F\u00fatbol is a football team based in Vinar\u00f2s, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1965, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"El Cervol\", which has a capacity of 9,600 seats. Are we justified in saying that \"Vinar\u00f2s Club de F\u00fatbol has large capacity seating\"? Yes, no, or maybe? Yes\n###\nNo. 27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville. Are we justified in saying that \"No. 27 Squadron RAAF has gold.\"? Yes, no, or maybe? Maybe\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"A Roman Catholic Church in central Bologna is now used as a large lecture hall by the University of Bologna.\"? Yes, no, or maybe? Yes\n###\nSabrina Le Beauf (born March 21, 1958) is an American actress best known for her portrayal of Sondra Huxtable on the NBC situation comedy \"The Cosby Show\". She has voiced the character Norma Bindlebeep on the Nick at Nite animated series \"Fatherhood\", a show based on Bill Cosby's book of the same name. Are we justified in saying that \"The character of Sondra Huxtable was developed for the show Fatherhood with Bill Cosby.\"? Yes, no, or maybe?", "doc_id": 611, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [300, 41454, 10167, 18033], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Departure of a Grand Old Man (Russian: \u0423\u0445\u043e\u0434 \u0432\u0435\u043b\u0438\u043a\u043e\u0433\u043e \u0441\u0442\u0430\u0440\u0446\u0430 , translit.\u00a0Ukhod velikovo startza) is a 1912 Russian silent film about the last days of author Leo Tolstoy. The film was directed by Yakov Protazanov and Elizaveta Thiman, and was actress Olga Petrova's first film. 
Are we justified in saying that \"Olga performed in many films before This one\"? Yes, no, or maybe? No\n###\nNew American Writing is a once-a-year American literary magazine emphasizing contemporary American poetry, including a range of innovative contemporary writing. The magazine is published in association with San Francisco State University. \"New American Writing\" is published by OINK! Press, a nonprofit organization. The magazine appears in early June each year. First published in 1986. Are we justified in saying that \"The magazine appears in early summer each year\"? Yes, no, or maybe? Yes\n###\nWhat Is the What: The Autobiography of Valentino Achak Deng is a 2006 novel written by Dave Eggers. It is based on the life of Valentino Achak Deng, a Sudanese child refugee who immigrated to the United States under the Lost Boys of Sudan program. It was a finalist for the National Book Award. Are we justified in saying that \"The Autobiography of Valentino Achak Deng was a finalist and awarded the National Book Award\"? Yes, no, or maybe? Maybe\n###\nPostal codes in Brunei are alphanumeric, consisting of two letters followed by four digits in the format of YZ0000, where Y denotes the district code, Z denotes the mukim code, the first two digits denote the area or village code, and the last two digits denote the nearest post office code (e.g. the postal code for Pantai Mentiri Golf Club is BU2529). Are we justified in saying that \"Postal codes in Brunei are alphanumeric but never start with a letter.\"? Yes, no, or maybe? No\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012. Are we justified in saying that \"Tooji has won multiple Norwegian Melodi Grand Prixs.\"? Yes, no, or maybe?", "doc_id": 664, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17059, 36761, 33499, 3996], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa. Are we justified in saying that \"Jara is becoming increasingly popular as a language\"? Yes, no, or maybe? No\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015. 
Are we justified in saying that \"Seven different people have served as governor of Abia State since the state was created on August 27, 1991. \"? Yes, no, or maybe? No\n###\nThe Leslie Motor Car company was a motor car company located in Detroit, Michigan in 1916. This automobile company was most likely named for the city of Leslie, Michigan. It was in operation for only one year and produced an unknown number of cars. Most cars of this era, were sold or given by their owners for scrap metal drives during World War II. Are we justified in saying that \"Most cars from 1916 were sold or given away for scrap metal drives during World War II. \"? Yes, no, or maybe? Yes\n###\nThe Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise). Are we justified in saying that \"The Magic Roundabout was known originally in French as \"Le Man\u00e8ge enchant\u00e9\"\"? Yes, no, or maybe? Yes\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home. Are we justified in saying that \"Coriolano won some of Rome's greatest victories.\"? Yes, no, or maybe?", "doc_id": 411, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3026, 1784, 41756, 21713], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Many science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list. Are we justified in saying that \"Humanity is now in the 19th century. \"? Yes, no, or maybe? No\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg. Are we justified in saying that \"The song was released on March 31, 2015. \"? Yes, no, or maybe? No\n###\nJames Matthes Talent (born October 18, 1956) is an American politician and former U.S. Senator from Missouri. He is a Republican and resided in the St. Louis area while serving in elected office. He identifies with the conservative wing of the Republican party, being outspoken on judicial appointments, abortion, flag burning, and defense issues. 
Are we justified in saying that \"Republicans are outspoken about flag burning.\"? Yes, no, or maybe? Yes\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000. Are we justified in saying that \"The 2012 Sun Life Financial Players' Championship was held in May in 2012\"? Yes, no, or maybe? No\n###\nJurassic Park is a 1993 video game based on the film and novel of the same name. It was developed and published by Ocean Software and released for the Nintendo Entertainment System (NES). Ocean also released \"Jurassic Park\" on the handheld Game Boy console. The Game Boy version is a port of the NES version. Are we justified in saying that \"Jurassic Park was a movie\"? Yes, no, or maybe?", "doc_id": 955, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29023, 5717, 14143, 9418], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "RBG Resources was a British public-limited firm based in London that was allegedly involved in a serious fraud worth close to \u00a3300 million (US$600 million). RBG Resources made $1.1 billion in sales in 2000. It was an affiliate of the United States based Allied Deals Inc., which was also involved in the fraud, and resulted in 14 people convicted or pleading guilty to related crimes. Are we justified in saying that \"RBG Resources is a privately owned firm.\"? Yes, no, or maybe? No\n###\nThe Hill Country Film Festival is a yearly film festival in Fredericksburg, Texas, United States. It was established in 2010. The executive director is Chad Matthews, and it is presented by the Hill Country Film Society, who hold free screenings at the festival and, afterward, monthly. In 2013, \"Texas Monthly\" selected it as a \"quirky, discerning\" pick. Are we justified in saying that \"The Hill Country Film Festival will cease in 2023.\"? Yes, no, or maybe? Maybe\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing. Are we justified in saying that \"The Charter Township of Lansing is in Russia.\"? Yes, no, or maybe? No\n###\nTansu \u00c7iller (] ; born 24 May 1946) is a Turkish academician, economist, and politician who served as the 22nd Prime Minister of Turkey from 1993 to 1996. She is Turkey's first and only female prime minister to date. As the leader of the True Path Party, she went on to concurrently serve as Deputy Prime Minister of Turkey and as Minister of Foreign Affairs between 1996 and 1997. 
Are we justified in saying that \"As the leader of the True Path Party, she went on to concurrently serve as Deputy Prime Minister of Turkey and as Minister of Foreign Affairs between 1996 and 1997 and came third in the 2002 general election.\"? Yes, no, or maybe? Maybe\n###\nMartin Joseph O'Malley (born January 18, 1963) is an American politician and attorney who served as the 61st Governor of Maryland from 2007 to 2015. He previously served as the Mayor of Baltimore from 1999 to 2007, and was a councilman from the Third Councilmanic District in the northeast section of the city on the Baltimore City Council from 1991 to 1999. Are we justified in saying that \"Martin Joseph O'Malley did not live in baltimore when he was the mayor\"? Yes, no, or maybe?", "doc_id": 610, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38248, 31688, 5111, 14660], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Craig Lahiff (April 23, 1947 \u2013 2 February 2014) was an Australian film director. He grew up in the Adelaide suburb of Somerton Park and studied science at Adelaide University, then trained as a systems consultant before studying arts in film at Flinders University. He began working in the film industry on crews for movies such as \"Sunday Too Far Away\" and \"The Fourth Wish\". Are we justified in saying that \"His first job was on film crews.\"? Yes, no, or maybe? Yes\n###\nForever the Moment () is a 2008 South Korean film. It is a fictionalized account of the South Korea women's handball team which competed in the 2004 Summer Olympics. The Korean title translates as \"The Best Moment in Our Lives,\" and it is believed to be the first film that revolves around the sport of handball. Are we justified in saying that \"Forever the Moment would be considered poor grammar to English speakers.\"? Yes, no, or maybe? Maybe\n###\nSupervixens is a 1975 sexploitation film by American filmmaker Russ Meyer. The cast features Meyer regulars Charles Napier, Uschi Digard, and Haji. The film also features Shari Eubank (in a dual role) in one of her only two film roles ever and Christy Hartburg in her only film role ever. Are we justified in saying that \"Supervixens was created by an american.\"? Yes, no, or maybe? Yes\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award. Are we justified in saying that \"In 2005 when Rachel was 15 years old she knew that she could have a role in the film \"The Unborn\".\"? Yes, no, or maybe? Maybe\n###\nThe 1992 Nutri-Metics Bendon Classic was a women's tennis tournament played on outdoor hard courts at the ASB Tennis Centre in Auckland in New Zealand that was part of Tier V of the 1992 WTA Tour. It was the seventh edition of the tournament and was held from 27 January February through 2 February 1992. Unseeded Robin White won the singles title. 
Are we justified in saying that \"The 1992 Nutri-Metics Bendon Classic had no loser.\"? Yes, no, or maybe?", "doc_id": 776, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35855, 16598, 4303, 16021], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forestville Commonwealth is an archaeological site and national historic district located at Earlton in Greene County, New York. The district contains seven contributing sites. It represents the remains of a utopian community built in 1826-1827 as one of three Owenite experiments in New York State. Are we justified in saying that \" Forestville Commonwealth is one of 2 Owenite experiments in New York.\"? Yes, no, or maybe? No\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978. Are we justified in saying that \"Gary Sutherland will be inducted into the MLB Hall of Fame\"? Yes, no, or maybe? Maybe\n###\nThe Australia national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the South Africa national cricket team. The tournament was won by England. Australia were captained by Syd Gregory. Are we justified in saying that \"The Australia national cricket team was captained by syd gregory\"? Yes, no, or maybe? Yes\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"End of the Past was published in the 20th century\"? Yes, no, or maybe? No\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. He is currently the quarterback coach at Garinger High School in Charlotte, NC. Are we justified in saying that \"Tory Woodbury is over 40 years\"? Yes, no, or maybe?", "doc_id": 238, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10064, 2582, 24828, 36251], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Gospel According to the Other Mary is an opera/oratorio by contemporary American composer John Adams. The world premiere took place on May 31, 2012, at the Walt Disney Concert Hall in Los Angeles with Gustavo Dudamel conducting the Los Angeles Philharmonic who also premiered the staged version on March 7, 2013, at the same venue. Are we justified in saying that \"John Adams was from Los Angeles\"? Yes, no, or maybe? Maybe\n###\nCurzon Ashton Ladies Football Club is an English women's football club affiliated with Curzon Ashton F.C.. The club were known as Oldham Curzon Ladies Football Club until June 2005. They play in the North West Women's Regional League Division One South . Are we justified in saying that \"The Club will win it's next match.\"? Yes, no, or maybe? Maybe\n###\nHumans Need Not Apply is a 2014 short Internet documentary film, directed, produced, written, and edited by CGP Grey. The film focuses on the future of the integration of automation into economics, as well as the impact of this integration to the worldwide workforce. It was released online as a YouTube video. Are we justified in saying that \"Humans Need Not Apply is an internet documentary\"? Yes, no, or maybe? Yes\n###\nThe Krylov\u2013Bogolyubov averaging method (Krylov\u2013Bogolyubov method of averaging) is a mathematical method for approximate analysis of oscillating processes in non-linear mechanics. The method is based on the averaging principle when the exact differential equation of the motion is replaced by its averaged version. The method is named after Nikolay Krylov and Nikolay Bogoliubov. Are we justified in saying that \"The Krylov\u2013Bogolyubov averaging method is rarely used\"? Yes, no, or maybe? Maybe\n###\nNativity in Black is the name of two Black Sabbath tribute albums that came out in the 1990s and 2000s. The albums were recorded with various heavy metal bands paying tribute to Black Sabbath for their influence on the heavy metal genre of rock music. Are we justified in saying that \"Nativity in Black isn't a Black Sabbath album. \"? Yes, no, or maybe?", "doc_id": 708, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40457, 12406, 1703, 3004], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1969 Indian vice-presidential election was held on 30 August 1969. Gopal Swarup Pathak won the election to become the fourth Vice-President of India. The election was occurred since the sitting VP, Varahagiri Venkata Giri resigned to contest the presidential election after the death of incumbent President Zakir Husain. Are we justified in saying that \"The 1969 Indian vice-presidential election was held in 20th of August\"? Yes, no, or maybe? No\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. 
He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel. Are we justified in saying that \"He spent 1 year with the Duke City Gladiators.\"? Yes, no, or maybe? Maybe\n###\n\"I'm Living in Two Worlds\" is a song written by Jan Crutchfield, which was recorded and released by American country artist Bonnie Guitar. The song reached number nine on the \"Billboard\" Hot Country Singles chart and number ninety-nine on the \"Billboard\" Hot 100 in early 1966. \"I'm Living in Two Worlds\" became Guitar's first Country top-ten single and her first charting single since 1959. Are we justified in saying that \"Bonnie Guitar wrote \"I'm Living in Two Worlds.\"\"? Yes, no, or maybe? No\n###\nIn the middle of 1984 a Brazilian company called Prol\u00f3gica, which made its own versions of 8 bits US computers, brought to the Brazilian market a new equipment for its personal computer series called \"CP\" (shorten of Personal Computer in Portuguese). Are we justified in saying that \"A Brazilian company called Prologica made its own computers so they must also make their own laptops\"? Yes, no, or maybe? Maybe\n###\nPrince Karl Alfred of Liechtenstein (Karl Alfred Maria Johannes Baptista Heinrich Aloys Georg Hartmann Ignatius; 16 August 1910 \u2013 17 November 1985) was a Liechtensteiner prince and brother of Franz Joseph II. He was the third child and second son of Prince Aloys of Liechtenstein and Archduchess Elisabeth Amalie of Austria. Are we justified in saying that \"Prince Karl Alfred of Liechtenstein has ten names.\"? Yes, no, or maybe?", "doc_id": 933, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41486, 18189, 9009, 10563], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shrek Forever After (also known as Shrek 4, and Shrek Forever After: The Final Chapter) is an action-adventure video game based on the film of the same name. It was released on May 18, 2010, in North America. It is the fourth and final video game based on the movie series of \"Shrek\". This was also the last Shrek game to be developed by Activison. Are we justified in saying that \"Before spring of 2010, there were more than 3 Shrek video games produced by Activision.\"? Yes, no, or maybe? Yes\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues. Are we justified in saying that \"V.S. Naipaul is a site that offers original content and aggregated news and blogs.\"? Yes, no, or maybe? 
No\n###\nStanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh. Are we justified in saying that \"Stanley Anthony Woods is a generous person\"? Yes, no, or maybe? Maybe\n###\nJacques Tourneur (] ; November 12, 1904 \u2013 December 19, 1977) was a French film director known for the classic film noir \"Out of the Past\" and a series of low-budget horror films he made for RKO Studios, including \"Cat People\", \"I Walked with a Zombie\" and \"The Leopard Man\". He is also known for directing \"Night of the Demon\", that was released by Columbia Pictures. Are we justified in saying that \"Jacques Tourneur didn't speak English.\"? Yes, no, or maybe? Maybe\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy. Are we justified in saying that \"Jacob Murphy is 24 years old. \"? Yes, no, or maybe?", "doc_id": 757, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38327, 32286, 6796, 18962], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Art of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley. Are we justified in saying that \"Art of Dying plays rock music.\"? Yes, no, or maybe? Yes\n###\nweRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo. Are we justified in saying that \"weRead started out after 2006 on Facebook\"? Yes, no, or maybe? Yes\n###\nAn experience point (often abbreviated to exp or XP) is a unit of measurement used in tabletop role-playing games (RPGs) and role-playing video games to quantify a player character's progression through the game. Experience points are generally awarded for the completion of quests, overcoming obstacles and opponents, and for successful role-playing. Are we justified in saying that \"An experience point is used in music games\"? Yes, no, or maybe? No\n###\nCaptain Scarlett is a 1953 American Technicolor Adventure film directed by Thomas Carr, that was shot in Mexico. The film is set in France following the fall of Napoleon I, and stars Richard Greene playing the title role, a Robin Hood type avenger, and Brazilian actress Leonora Amar in her final screen role. 
Are we justified in saying that \"The film was not made in France.\"? Yes, no, or maybe? Yes\n###\nVan Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor. Are we justified in saying that \"Van Cleef & Arpels is a producer of animals\"? Yes, no, or maybe?", "doc_id": 73, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35741, 6346, 5863, 34348], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2017\u201318 Puebla season is the 70th professional season of Mexico's top-flight football league. The season is split into two tournaments\u2014the Torneo Apertura and the Torneo Clausura\u2014each with identical formats and each contested by the same eighteen teams.The Club will also play Copa MX.Rafael Garc\u00eda Torres was named the club head coach on June 5, 2017, taking over for sacked coach Jos\u00e9 Cardozo. Are we justified in saying that \"The 2017\u201318 Puebla season has tournaments for 20 teams\"? Yes, no, or maybe? No\n###\nAndrea Louise Riseborough (born 20 November 1981) is an English stage and film actress. Her film appearances include \"Birdman or (The Unexpected Virtue of Ignorance)\", \"Oblivion\", \"Welcome to the Punch\", \"Disconnect\", \"Shadow Dancer\", \"W.E.\", \"Brighton Rock\", \"Made in Dagenham\", \"Never Let Me Go\", \"Happy-Go-Lucky\", and \"Venus\". Are we justified in saying that \"Andrea Riseborough is the oldest of her siblings and the only actress.\"? Yes, no, or maybe? Maybe\n###\nRafael Cede\u00f1o Hern\u00e1ndez is an imprisoned Mexican drug trafficker who was a high-level leader of La Familia Michoacana, a drug cartel based in the Mexican state of Michoac\u00e1n. He was the successor of Alberto Espinoza Barr\u00f3n, a drug trafficker who was arrested on 31 December 2008 by the Mexican authorities. Are we justified in saying that \"Alberto Espinoza Barr\u00f3n was arrested more than 208 years ago.\"? Yes, no, or maybe? No\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. Are we justified in saying that \"He plans to play soccer for the U.S one day\"? Yes, no, or maybe? Maybe\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999. Are we justified in saying that \"Daoud Abdel Sayed was born in Egypt in the 40's.\"? 
Yes, no, or maybe?", "doc_id": 271, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21699, 29565, 33543, 44182], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH). Are we justified in saying that \"The interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" will likely never be proven in our lifetimes.\"? Yes, no, or maybe? Maybe\n###\n\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals. Are we justified in saying that \"We're an American Band has no sound.\"? Yes, no, or maybe? No\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961. Are we justified in saying that \"Cuban imports to the United States were stopped in 1961, but have since resumed.\"? Yes, no, or maybe? Maybe\n###\nDwight Yoakam is an American country music singer-songwriter. Since his debut single, \"Honky Tonk Man\" in 1986, he has released 46 singles, including two \"Billboard\" Hot Country Songs number one singles, as well as 4 number ones in Canada. In addition to having two number one singles in the United States, Yoakam also has thirteen Top 10 singles on the country chart. Are we justified in saying that \"Dwight Yoakam is under 60 years old.\"? Yes, no, or maybe? Maybe\n###\n\"A Hard Day's Night\" is a song by the English rock band the Beatles. Credited to Lennon\u2013McCartney, it was written by John Lennon, with some collaboration from Paul McCartney. It was released on the film soundtrack of the same name in 1964. It was also released in the UK as a single, with \"Things We Said Today\" as its B-side. Are we justified in saying that \"A Hard Day's Night was written by both John Lennon and Paul McCartney.\"? Yes, no, or maybe?", "doc_id": 276, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9608, 17347, 35737, 14903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Coldwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep. Are we justified in saying that \"Coldwater fish live shorter lives than tropical fish.\"? Yes, no, or maybe? No\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris. Are we justified in saying that \"\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the Candian TV show \"The Vampire Diaries\" \"? Yes, no, or maybe? No\n###\nForest Hill Vineyard (also referred to as Forest Hill Wines) is an Australian winery business based in the Great Southern wine region of Western Australia. Its vineyard is west of Mount Barker, and its winery and cellar door are further south, at Denmark. Are we justified in saying that \"Forest Hill Vineyard is very expensive\"? Yes, no, or maybe? Maybe\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"Santa Lucia was commissioned by an ancient Roman Emperor. \"? Yes, no, or maybe? Maybe\n###\nAndrea M\u00f3nica Montenegro DeFreitas, known as Andrea Montenegro (born 4 March 1969 in Lima, Peru), is a Peruvian actress and model well known for her participation in various telenovelas such as Zorro, la Espada y la Rosa, Latin Lover (2001), La viuda de la Mafia (2004) and currently in Telemundo's El Clon. She has a daughter Muriel and a son Amaru. Are we justified in saying that \"Andrea M\u00f3nica Montenegro DeFreitas speaks spanish\"? Yes, no, or maybe?", "doc_id": 25, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30632, 20405, 8909, 19878], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University Church of England Academy is a secondary school located in Ellesmere Port, Cheshire. It was formed in 2009 by the merger of Ellesmere Port Specialist School of Performing Arts (located at Woodchurch Lane) and Cheshire Oaks High School (located at Stanney Lane). Are we justified in saying that \"the academy was founded one year prior to originally scheduled\"? Yes, no, or maybe? 
Maybe\n###\nSpittal is a hamlet or small village in East Lothian, Scotland, UK, on the B1377, east of Longniddry, south-south-west of Aberlady and to the west of Garleton and north of Gladsmuir. It is close to both Redhouse Castle, Gosford House and Spittal House. Are we justified in saying that \"Spittal is not a large city \"? Yes, no, or maybe? Yes\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"Three Little Sisters was released on the last day of July.\"? Yes, no, or maybe? Yes\n###\nJuan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974. Are we justified in saying that \"Juan Domingo Per\u00f3n was serving from 8 October 1895 to 1 July 1974.\"? Yes, no, or maybe? No\n###\nStudio One is an American radio\u2013television anthology drama series, created in 1947 by Canadian director Fletcher Markle, who came to CBS from the CBC. It aired under several variant titles: Studio One Summer Theatre, Studio One in Hollywood, Summer Theatre, Westinghouse Studio One and Westinghouse Summer Theatre. Are we justified in saying that \"Fletcher Markle came to the Columbia Broadcasting System from the Canadian Broadcasting Corporation.\"? Yes, no, or maybe?", "doc_id": 245, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37607, 4943, 12633, 35604], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mads Wiel Nygaard's Endowment is an annually awarded literary prize from the publishing house Aschehoug. The prize is a recognition of superior literary work. The publisher's editorial management makes the award based on their collective judgement of merit. Applications are not accepted. Are we justified in saying that \"Mads Wiel Nygaard's Endowment the hardest award to win.\"? Yes, no, or maybe? Maybe\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75. Are we justified in saying that \"Interstate 29 has lots of trucks\"? Yes, no, or maybe? Maybe\n###\nMahalakshmi (Tamil: \u0bae\u0b95\u0bbe\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf ) is an 2017 Indian-Tamil Language Family soap opera starring Kavya Shastry, Vallab, Anjali Rav and Lokesh. It replaced Nijangal and it broadcast on Sun TV on Monday to Saturday from 6 March 2017 at 12:30PM (IST). 
It was produced by Vision Time India Pvt Ltd and directed by Shan Karthik and M.K.Arunthavaraja. Are we justified in saying that \"Mahalakshmi was broadcast first on 6 March 2017\"? Yes, no, or maybe? Yes\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night. Are we justified in saying that \"Night of Terror is a 1934 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall.\"? Yes, no, or maybe? No\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th). Are we justified in saying that \"Alice Sue Claeys continued to play professional figure skating past 1993.\"? Yes, no, or maybe?", "doc_id": 140, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20134, 29715, 7715, 32593], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California. Are we justified in saying that \"Grafton writes stories.\"? Yes, no, or maybe? Yes\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions. Are we justified in saying that \"Project Gasbuggy was located in Texas.\"? Yes, no, or maybe? No\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006. Are we justified in saying that \"Oliver Francis O'Grady was born in the 19th century.\"? Yes, no, or maybe? No\n###\nGrenzschutzgruppe 9 (GSG 9) (English: Border Protection Group 9 ) is the elite Police Tactical Unit of the German Federal Police (German: \"Bundespolizei\" ). GSG 9 counterparts on the state level are the Special Deployment Commandos (German: \"Spezialeinsatzkommandos (SEK)\" ). Are we justified in saying that \"GSG 9 consists of an ethnically diverse team of police members\"? Yes, no, or maybe? Maybe\n###\nThe following are lists of the most populous fully defined incorporated settlements in Nigeria by population. 
This page consists three different tables, with different kinds of settlements; a list for \"defined cities\", listing the population, strictly within the defined city limits, a list for \"urban area\" population, and another list for the population within metropolitan areas. Are we justified in saying that \"The lists are about settlements in all of Africa.\"? Yes, no, or maybe?", "doc_id": 336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13159, 19152, 43253, 12039], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2008 Emerald Bowl, part of the 2008-09 NCAA football bowl games season, was played on December 27, 2008, at AT&T Park, the home field of the Giants in San Francisco, California. The Miami Hurricanes of the ACC were matched against the California Golden Bears (based in nearby Berkeley, California) of the Pac-10, the first appearance by either team in the seven-year history of the Emerald Bowl. Are we justified in saying that \"The 2008 Emerald Bowl was held in a stadium\"? Yes, no, or maybe? Yes\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\". Are we justified in saying that \"Crawling was released in the early 21st century\"? Yes, no, or maybe? Yes\n###\nBel Ami (; also known as \"Pretty Boy\", and \"'Pretty Man\", is a South Korean romantic comedy television series starring Jang Keun-suk, IU, Lee Jang-woo and Han Chae-young. Based on the same-titled 17-volume manhwa by Chon Kye-young, it aired on KBS2 from November 20, 2013 to January 9, 2014 on Wednesdays and Thursdays at 21:55 for 16 episodes. Are we justified in saying that \"Bel Ami was written by Chon Kye-Young.\"? Yes, no, or maybe? Yes\n###\nThe Battle of Vauchamps (14 February 1814) was the final major engagement of the Six Days Campaign of the War of the Sixth Coalition. It resulted in a part of the Grande Arm\u00e9e under Napoleon I defeating a superior Prussian and Russian force of the Army of Silesia under Field-marshal Gebhard Leberecht von Bl\u00fccher. Are we justified in saying that \"The Battle of Vauchamps was a basketball game.\"? Yes, no, or maybe? No\n###\nJean le F\u00e8vre de Saint-Remy or Jean Lefebvre de Saint-Remy (c. 1394 \u2013 June 16, 1468) born in Abbeville, was a Burgundian chronicler during the Hundred Years' War and lord (\"seigneur\") of Saint Remy, la Vacquerie, Avesnes and Morienne. He is also known by the formal title of authority \"Toison d'or\" (Golden Fleece) because he served as the King of Arms to the Order of the Golden Fleece. Are we justified in saying that \"He was born in 1300 + 52\"? 
Yes, no, or maybe?", "doc_id": 168, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38818, 43431, 44745, 11389], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "San Francisco Bay Ferry is a passenger ferry service on the San Francisco Bay, administered by the San Francisco Bay Area Water Emergency Transportation Authority (WETA). San Francisco Bay Ferry is not affiliated with Golden Gate Ferry, which provides passenger ferry service to Marin County. Are we justified in saying that \"People really enjoy having the Ferry as a means of transportation. \"? Yes, no, or maybe? Maybe\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th). Are we justified in saying that \"Claeys did not place 8th in the 92 Word Championships.\"? Yes, no, or maybe? Yes\n###\nThe MAV-1 (Maneuvering Air Vehicle) is a low observable Unmanned Air Vehicle prototype developed between ST Aerospace and Defence Science and Technology Agency for its swarming unmanned air vehicle research programme. The prototype was unveiled in Asian Aerospace 2004 and the first test flight was reported in 2005. Are we justified in saying that \"The first test flight was reported less than 10 years ago.\"? Yes, no, or maybe? No\n###\nThe roots of the Orton Ceramic Foundation date back to the establishment of the \"Standard Pyrometric Cone Company\" in 1896 by Edward J. Orton, Jr.. In 1894, he was appointed the first Chairman of the Ceramic Engineering Department at The Ohio State University, the first ceramic engineering school in the United States. Are we justified in saying that \"The roots of Orton was sprung forth before the 97th year of the 1800's\"? Yes, no, or maybe? Yes\n###\nRiver Raid is a scrolling shooter video game designed and developed by Carol Shaw, and published by Activision in 1982 for the Atari 2600 video game console. Over a million game cartridges were sold. Activision later ported the title to the Atari 5200, ColecoVision, and Intellivision game consoles, as well as to the Commodore 64, IBM PCjr, MSX, ZX Spectrum, and Atari 8-bit family home computers. Are we justified in saying that \"Over one million, but less than 3 million cartridges of River Raid were sold.\"? Yes, no, or maybe?", "doc_id": 108, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15676, 20231, 28558, 19479], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Edward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis. Are we justified in saying that \"Edward Annis was born in the 20th century\"? Yes, no, or maybe? Yes\n###\nPata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House. Are we justified in saying that \"Pata Nahi Rabb Kehdeyan Rangan Ch Raazi was released in 2011.\"? Yes, no, or maybe? No\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality. Are we justified in saying that \"Marcellite Wall is most remembered as the voice of Minnie Mouse during her 12 years working at Walt Disney Productions.\"? Yes, no, or maybe? Maybe\n###\nA sodium bicarbonate rocket (sometimes called an Alka-Seltzer rocket) is a model rocket fashioned from a 35mm film canister and propelled by the pressure of a gas, often carbon dioxide, generated from the reaction of an acid with sodium bicarbonate. Sodium bicarbonate rockets are often used in science classes to demonstrate principles of chemistry and physics. Are we justified in saying that \"Sodium bicarbonate are also as model to show to students some principle. \"? Yes, no, or maybe? Yes\n###\nTake Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\". Are we justified in saying that \"It Takes Two was Marvin Gaye's most successful song.\"? Yes, no, or maybe?", "doc_id": 187, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27786, 17411, 36401, 20434], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "American Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company. Are we justified in saying that \"American Motors Incorporated is not the same company as American Motors Corporation.\"? Yes, no, or maybe? Yes\n###\nThis is a list of United States Air Force test squadrons. It covers units considered to be part of the Air Force and serves as a break out of the comprehensive List of United States Air Force squadrons. Most units in this list are assigned to Air Force Materiel Command, however, a few reside in other Major Commands of the United States Air Force. Are we justified in saying that \"The list is classified.\"? Yes, no, or maybe? Maybe\n###\nMasquerade (Russian: \u041c\u0430\u0441\u043a\u0430\u0440\u0430\u0434 ) is a verse play written in 1835 by the Russian Romantic writer Mikhail Lermontov. The four-act play, set in 1830's St. Petersburg aristocratic society, highlights the rebellious spirit and noble mind of the protagonist, Yevgeny Arbenin. It is often compared with Shakespeare's \"Othello\" in its essential story line. Are we justified in saying that \"Mikhail Lermontov wrote Masquerade in the 1830s\"? Yes, no, or maybe? Yes\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old. Are we justified in saying that \"John Cameron Urschel is a retired professional football player\"? Yes, no, or maybe? Yes\n###\nWooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton). Are we justified in saying that \"The single \"Sick and Tired\" features Anthony Hamilton who was born in 2003\"? Yes, no, or maybe?", "doc_id": 927, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44386, 15384, 42919, 34986], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lady Pamela Carmen Louise Hicks (\"n\u00e9e\" Mountbatten; born 19 April 1929) is a British aristocrat. She is the younger daughter of the 1st Earl Mountbatten of Burma by his wife, Edwina Mountbatten. 
Through her father, Lady Pamela is a first cousin of Prince Philip, Duke of Edinburgh and a great niece of the last Empress of Russia, Alexandra Feodorovna. Are we justified in saying that \"Lady Pamela could not produce children.\"? Yes, no, or maybe? Maybe\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX. Are we justified in saying that \"One of the shows featured a dingo.\"? Yes, no, or maybe? Maybe\n###\nFriday: The Animated Series was a short-lived animated television series based on the \"Friday\" film series. The show is directed by Kevin Lofton and is co-produced and co-distributed by New Line Television, a subsidiary of New Line Cinema (the distributors of the \"Friday\" movies), MTV2, and Ice Cube's Cubevision. The series only lasted for 8 episodes. Are we justified in saying that \"Friday: The Animated Series appeared on MTV on Friday at 8. \"? Yes, no, or maybe? Maybe\n###\nMargaret Munnerlyn Mitchell (November 8, 1900 \u2013 August 16, 1949) was an American author and journalist. One novel by Mitchell was published during her lifetime, the American Civil War-era novel, \"Gone with the Wind\", for which she won the National Book Award for Most Distinguished Novel of 1936 Are we justified in saying that \"Margaret's first, middle, and last name all start with M.\"? Yes, no, or maybe? Yes\n###\nThe final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"India and West Indies both made three consecutive world cup final appearances\"? Yes, no, or maybe?", "doc_id": 550, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25513, 42079, 20395, 21864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Waterloo Corner is a rural/urban suburb approximately 22 kilometres north of Adelaide, the capital city of South Australia. Most of the land is used for agricultural purposes, including wheat, olives, grapes and tomatoes. Port Wakefield Road, and thus a lot of heavy freight, traverses the suburb. Are we justified in saying that \"Waterloo Corner is east of Adelaide\"? Yes, no, or maybe? No\n###\nPrincess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer. 
Are we justified in saying that \"The princess was related to the pretender of the throne.\"? Yes, no, or maybe? Yes\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle. Are we justified in saying that \"Boon Brewery produces beer using techniques that are traditional and classic.\"? Yes, no, or maybe? No\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant. Are we justified in saying that \"View from the Top was seen by Amy.\"? Yes, no, or maybe? Maybe\n###\nJeffrey B. Miller, AA, BS, MPA, was the former commissioner of the Pennsylvania State Police. Miller, a native of Harrisburg, Pennsylvania, served in that position from March 24, 2003, after being confirmed by the Pennsylvania State Senate, until August 8, 2008. Are we justified in saying that \"Jeffrey B. Miller was a state police commissioner from 2003 until march 24 2008\"? Yes, no, or maybe?", "doc_id": 258, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11282, 37960, 45240, 35032], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lois Cleveland Chiles (born April 15, 1947) is an American actress and former fashion model known for her roles as Dr. Holly Goodhead in the 1979 James Bond film \"Moonraker\", and as a hit and run driver in 1987's \"Creepshow 2\", as well as such films as \"The Great Gatsby\", \"The Way We Were\", \"Death on the Nile\" and \"Broadcast News\". Are we justified in saying that \"Lois will star in future films.\"? Yes, no, or maybe? Maybe\n###\nMoody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group. Are we justified in saying that \"He was the only performer on the album.\"? Yes, no, or maybe? Maybe\n###\nThe 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau. Are we justified in saying that \"The 3rd Macau International Movie Festival subsequently honored Chinese films on December 7, 2012. \"? Yes, no, or maybe? 
No\n###\nJenni Falconer (born 12 February 1976) is a Scottish radio and television presenter best known for her roles on the ITV daytime show \"This Morning\", where she is a regular travel reporter. Falconer was a regular presenter of the National Lottery Draws on BBC One. Are we justified in saying that \"She works many national lottery draws\"? Yes, no, or maybe? Maybe\n###\nMineral County Airport (FAA LID: 9S4) is a county-owned public-use airport located two nautical miles (3.7 km) southeast of the central business district of Superior, a town in Mineral County, Montana, United States. According to the FAA's National Plan of Integrated Airport Systems for 2011-2015, it is categorized as a \"general aviation\" facility. Are we justified in saying that \"Mineral County Airport is in the southern hemisphere.\"? Yes, no, or maybe?", "doc_id": 510, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4597, 37109, 23251, 2572], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise). Are we justified in saying that \"The magic roundabout was created in norway\"? Yes, no, or maybe? No\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"New Day is a 1949 book by Jamaican author V. S. Reid. It is over 24 years old\"? Yes, no, or maybe? Yes\n###\nJersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide. Are we justified in saying that \"Jersey Boys was released in 67 countries.\"? Yes, no, or maybe? Maybe\n###\nSherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983. Are we justified in saying that \"Sherwood Stewart ranked high in doubles tennis in 1983.\"? Yes, no, or maybe? Yes\n###\nKate Kendall (born 27 July 1973) is an Australian actress best known for her role in the long-running Nine Network Australian drama \"Stingers\". 
She joined the cast of long-running television soap opera \"Neighbours\" in 2013 as the established character Lauren Turner. Are we justified in saying that \"Kate Kendall's name starts with A.\"? Yes, no, or maybe?", "doc_id": 282, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18877, 5161, 2528, 4170], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Viktor Sarianidi led the Soviet invasion of Afghanistan.\"? Yes, no, or maybe? No\n###\nNogiBingo! ( stylized as NOGIBINGO!) is a Japanese television variety show starring Japanese idol girl group Nogizaka46. Ijily Okada, who is known for many AKB48 related show such as \"AKB48 Nem\u014dsu TV\", hosted the program. The show firstly aired on July 3, 2013, as part of the variety show \"Nogizaka46 x HKT48 Kanbangumi Battle!\", and it became an independent show from the second season. Are we justified in saying that \"NogiBingo! became an independent show in the third season\"? Yes, no, or maybe? No\n###\nValentine is a 2001 American slasher film directed by Jamie Blanks, and starring Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. Loosely based on the novel of the same name by Tom Savage, the film follows a group of women in San Francisco who are stalked by a man whom they tormented during their childhood. Are we justified in saying that \"Valentine is an American slasher film directed by Tom Savage.\"? Yes, no, or maybe? No\n###\nThe Chullachaki (Quechua, \"one-footed\", from \"chulla\" or \"ch'ulla\" = single, odd, unpaired, asymmetric, \"chaki\" = foot; spelling sometimes also used in Spanish) or Chullachaqui (hispanicized spelling), also known as the Shapishico, is a mythical forest creature of the Peruvian and Brazilian Amazonian jungle. Are we justified in saying that \"There is no such thing as mythical beasts in Amazon culture.\"? Yes, no, or maybe? No\n###\nAziyad\u00e9 (1879; also known as Constantinople) is a novel by French author Pierre Loti. Originally published anonymously, it was his first book, and along with \"Le Mariage de Loti\" (1880, also published anonymously), would introduce the author to the French public and quickly propel him to fame; his anonymous persona did not last long. Are we justified in saying that \"\"Le Mariage de Loti\" is the sequel to Aziyad\u00e9.\"? 
Yes, no, or maybe?", "doc_id": 158, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30189, 11014, 9265, 29828], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Savoy Brown, originally known as the Savoy Brown Blues Band, are an English blues rock band formed in Battersea, south west London in 1965. Part of the late 1960s blues rock movement, Savoy Brown primarily achieved success in the United States, where they promoted their albums with non-stop touring. Are we justified in saying that \"Savoy Brown never crossed an ocean\"? Yes, no, or maybe? No\n###\nArgonotes, the unofficial band of the Toronto Argonauts is an all volunteer organization committed to bringing quality musical entertainment and a \"traditional football atmosphere\" to all Argonauts home football games. Comprising more than 50 musicians on most game days, Argonotes is the largest musical organization associated with the CFL. Are we justified in saying that \"Argonotes is an official band.\"? Yes, no, or maybe? No\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"The film was released on the last day of July.\"? Yes, no, or maybe? Yes\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class. Are we justified in saying that \"Youth in Guatemala are blue eyed.\"? Yes, no, or maybe? Maybe\n###\nRed Earth, White Lies: Native Americans and the Myth of Scientific Fact is a book by Native American author Vine Deloria, originally published in 1995. The book's central theme is to criticize the scientific consensus which has, in his words, created \"a largely fictional scenario describing prehistoric North America\". Are we justified in saying that \"Vine Deloria wrote a fictional book about prehistoric North America.\"? Yes, no, or maybe?", "doc_id": 311, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36572, 35043, 30769, 10331], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. 
A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson. Are we justified in saying that \"\nLathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. He also owns a boat.\"? Yes, no, or maybe? Maybe\n###\nJenni Falconer (born 12 February 1976) is a Scottish radio and television presenter best known for her roles on the ITV daytime show \"This Morning\", where she is a regular travel reporter. Falconer was a regular presenter of the National Lottery Draws on BBC One. Are we justified in saying that \"She is an expert at presenting the lottery \"? Yes, no, or maybe? Maybe\n###\nSelma Diamond (August 6, 1920 \u2013 May 13, 1985) was an American comedic actress and radio and television writer, known for her high-range, raspy voice, and her portrayal of Selma Hacker on the first two seasons of the NBC television comedy series \"Night Court\". Are we justified in saying that \"Selma Diamond was married once\"? Yes, no, or maybe? Maybe\n###\nJames Brown (born February 25, 1951), commonly called \"J.B.\", is an American sportscaster known for being the host of \"The NFL Today\" on CBS Sports and \"Thursday Night Football\" on CBS Sports and NFL Network. He is also the Special Correspondent for CBS' news division. Also, he is best known as the former host of the FOX network's NFL pregame show, \"Fox NFL Sunday\" for 11 years. Are we justified in saying that \"James Brown was born in the same month as Valentine's Day.\"? Yes, no, or maybe? Yes\n###\n\"In Due Time\" is the lead single from Killswitch Engage's sixth studio album, \"Disarm the Descent\". The song is the band's first single to feature vocalist Jesse Leach since 2003's \"The Element of One\". The song charted at no. 23 on the Active rock chart and no. 26 on the Mainstream Rock chart. Are we justified in saying that \"\"The Element of One\" did far better than \"In Due Time\"\"? Yes, no, or maybe?", "doc_id": 625, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41325, 12038, 22495, 2579], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Douglas Daniel Weight (born January 21, 1971) is an American professional ice hockey coach, executive and former player. He is currently the head coach and assistant general manager for the New York Islanders. During his 19-year National Hockey League career, he played for the New York Rangers, Edmonton Oilers, Carolina Hurricanes, Anaheim Ducks, St. Louis Blues and the New York Islanders. Are we justified in saying that \"Douglas Daniel Weight is on television coaching.\"? Yes, no, or maybe? Maybe\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. 
This was the first time the Mets had a losing season since 1968. Are we justified in saying that \"1974 New York Mets season is famous\"? Yes, no, or maybe? Maybe\n###\nWKKF \"(102.3 FM)\" - branded as Kiss 102.3 - is a Top 40 (CHR) station licensed to Ballston Spa, New York and serving the Capital District and Adirondacks. The station is owned by iHeartMedia and broadcasts at 102.3 FM at 4,100 watts ERP from a transmitter in Clifton Park, New York on a tower shared with WDCD-FM and WTMM-FM. Are we justified in saying that \"iHeart Media has over 100 stations\"? Yes, no, or maybe? Maybe\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall. Are we justified in saying that \"There are many national museums in Beijing. \"? Yes, no, or maybe? Maybe\n###\nThe San Pablo Reservoir is an open cut terminal water storage reservoir owned and operated by the East Bay Municipal Utility District (EBMUD). It is located in the valley of San Pablo Creek, north of Orinda, California and south of El Sobrante and Richmond, east of the Berkeley Hills between San Pablo Ridge and Sobrante Ridge. Are we justified in saying that \"The word \"ridge\" appears four times in this context.\"? Yes, no, or maybe?", "doc_id": 774, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27922, 2862, 8661, 35656], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title. Are we justified in saying that \"The 2011 Sudirman Cup was held more than 6666 days ago.\"? Yes, no, or maybe? No\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song. Are we justified in saying that \"The word \"round\" appears multiple times in the lyrics of one of the songs on \"Shake it Up\" by The Cars.\"? Yes, no, or maybe? Yes\n###\nChristmas Bounty is a 2013 television film directed by Gil Junger. It was produced by WWE Studios and stars Francia Raisa, Mike \"The Miz\" Mizanin and Will Greenberg. It premiered on ABC Family during their 25 Days of Christmas block on November 26, 2013. Are we justified in saying that \"Junger also directed Christmas Bounty 2.\"? Yes, no, or maybe? 
Maybe\n###\nAnime Speed is a megamix compilation album of \"Dancemania\"'s \"Speed\" series, released by EMI Music Japan in 2005. The album features uptempo cover remixes of popular theme songs for various anime works such as \"Dragon Ball Z\", \"Slam Dunk\" and \"Neon Genesis Evangelion\". The successor, \"Anime Speed Newtype Edition\", was released in 2006. Are we justified in saying that \"Anime Speed and Anime Speed Newtype Edition are the only two albums to have featured anime music in 2005 and 2006.\"? Yes, no, or maybe? Maybe\n###\nHerv\u00e9 Le Tellier (born 21 April 1957) is a French writer and linguist, and a member of the international literary group Oulipo (Ouvroir de Litt\u00e9rature Potentielle, which translates roughly as \"workshop of potential literature\"). Other notable members have included Raymond Queneau, Georges Perec, Italo Calvino, Jacques Roubaud, Jean Lescure and Harry Mathews. Are we justified in saying that \"Oulipo joined writers and other literary enthusiasts on the international state.\"? Yes, no, or maybe?", "doc_id": 367, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30540, 34695, 15985, 44453], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brown University is a private Ivy League research university in Providence, Rhode Island, United States. Founded in 1764 as the College in the English Colony of Rhode Island and Providence Plantations, Brown is the seventh-oldest institution of higher education in the United States and one of the nine colonial colleges chartered before the American Revolution. Are we justified in saying that \"most students only get 2 year degrees\"? Yes, no, or maybe? Maybe\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Colorado Avalanche forward is Canadian\"? Yes, no, or maybe? Yes\n###\nLik\u00ebng\u00eb are pork sausages flavored with salt, pepper and seed of Fennel (far\u00eb mbrai), made in Piana degli Albanesi and Santa Cristina Gela. \"Lik\u00ebng\u00eb\" is the Undefinite Singular, \"Lik\u00ebnga\" is the Definite Singular and is cognate with the Italian Lucanica and the Greek Loukaniko. Are we justified in saying that \"Likenge is cognate with the Italian Lucanica.\"? Yes, no, or maybe? Yes\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6. Are we justified in saying that \"The Nydala Abbey was still in operation in 1500.\"? Yes, no, or maybe? Maybe\n###\nTadpoles is the third album by the Bonzo Dog Band. 
It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\". Are we justified in saying that \"Bonzo Dog Band's second album was released two years before Tadpoles.\"? Yes, no, or maybe?", "doc_id": 127, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30123, 29092, 2998, 16051], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dobbs Ferry is a village in Westchester County, New York. The population was 11,093 at the 2016 census. The Village of Dobbs Ferry is located in, and is a part of, the town of Greenburgh. The village ZIP code is 10522. Most of the Village falls into the boundaries of the Dobbs Ferry Union Free School District. Are we justified in saying that \"Dobbs Ferry is named after a ferry\"? Yes, no, or maybe? Maybe\n###\nMajid (born 1975) is a Danish rapper of Moroccan-Berber origin. Residing in Aved\u00f8re near Copenhagen, Denmark he was a contributor to Danish act Outlandish, which also hails from Br\u00f8ndby Strand. Majid contributed to their tours and performed as a special guest in the warm-up for their acts. Are we justified in saying that \"Majid is a main act along with Danish act Outlandish on their mutual tour.\"? Yes, no, or maybe? No\n###\nThe Nigeria U-20 men's national soccer team, also known as the Nigeria Under-20s or nicknamed the \"Flying Eagles\", is the youth team for national soccer in Nigeria. It plays a large role in the development of Nigerian soccer, and is considered to be the feeder team for the Nigeria men's national soccer team and is controlled by the Nigeria Football Federation. Are we justified in saying that \"nigerian national socce team under 20's is also known as the flying eagles\"? Yes, no, or maybe? Yes\n###\nMate Pavi\u0107 (born 4 July 1993) is a Croatian professional tennis player specialising in doubles. Mate won the 2016 US Open mixed doubles title in partnership with Laura Siegemund, and reached the 2017 Wimbledon Championships men's doubles finals partnering Oliver Marach. Are we justified in saying that \"Mate Pavi\u0107 is an American tennis player\"? Yes, no, or maybe? No\n###\nVinar\u00f2s Club de F\u00fatbol is a football team based in Vinar\u00f2s, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1965, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"El Cervol\", which has a capacity of 9,600 seats. Are we justified in saying that \"The football stadium \"El Cervol\" was built in 1965 B.C.\"? Yes, no, or maybe?", "doc_id": 642, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [135, 5318, 14429, 9134], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Emmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\". Are we justified in saying that \"The 5th Canadian Screen Awards were held four years after the 1st awards were given.\"? Yes, no, or maybe? Yes\n###\nGeorge Edward Foreman (born January 10, 1949) is an American former professional boxer who competed from 1969 to 1977, and from 1987 to 1997. Nicknamed \"Big George\", he is a two-time world heavyweight champion and an Olympic gold medalist. Outside the sport he is an ordained minister, author, and entrepreneur. Are we justified in saying that \"George Edward Foreman was an American boxer who won 2 gold medals at the olympics.\"? Yes, no, or maybe? Yes\n###\nPolarbr\u00f6d is a Swedish bread company. Their head office is in \u00c4lvsbyn in northern Sweden. Polarbr\u00f6d is Sweden's third-largest bread company. Its typical product is a soft compact bread formed into round, flat shapes. It is also noted for ready-made sandwiches produced from such bread and reindeer meat, which was introduced as a product in the 1960s under the name \"renkl\u00e4mma\". Are we justified in saying that \"Northern Sweden contains the headquarters of the company.\"? Yes, no, or maybe? Yes\n###\nMosiula Mea'alofa \"Lofa\" Tatupu (born November 15, 1982) is a former American football linebacker who played six seasons in the National Football League (NFL). He was an assistant linebackers coach with the Seattle Seahawks. He played college football for the University of Southern California (USC). Tatupu was drafted by the Seattle Seahawks in the second round of the 2005 NFL Draft. Are we justified in saying that \"Mosiula played football in Highschool\"? Yes, no, or maybe? Maybe\n###\nHercules and Love Affair is the debut studio album by American electronic music band Hercules and Love Affair, released on March 10, 2008 by DFA Records. The album was produced by Andrew Butler and Tim Goldsworthy. Andrew Raposo (of fellow DFA band Automato) and Tyler Pope (of !!!) contributed bass to the album, while Antony Hegarty co-wrote and performed vocals on select songs. Are we justified in saying that \"Hercules and Love Affair is the debut studio album by American electronic music band Hercules and Love Affair, released on in the third month of the year that equals 50.2 multiplied by 40 by DFA Records. \"? Yes, no, or maybe?", "doc_id": 994, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34586, 6334, 17764, 36971], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2015\u201316 Dartmouth Big Green men's basketball team represented Dartmouth College during the 2015\u201316 NCAA Division I men's basketball season. The Big Green, led by sixth-year head coach Paul Cormier, played their home games at Leede Arena in Hanover, New Hampshire and were members of the Ivy League. The Big Green finished the season 10\u201318, 4\u201310 in Ivy League play to finish in sixth place. Are we justified in saying that \"Paul Cormier has led several other college basketball teams throughout his career.\"? Yes, no, or maybe? Maybe\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes. Are we justified in saying that \"Hook, Line and Sinker is an American fishing television program produced in Hobart, Tasmania \"? Yes, no, or maybe? No\n###\nThe Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat. Are we justified in saying that \"Spanish Mastiff is a good guard dog\"? Yes, no, or maybe? Maybe\n###\nChristopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio. Are we justified in saying that \"Christopher Lawrence (born 24 December 1926) is a classical musician, author, and conductor. \"? Yes, no, or maybe? No\n###\nWriting Degree Zero (French: \"Le degr\u00e9 z\u00e9ro de l'\u00e9criture\" ) is a book of literary criticism by Roland Barthes. First published in 1953, it was Barthes' first full-length book and was intended, as Barthes writes in the introduction, as \"no more than an Introduction to what a History of Writing might be.\" Are we justified in saying that \"Writing Degree Zero was the first piece of literary criticism Barthes wrote. \"? Yes, no, or maybe?", "doc_id": 861, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1433, 12862, 34190, 30860], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "High Bridge is a census-designated place (CDP) in Snohomish County, Washington, United States. The population was 2,994 at the 2010 census. High Bridge includes the Echo Lake community and the former Echo Lake CDP, which was superseded by the larger High Bridge CDP in 2010. 
Are we justified in saying that \"The population of High Bridge is now over 3,000.\"? Yes, no, or maybe? Maybe\n###\nThe 2000 Singapore Challenge, also known as the 2000 Godrej Singapore Challenge for sponsorship reasons, was a One Day International cricket tournament which took place between 20\u201327 August 2000. The tournament was held in Singapore. The tournament was won by South Africa who defeated Pakistan by 93 runs by the Duckworth\u2013Lewis method. Are we justified in saying that \"The 2000 Singapore Challenge was a highly anticipated tournament\"? Yes, no, or maybe? Maybe\n###\n\"Close Every Door\" is a song from the musical \"Joseph and the Amazing Technicolor Dreamcoat\" by Tim Rice and Andrew Lloyd Webber. It is the penultimate song of the first act of the musical, sung by Joseph while imprisoned for his supposed relationship with Potiphar's wife. Along with \"Any Dream Will Do\", it is one of the most popular songs from the musical. Are we justified in saying that \"\"Close Every Door\" is a song from the musical \"Joseph and the Amazing Technicolor Dreamcoat\" by Tim Rice and Andrew Lloyd Webber. There was no music in it.\"? Yes, no, or maybe? No\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films. Are we justified in saying that \"Ileana Carusio visited the USA this year\"? Yes, no, or maybe? Maybe\n###\nAttitude City is the third studio album by the American comedy duo Ninja Sex Party. The album was released on July 17, 2015. Six tracks from the album, \"Party of Three,\" \"Dragon Slayer,\" \"Attitude City,\" \"Why I Cry,\" \"Peppermint Creams,\" and \"Road Trip\" were all released as singles on their YouTube channel prior to its release. Are we justified in saying that \"Ninja Sex Party released many videos before July 17 2015\"? Yes, no, or maybe?", "doc_id": 807, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19692, 39200, 28806, 41580], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Latin American Boom was a flourishing of literature, poetry and criticism in Latin America during the 1960s and 1970s, when writers from this region explored new ideas and came to international renown in a way that had not happened previously. Major figures of the boom include Julio Cort\u00e1zar, Gabriel Garc\u00eda M\u00e1rquez, Carlos Fuentes, Jorge Luis Borges, and Mario Vargas Llosa. Are we justified in saying that \"The Latin American Boom happened in 2002\"? Yes, no, or maybe? No\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". 
Are we justified in saying that \"Sheree was born in a public hospital. \"? Yes, no, or maybe? Maybe\n###\nFifth Harmony is the self-titled third studio album by American girl group Fifth Harmony, released on August 25, 2017, through Syco Music and Epic Records. Its lead single, \"Down\", which features rapper Gucci Mane, was released on June 2, 2017. It is the group's first album following the departure of Camila Cabello in December 2016. Are we justified in saying that \"Before September of 2017, the group Fifth Harmony made at three albums.\"? Yes, no, or maybe? Yes\n###\nDan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler. Are we justified in saying that \"Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. He has released many albums under a lot of different labels.\"? Yes, no, or maybe? Yes\n###\nHave a Little Faith is a Hallmark Hall of Fame television movie. The film debuted on ABC on November 27, 2011, as the first \"Hallmark Hall of Fame\" film broadcast since CBS cancelled the series earlier in 2011. It was the first \"Hallmark Hall of Fame\" film broadcast on ABC since 1995. Are we justified in saying that \"ABC did not broadcast any Hallmark films for 20 years over the turn of the last century.\"? Yes, no, or maybe?", "doc_id": 366, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18237, 42702, 22846, 9056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Girl from Jones Beach is a 1949 American comedy film directed by Peter Godfrey and written by I. A. L. Diamond. The film stars Ronald Reagan, Virginia Mayo, Eddie Bracken, Dona Drake, Henry Travers and Lois Wilson. The film was released by Warner Bros. on July 16, 1949. Are we justified in saying that \"The Girl from Jones Beach is an american film\"? Yes, no, or maybe? Yes\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups. Are we justified in saying that \"Quesadillas are often eaten in Mexico.\"? Yes, no, or maybe? Yes\n###\nErnest Guiraud (] ; 26 June 1837 \u2013 6 May 1892) was a French composer and music teacher born in New Orleans, Louisiana. He is best known for writing the traditional orchestral recitatives used for Bizet's opera \"Carmen\" and for Offenbach's opera \"Les contes d'Hoffmann\" (\"The Tales of Hoffmann\"). Are we justified in saying that \"His music was not popular\"? Yes, no, or maybe? 
Maybe\n###\nZina Lynna Garrison (born November 16, 1963) is a former professional tennis player from the United States. During her career, she was a women's singles runner-up at Wimbledon in 1990, a three-time Grand Slam mixed doubles champion, and a women's doubles gold medalist and singles bronze medalist at the 1988 Olympic Games. She is currently coaching Taylor Townsend. Are we justified in saying that \"Taylor Townsend is Garrison's first student.\"? Yes, no, or maybe? Maybe\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\". Are we justified in saying that \"He lived for over 90 years\"? Yes, no, or maybe?", "doc_id": 68, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13891, 10366, 34538, 20647], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Benny Bell (born Benjamin Samberg or Benjamin Zamberg, March 21, 1906 \u2013 July 6, 1999) was an American singer-songwriter who reached popularity in the 1940s, with a comeback in the 1970s. He is particularly remembered for his risqu\u00e9 but cheerfully optimistic songs. Are we justified in saying that \"Benny Bell released more songs in the 1940s than the 1970s.\"? Yes, no, or maybe? Maybe\n###\nHeresy is a comedy talk show on BBC Radio 4, created and originally hosted by David Baddiel, now hosted by Victoria Coren Mitchell. In the show, the presenter and a panel of guests commit \"heresy\" by challenging people's most deeply received opinions on a subject, in front of a studio audience. Are we justified in saying that \"Victoria Coren MItchell is the host of Heresy\"? Yes, no, or maybe? Yes\n###\nFifth Harmony is the self-titled third studio album by American girl group Fifth Harmony, released on August 25, 2017, through Syco Music and Epic Records. Its lead single, \"Down\", which features rapper Gucci Mane, was released on June 2, 2017. It is the group's first album following the departure of Camila Cabello in December 2016. Are we justified in saying that \"Fifth Harmony released two albums before the departure of Camila Cabello.\"? Yes, no, or maybe? Yes\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry. Are we justified in saying that \"A man whose name rhymes with \"luck hairy\" made rock music over six decades ago.\"? Yes, no, or maybe? Yes\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. 
Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\". Are we justified in saying that \"Zombie Juice is only known for being a rapper and a director\"? Yes, no, or maybe?", "doc_id": 208, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42196, 16287, 33612, 35564], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality. Are we justified in saying that \"Marcellite Wall was also the voice of Maggie Simpson.\"? Yes, no, or maybe? Maybe\n###\nDual-role transvestism is the formal diagnosis used by psychologists and physicians to describe people who wear clothes of the opposite sex to experience being the opposite sex temporarily, but don't have a sexual motive or want gender reassignment surgery. The International Classification of Diseases (ICD-10) list three diagnostic criteria for \"Dual-role transvestism\" (F64.1): Are we justified in saying that \"Dual-role transvestism is what Tom has.\"? Yes, no, or maybe? Maybe\n###\nSeton Catholic Preparatory High School is a college preparatory, co-educational Catholic high school in Chandler, Arizona, United States. Named after St. Elizabeth Ann Seton, the school was established in 1954 and is staffed by the Sisters of Charity of Seton Hill. Are we justified in saying that \"Seton Catholic Preparatory High School costs thousands\"? Yes, no, or maybe? Maybe\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks. Are we justified in saying that \"Cocaine costs under 10 usd\"? Yes, no, or maybe? Maybe\n###\nEl\u00ednr\u00f3s L\u00edndal is an entrepreneur in Fashion design. She established ELLA fashion label in 2008, one of the first Slow Fashion brands in the world. Elinr\u00f3s was the brands creative director and CEO. ELLA launched] it\u00b4s first fashion line in April 2011. Are we justified in saying that \"She does fashion design in the 2000's\"? Yes, no, or maybe?", "doc_id": 684, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11037, 8163, 11916, 25069], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gorgo is a 1961 British-American science fiction monster film directed by Eug\u00e8ne Louri\u00e9. The film focuses on Gorgo, an ancient large sea monster brought back to London for exploitation, and Ogra, his even larger mother, who rampages across London to search for him. The film was featured in an episode of the cult movie-mocking television show \"Mystery Science Theater 3000\". Are we justified in saying that \"The director also made another movie featuring large sea monster\"? Yes, no, or maybe? Maybe\n###\nMaris Soule (born June 19, 1939) is an American author of romance and romantic suspense novels, mysteries, and short stories. Her latest book, \"Eat Crow and Die\", is a mystery novel. Her books feature a variety of settings and situations, including the Iditarod Trail Sled Dog Race, Search and Rescue dogs, barrel racing, dressage, and a Rhodesian Ridgeback puppy. Are we justified in saying that \"Maris Soule died in 1939.\"? Yes, no, or maybe? No\n###\nA store-within-a-store, also referred to as shop-in-shop, is an agreement in which a retailer rents a part of the retail space to be used by a different company to run another, independent store. This concept was originally an idea proposed by the great philosopher and multi millionaire entrepreneur \"Joseph Westbrook\" of East Sussex, England. Are we justified in saying that \"Joseph Westbrook is from Sussex.\"? Yes, no, or maybe? Yes\n###\nThe Golden Fetter is a 1917 American romance silent film directed by Edward LeSaint and written by Charles Tenney Jackson and Charles Maigne. The film stars Wallace Reid, Anita King, Tully Marshall, Guy Oliver, Walter Long and Mrs. Lewis McCord. The film was released on January 25, 1917, by Paramount Pictures. Are we justified in saying that \"The Golden Fetter was based on a true story.\"? Yes, no, or maybe? Maybe\n###\nStormRider was a simulator ride at Tokyo DisneySea. It simulated going into a weather storm in a futuristic airplane (a \"StormRider\") to dissipate the storm. The attraction opened on September 4, 2001, in the Port Discovery land of Tokyo DisneySea. The attraction closed on May 17, 2016 and replaced by a new Finding Nemo/Finding Dory simulator ride called Nemo & Friends SeaRider. Are we justified in saying that \"The attraction opened on the 17th day of the fourth month\"? Yes, no, or maybe?", "doc_id": 199, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27263, 33185, 45029, 21707], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Requiem\" is the seventh episode in the fifth season, and the 101st overall episode, of the American crime drama television series \"NCIS\". It first aired on CBS in the United States on November 6, 2007. The episode was written by Shane Brennan and directed by Tony Wharmby. 
Are we justified in saying that \"\"Requiem\" was the 7th episode in the 5th season making it the first episode after the 100th episode and was watched by more than 20007 viewers.\"? Yes, no, or maybe? Maybe\n###\nThis Is England '86 is a 2010 British drama miniseries written by Shane Meadows and Jack Thorne, a spin-off from the 2006 film \"This Is England\". Set three years later, it focuses on the mod revival scene rather than the skinhead subculture, with the gang variously adopting an eclectic mix of clothing styles. Are we justified in saying that \"This Is England '86 had more than one writer\"? Yes, no, or maybe? Yes\n###\nFenella Kernebone is an Australian radio and television presenter, MC and interviewer, based in Sydney, with a long record working across the arts, film, music, design, architecture and culture. Her most recent hosting roles include the presenter of By Design on Radio National and The Sound Lab on Triple J. In June 2016, she was appointed Head of Curation for TEDxSydney. Are we justified in saying that \"Sydney is Fenella Kernebones daughter.\"? Yes, no, or maybe? No\n###\nMisty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue. Are we justified in saying that \"Misty Knight sold billions.\"? Yes, no, or maybe? Maybe\n###\nTim Witherspoon (born December 27, 1957) is an American former professional boxer who competed from 1979 to 2003. He is a two-time world heavyweight champion, having held the WBC title in 1984, and the WBA title in 1986. Upon winning his second world title, Witherspoon joined Floyd Patterson and Muhammad Ali as the only boxers to win multiple world heavyweight championships. Are we justified in saying that \"Tim Witherspoon was born before November\"? Yes, no, or maybe?", "doc_id": 988, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11413, 17875, 4751, 10447], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "There are 26 states of Brazil, (Portuguese: \"estados\" ) which are the federal states of Brazil, plus the Federal District which contains the capital city, Bras\u00edlia. Below are these states listed in order of the area, from Amazonas with the greatest area, to the Federal District with the least. Brazil has a total area of 8,515,767\u00a0km, and is ranked 5th in the world. Are we justified in saying that \"Brazil has a total area less than 0.8 km.\"? Yes, no, or maybe? No\n###\nHistorical period drama is a film genre in which stories are based on historical events and famous persons. Some historical dramas attempt to accurately portray a historical event or biography, to the degree that the available historical research will allow. Other historical dramas are fictionalised tales that are based on an actual person and their deeds. Are we justified in saying that \"HIstorical period dramas are expensive to make\"? Yes, no, or maybe? 
Maybe\n###\nLawrence Henry Johnson (1861 in Germany \u2013 1947) was a Minnesota Republican politician and a Speaker of the Minnesota House of Representatives. Johnson, a bridge contractor and engineer, came to Minnesota in 1884, and was elected to the Minnesota House of Representatives in 1900. He served five terms, serving as speaker from 1907 to 1909. Johnson died in 1947. Are we justified in saying that \"Lawrence Johnson was an engineer\"? Yes, no, or maybe? Yes\n###\nHideki Kamiya (\u795e\u8c37 \u82f1\u6a39 , Kamiya Hideki , born December 19, 1970) is a video game designer and director working for PlatinumGames. Kamiya was formerly employed by Capcom and Clover Studio, and founded PlatinumGames in 2006, along with other former Capcom staff. Are we justified in saying that \"Hideki Kamiya was born less than 52 weeks ago.\"? Yes, no, or maybe? No\n###\nAmdoxovir is a nucleoside reverse transcriptase inhibitor (NRTI) undergoing research for the treatment of HIV/AIDS. It was discovered by Raymond F. Schinazi (Emory University) and C.K. Chu (University of Georgia). It is being developed by RFS Pharma. Currently, it is in Phase II clinical studies. Are we justified in saying that \"Amdoxovir is available with a prescription. \"? Yes, no, or maybe?", "doc_id": 340, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15777, 3620, 12118, 34441], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II. Are we justified in saying that \"The Spanish Army would have carried the Campo-Giro before the year 1917,\"? Yes, no, or maybe? Yes\n###\nAmericana Deluxe is the second studio album by Big Bad Voodoo Daddy. This album is also sometimes called \"Big Bad Voodoo Daddy\", as the album cover prominently displays a stylized \"Big Bad Voodoo Daddy\" logo and does not feature the phrase \"Americana Deluxe\" on it. However, the liner notes and the band's website clearly show that the true title is indeed \"Americana Deluxe\". Are we justified in saying that \"Big Bad Voodoo Daddy is a heavy metal band. \"? Yes, no, or maybe? Maybe\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock. Are we justified in saying that \"a power chord and the fifth cord are the same\"? Yes, no, or maybe? Yes\n###\nUnlike a charitable foundation, a private foundation does not generally solicit funds from the public. 
And a private foundation does not have the legal requirements and reporting responsibilities of a registered, non-profit or charitable foundation. Not all foundations engage in philanthropy: some private foundations are used for estate planning purposes. Are we justified in saying that \"private foundations are never used for estate planning purposes.\"? Yes, no, or maybe? No\n###\nThe Pistol model 2000 (also \"md. 2000\") is a semi-automatic pistol designed and manufactured by RomArm via the Cugir Arsenal of Romania. The pistol, chambered in 9\u00d719mm Luger is an almost-identical copy of the Magnum Research Jericho 941 (Baby Eagle). The pistol is the standard sidearm of the Romanian Army. Are we justified in saying that \"The Romanian Army does not use the Pistol model 2000 very often.\"? Yes, no, or maybe?", "doc_id": 760, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11893, 36887, 2009, 35242], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton). Are we justified in saying that \"Wooden Leather ends with r.\"? Yes, no, or maybe? Yes\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain. Are we justified in saying that \"The Santa Cova Funicular is a cheap railway\"? Yes, no, or maybe? Maybe\n###\nPacific Novelty was a developer of coin-operated arcade video games. \"Deep Death\" was their first title, which was later licensed by Game Plan and re-released as \"Shark attack\" (1981). \"Thief\", a \"Pac-Man\" styled maze chase, was their greatest success. Are we justified in saying that \"\"Thief\", a styled maze chase, was the greatest game released that year.\"? Yes, no, or maybe? Maybe\n###\n\"Are You Sitting Comfortably?\" is a 1969 song by the progressive rock band The Moody Blues. It was written jointly by band members Justin Hayward and Ray Thomas. It was recorded and released in 1969 on the Moody Blues Album \"On the Threshold of a Dream\". Are we justified in saying that \"Ray Thomas came up with the song title \"Are you Sitting Comfortably?\"\"? Yes, no, or maybe? Maybe\n###\nJohns Creek is a city located in Fulton County in the U.S. state of Georgia. According to the 2010 U.S. Census, the population was 76,728. The city is an affluent northeastern suburb of Atlanta. In 2017 Johns Creek ranked third on the \"USA TODAY\" list of \"50 best cities to live in.\" Are we justified in saying that \"Johns Creek has a population of 92,000.\"? 
Yes, no, or maybe?", "doc_id": 600, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13657, 522, 24300, 35667], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi. Are we justified in saying that \"The movie The Godfather inspired the 1995 Indian crime film Aatank Hi Aatank directed by Dilip Shankar.\"? Yes, no, or maybe? Yes\n###\nThe 1902\u201303 Ottawa Hockey Club season was the club's 18th season of play. The club would win the CAHL championship in a playoff with the Montreal Victorias to win the Club's first Stanley Cup. For their win, the players would each be given a silver nugget. From that day forward, the club was nicknamed the \"Silver Seven.\" Are we justified in saying that \"Winners of the Stanley Cup in in 1903 were give a piece of a metal.\"? Yes, no, or maybe? Yes\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards. Are we justified in saying that \"Many universities in South Florida rely on this technology to teach their students. \"? Yes, no, or maybe? Maybe\n###\nArt of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley. Are we justified in saying that \"Art of Dying is a country band\"? Yes, no, or maybe? No\n###\nThe 2007 North Indian Ocean cyclone season was an event in the annual cycle of tropical cyclone formation. The North Indian Ocean cyclone season has no official bounds, but cyclones tend to form between April and December, with peaks in May and November. These dates conventionally delimit the period of each year when most tropical cyclones form in the northern Indian Ocean. Are we justified in saying that \"The 2007 North Indian Ocean had at least one cyclone.\"? Yes, no, or maybe?", "doc_id": 541, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25866, 8400, 3753, 39365], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The Most Valuable Player was given to more than one player.\"? Yes, no, or maybe? Yes\n###\nCommon Law Cabin (original title \"How Much Lovin' Does a Normal Couple Need?\") is a 1967 exploitation film directed by Russ Meyer. The movie features Alaina Capri and Meyer regulars Babette Bardot and Jack Moran. It was co-written by Russ Meyer and Jack Moran, and filmed on location on the Colorado River in Arizona. Other portions of the film were shot in the Coachella Valley, California. Are we justified in saying that \"Common Law was filmed in Canada\"? Yes, no, or maybe? No\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan. Are we justified in saying that \"Ed Cebula is a famous pinball machine designer. \"? Yes, no, or maybe? Maybe\n###\nThe McLaren MP4/1 (initially known as the MP4) was a Formula One racing car produced by the McLaren team. It was used during the 1981, 1982 and 1983 seasons. It was the first Formula One car to use a carbon fibre composite monocoque, a concept which is now ubiquitous. Are we justified in saying that \"A McLauren MP4/1 is very rare\"? Yes, no, or maybe? Maybe\n###\nThe Game Plan is a 2007 American family comedy film directed by Andy Fickman and written by Nichole Millard, Kathryn Price and Audrey Wells and starring Dwayne \"The Rock\" Johnson (marking the last film in which Johnson uses his ring name \"The Rock\" in billing). It follows an NFL quarterback who finds out he has an 8-year-old daughter from a previous relationship. Are we justified in saying that \"Nichole Millard once wrote a movie that starred Audrey Wells, which had a plot point involving a football player.\"? Yes, no, or maybe?", "doc_id": 619, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27846, 18198, 29033, 2466], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"For a Minute\" is a song by English recording trio M.O. It was co-written by Jess Glynne and produced by Bless Beats and Loadstar. 
The song was released by Operator Records as a digital download on 13 April 2014 in the United Kingdom, marking the trio's debut single after buzz tracks \"On Ya\", \"Hot\", \"Wait Your Turn\", \"Come Let Me Show You\", and \"Ain't Got Time\". Are we justified in saying that \"\"For a Minute\" was written by 3 people.\"? Yes, no, or maybe? Maybe\n###\nFrancis Gary Powers (August 17, 1929 \u2013 August 1, 1977) \u2013 often referred to as simply Gary Powers \u2013 was an American pilot whose Central Intelligence Agency (CIA) U-2 spy plane was shot down while flying a reconnaissance mission in Soviet Union airspace, causing the 1960 U-2 incident. Are we justified in saying that \"Gary Powers was an American pilot who was shot down\"? Yes, no, or maybe? Yes\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season. Are we justified in saying that \"The team did not work hard enough.\"? Yes, no, or maybe? Maybe\n###\nHuevos a la mexicana is a popular breakfast dish in Mexican cuisine. Finely chopped tomato, green chili pepper and onion is lightly fried in a hot skillet. Eggs are added and stirred until set. The heat is turned off and the coriander leaves are mixed in the eggs, adding salt. Refried beans is a common accompaniment. Are we justified in saying that \"A skillet is used in the recipe.\"? Yes, no, or maybe? Yes\n###\nErnest Guiraud (] ; 26 June 1837 \u2013 6 May 1892) was a French composer and music teacher born in New Orleans, Louisiana. He is best known for writing the traditional orchestral recitatives used for Bizet's opera \"Carmen\" and for Offenbach's opera \"Les contes d'Hoffmann\" (\"The Tales of Hoffmann\"). Are we justified in saying that \"Ernest Guiraud wrote his final piece in 1892.\"? Yes, no, or maybe?", "doc_id": 840, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39208, 25171, 16126, 14333], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione. Are we justified in saying that \"Tricia Wygal married one of The Flirts.\"? Yes, no, or maybe? Maybe\n###\nThe 1976 European Cup Winners' Cup Final was a football match between West Ham United of England and Anderlecht of Belgium. The final was held at Heysel Stadium in Brussels on 5 May 1976. It was the final match of the 1975\u201376 European Cup Winners' Cup tournament and the 16th European Cup Winners' Cup Final. 
Are we justified in saying that \"West Ham United of England has never played a Belgium team\"? Yes, no, or maybe? No\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels. Are we justified in saying that \"Louis Glenn Marson will join the New York Yankees in 2020\"? Yes, no, or maybe? Maybe\n###\nSpanglish is a 2004 American romantic comedy-drama film written and directed by James L. Brooks and starring Adam Sandler, T\u00e9a Leoni, Paz Vega, and Cloris Leachman. It was released in the United States on December 17, 2004 by Columbia Pictures. The film grossed $55 million worldwide on an $80 million production budget, and received mixed reviews from critics. Are we justified in saying that \"Spanglish received Best Movie of the Year.\"? Yes, no, or maybe? Maybe\n###\nMount Weeks, formerly Round Mountain, is a mountain located in Coos County, New Hampshire. Mt. Weeks is the northeasternmost of the Pliny Range of the White Mountains and the highest point within the city limits of Berlin, New Hampshire. Mount Weeks is flanked to the southwest by South Weeks, and faces Terrace Mountain to the northwest across Willard Notch. Are we justified in saying that \"Mount Weeks is not located in Berlin, Germany.\"? Yes, no, or maybe?", "doc_id": 153, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4397, 19662, 26241, 20329], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Eric Black is an American journalist. He was a longtime reporter for the Minnesota Star Tribune newspaper, and has also been a Twin Cities blogger. He is a columnist for online newspaper MinnPost, primarily writing about politics and the historical background of current issues. Are we justified in saying that \"eric black is from missouri\"? Yes, no, or maybe? Maybe\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign. Are we justified in saying that \"The 23rd Infantry are still in service.\"? Yes, no, or maybe? No\n###\nLudovic (Ludo) Coeck (25 September 1955 \u2013 9 October 1985) was a Flemish-Belgian footballer who played as left winger or central midfielder. His clubs included Berchem Sport, Anderlecht, Internazionale and Ascoli Calcio. He was capped for the Belgian national team 46 times. Are we justified in saying that \". He was capped for the Belgian national team 42 times.\n\"? Yes, no, or maybe? No\n###\nPersuasion was the planned fifth studio solo album by Adam Ant, planned for 1992-3 but never released. The album has however surfaced as bootlegs, and nowadays circulates on file sharing networks. 
This album is one of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy, a book that revealed information on the lost recordings of many famous musicians. Are we justified in saying that \"Adam Ant was touring during 1992.\"? Yes, no, or maybe? Maybe\n###\nSverre Peak ( ) is a small peak 0.5\u00a0nautical miles (0.9\u00a0km) off the north end of Pettersen Ridge in the Conrad Mountains of Queen Maud Land. Discovered and photographed by the German Antarctic Expedition, 1938-39. Mapped by Norway from air photos and surveys by the Norwegian Antarctic Expedition, 1956\u201360, and named for Sverre Pettersen, steward with Norwegian Antarctic Expedition, 1957-58. Are we justified in saying that \"A nautical mile is 1.8 kilometers.\"? Yes, no, or maybe?", "doc_id": 18, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33532, 32688, 37024, 22140], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Piazza Colonna is a piazza at the center of the Rione of Colonna in the historic heart of Rome, Italy. It is named for the marble Column of Marcus Aurelius, which has stood there since AD 193. The bronze statue of Saint Paul that crowns the column was placed in 1589, by order of Pope Sixtus V. The Roman Via Lata (now the Via del Corso) runs through the piazza's eastern end, from south to north. Are we justified in saying that \"The Piazza Colonna is named after the Column of Marcus Aurelius.\"? Yes, no, or maybe? Yes\n###\nLee Webb was the anchor of \"The 700 Club\", the flagship program of The Christian Broadcasting Network (CBN), and \"Newswatch\", a half-hour daily news program also on CBN. He was born in Pompano Beach, Florida. Since September 2013, he has served as the vice-president of broadcasting for Ligonier Ministries in Sanford, Florida. Are we justified in saying that \"Lee Webb has been in broadcasting his entire working career.\"? Yes, no, or maybe? Maybe\n###\nThe Sauber C33 is a Formula One racing car designed by Sauber to compete in the 2014 Formula One season. It was driven by Esteban Guti\u00e9rrez and Adrian Sutil, who joined the team after Nico H\u00fclkenberg returned to Force India. The C33 was designed to use Ferrari's new 1.6-litre V6 turbocharged engine, the 059/3. Are we justified in saying that \"The Sauber C33 was designed to use Ferrari's new 1.6-litre V6 \"? Yes, no, or maybe? Yes\n###\nThe Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s. Are we justified in saying that \"The album released 1994\"? Yes, no, or maybe? Yes\n###\nTanya McQueen is an American reality television personality and interior designer on TV's . She made her debut on \"Extreme Makeover\" in an October 2005 episode titled, \"The Teas Family\". 
On August 2, 2011, McQueen and fellow Extreme Makeover personality Tracy Hutson debuted the show \"Picker Sisters\" on Lifetime. Are we justified in saying that \"Tanya McQueen made her debut in Extreme Takeover.\"? Yes, no, or maybe?", "doc_id": 405, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10418, 33739, 28189, 16583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel. Are we justified in saying that \"Kasey Peters retired from American football before he turned 35 years old.\"? Yes, no, or maybe? Yes\n###\nRon & Carol Cope Stadium at Foster Field, is a football stadium located in Kearney, Nebraska, on the University of Nebraska\u2013Kearney campus. In 2005, the university named the stadium after Ron & Carol Cope, who were long-time supporters of the University of Nebraska System. The field is named after Charlie Foster, a former coach and athletic director at Nebraska\u2013Kearney. Are we justified in saying that \"Ron and Carol Cope don't like football. \"? Yes, no, or maybe? No\n###\nEdward Gibbon FRS ( ; 8 May 173716 January 1794) was an English historian, writer and Member of Parliament. His most important work, \"The History of the Decline and Fall of the Roman Empire\", was published in six volumes between 1776 and 1788 and is known for the quality and irony of its prose, its use of primary sources, and its open criticism of organised religion. Are we justified in saying that \"Edward Gibbon FRS had work that was more important than \"The History of the Decline and Fall of the Roman Empire.\"\"? Yes, no, or maybe? No\n###\nLeft Hand Spring was a well-known watering stop on the old Chisholm Trail in present-day Blaine County, Oklahoma. The spring was named for \"Left Hand\", an Arapaho chief. Jesse Chisholm died there in 1868 and is buried nearby. His grave is marked with a granite historical marker. Are we justified in saying that \"Chisholm was a man.\"? Yes, no, or maybe? Yes\n###\nRichard Church Thompson (October 8, 1957 \u2013 July 27, 2016) was an American illustrator and cartoonist best known for his syndicated comic strip \"Cul de Sac\" and the illustrated poem \"Make the Pie Higher\". He was given the Reuben Award for Outstanding Cartoonist of the Year for 2010. Are we justified in saying that \"Richard Church Thompson starts with an A.\"? Yes, no, or maybe?", "doc_id": 147, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30340, 30629, 4448, 56], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jeffrey B. Miller, AA, BS, MPA, was the former commissioner of the Pennsylvania State Police. Miller, a native of Harrisburg, Pennsylvania, served in that position from March 24, 2003, after being confirmed by the Pennsylvania State Senate, until August 8, 2008. Are we justified in saying that \"Jeffrey B. Miller tried to obtain a PhD\"? Yes, no, or maybe? Maybe\n###\nHannah Kate Whelan (born 1 July 1992) is a retired British artistic gymnast who competed at the 2008 Summer Olympics and the 2012 Summer Olympics. Whelan won three European Championships medals and four British senior national titles, and was the bronze medallist in the all-around at the 2014 Commonwealth Games. Are we justified in saying that \"Hannah won four British senior national titles.\"? Yes, no, or maybe? Yes\n###\nThe Peoria Rivermen was a professional ice hockey team in the American Hockey League. They played in Peoria, Illinois, USA at the Carver Arena. On June 14, 2013, it was announced that the team would relocate to Utica, New York after the 2012\u201313 AHL season, and be known as the Utica Comets. Are we justified in saying that \"The Peoria Rivermen had a total of 23 hockey players on it.\"? Yes, no, or maybe? Maybe\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898. Are we justified in saying that \"Grimsby Pelham Football Club is a professional football club in Cleethorpes, Lincolnshire, England.\"? Yes, no, or maybe? No\n###\nVarun Sharma is an Indian actor who made his debut in Farhan Akhtar's 2013 film production \"Fukrey\", which was a surprise hit in Bollywood. Since his appearance in \"Fukrey\", he has appeared in other comedy films, such as \"Kis Kisko Pyaar Karoon\" and \"Dilwale\" etc Are we justified in saying that \"Indian people flocked to see the movie Fukrey when it came out at the cinema\"? Yes, no, or maybe?", "doc_id": 137, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31009, 39, 532, 36864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement. Are we justified in saying that \"Princess Caroline of Gloucester had no siblings\"? Yes, no, or maybe? 
Maybe\n###\nThe Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\". Are we justified in saying that \"The Last of Us was released in the 21st century.\"? Yes, no, or maybe? Yes\n###\nHarry Spencer Davis (born 24 September 1991) is an English professional footballer, who plays as a defender for Scottish Championship side St Mirren. Davis previously played with Crewe Alexandra. Early in his career, he was loaned by Crewe to Nantwich Town, Stafford Rangers and Curzon Ashton. Are we justified in saying that \"harry davis was not an athletic person\"? Yes, no, or maybe? No\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staunton Mall has an upper level\"? Yes, no, or maybe? Maybe\n###\nThe Great American Conference (GAC) is a collegiate athletic conference of eleven schools, with headquarters located in Russellville, Arkansas. It is affiliated in the National Collegiate Athletic Association (NCAA)'s Division II level. Athletic competition began play during the 2011\u201312 school year. Member schools are located in Arkansas and Oklahoma in the South Central United States. Are we justified in saying that \"The Great American Conference is a conference containing over 11 schools\"? Yes, no, or maybe?", "doc_id": 936, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32774, 16763, 30652, 27808], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Adriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. Are we justified in saying that \"Adriano Correia Claro is the most popular player\"? Yes, no, or maybe? Maybe\n###\n\"Outro\" is a song by French electronic music artist M83, released as the final track on the group's sixth studio album, \"Hurry Up, We're Dreaming\" (2011). It is a dramatic, symphonic rock song which has evoked \"heartbreak, nostalgia, anticipation, jubilation and triumph\". Are we justified in saying that \"Outro was sung by Clinton.\"? Yes, no, or maybe? Maybe\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". 
Are we justified in saying that \"Sheree Victoria Murphy played Tricia Dingle, Eva Strong, and Dakota Davies.\"? Yes, no, or maybe? Yes\n###\nLakeshore Technical College (or LTC) is a technical and community college located in Cleveland, Wisconsin, which is centrally located between the lakeshore cities of Sheboygan and Manitowoc. It is a member of the 16 schools in the Wisconsin Technical College System. The college was originally named Lakeshore Technical Institute (LTI). Are we justified in saying that \"Even though Sheboygan and Manitowoc are in Wisconsin, Madison is not.\"? Yes, no, or maybe? Maybe\n###\nThe Korea Aerospace Industries KF-X/Indonesian Aerospace IF-X is a South Korean and Indonesian program to develop an advanced multirole fighter for the Republic of Korea Air Force (ROKAF) and Indonesian Air Force (TNI-AU), spearheaded by South Korea with Indonesia as the primary partner. It is South Korea's second fighter development program following the FA-50. Are we justified in saying that \"Prior to the KF-X/IF-X program, South Korea had not taken initiative to develop fighters\"? Yes, no, or maybe?", "doc_id": 155, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40459, 24874, 1942, 14543], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Yi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910. Are we justified in saying that \"Yi Bangja and Crown Prince Euimin were born in 1901.\"? Yes, no, or maybe? Yes\n###\nThe 2017 Congolese police decapitation attacks occurred on 23 March 2017 in the DR Congo. About 40 police officers were ambushed then decapitated. Six police officers were released. All of the surviving police spoke the Tshiluba language. The Kamwina Nsapu terrorist group attacked the police convoy. Are we justified in saying that \"The Kamwina Nsapu terrorist group used guns to attack the police.\"? Yes, no, or maybe? Maybe\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee Are we justified in saying that \"He was 52 when he released the novel\"? Yes, no, or maybe? Maybe\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"Santa Lucia showcases stunning architecture.\"? Yes, no, or maybe? 
Yes\n###\nVivekananda Degree College is the only Degree college in Ichoda Mandal which is established in 2006 and is affiliated to Kakatiya University of Telangana, India. The college has its campus at Ichoda, Adilabad. The college runs degree courses in Computer Science, Arts, Science, Commerce and Management. Are we justified in saying that \"Vivekananda Degree College was established more than 10,000 seconds ago.\"? Yes, no, or maybe?", "doc_id": 847, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36614, 13012, 42126, 45371], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic. Are we justified in saying that \"Lausche is located in the French border state of Saxony.\"? Yes, no, or maybe? No\n###\nJaron Long (born August 28, 1991) is an American professional baseball pitcher who is with the Washington Nationals organization. Prior to playing professionally, Long played college baseball for Chandler-Gilbert Community College and Ohio State University. His father, Kevin Long, is the current hitting coach of the New York Mets and former hitting coach of the New York Yankees. Are we justified in saying that \"Jaron Long will end up the the Hall of Fame for his accomplishments. \"? Yes, no, or maybe? Maybe\n###\nInteractive Investor (II) is an online trading and investment platform based in London. The group offers retail investors an investment service to manage and trade shares, funds and bonds via trading accounts, ISAs and SIPPs. Its website provides content which is intended to support investors in making the difficult and complex decisions associated with online trading and investment. Are we justified in saying that \"Interactive Investor is an UK company.\"? Yes, no, or maybe? Yes\n###\nDrifters is a British sitcom that stars Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke as three female friends who live in Leeds following their graduation from university. All three actresses had previously appeared together in \"The Inbetweeners Movie\". Four series were broadcast, between 2013 and 2016. Are we justified in saying that \"The fifth series was released in 2016.\"? Yes, no, or maybe? No\n###\nIn cooking, coddled eggs are gently or lightly cooked eggs. They can be partially cooked, mostly cooked, or hardly cooked at all (as in the eggs used to make Caesar salad dressing, which are only slightly poached for a thicker end-product). Poached eggs are eggs that, arguably, are coddled in a very specific way: they are very gently cooked, in simmering water. Are we justified in saying that \"Eggs taste very bad\"? 
Yes, no, or maybe?", "doc_id": 384, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35968, 29149, 39133, 32687], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione. Are we justified in saying that \"You & Me is the flirts best song of the 1980s\"? Yes, no, or maybe? Maybe\n###\nFloridana Beach is an unincorporated community in Brevard County, Florida, United States. It is located on a barrier island southeast of the city of Melbourne and east of the town of Grant-Valkaria. It is just south of the unincorporated community of Melbourne Shores, and north of the unincorporated community of Sunnyland Beach. Are we justified in saying that \"Sunnyland Beach and Melbourne Shores are also located on Brevard County.\"? Yes, no, or maybe? Maybe\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles. Are we justified in saying that \"Sadhu Kokila chose to direct Suntaragaali because romantic action films are his favorite thing to work on.\"? Yes, no, or maybe? Maybe\n###\nUSS \"Chicago\" (CA-136) was a \"Baltimore\"-class heavy cruiser laid down on 28 July 1943 at Philadelphia, Pennsylvania, US, by the Philadelphia Navy Yard. Launched on 20 August 1944, she was sponsored by Mrs. Edward J. Kelly, wife of the Mayor of Chicago, Illinois, and commissioned at the Philadelphia Navy Yard on 10 January 1945, Captain Richard R. Hartung, USN, in command. Are we justified in saying that \"The USS Chicago was sponsored by the wife of the Mayor of Chicago.\"? Yes, no, or maybe? Yes\n###\nWilliam Elden Bolcom (born May 26, 1938) is an American composer and pianist. He has received the Pulitzer Prize, the National Medal of Arts, a Grammy Award, the Detroit Music Award and was named 2007 Composer of the Year by Musical America. Bolcom taught composition at the University of Michigan from 1973\u20132008. He is married to mezzo-soprano Joan Morris. Are we justified in saying that \"Bolcom started teaching composition at the University of Michigan in the summer of the year preceding 1794\"? Yes, no, or maybe?", "doc_id": 791, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7733, 39168, 7756, 14182], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Serial Killers Ink is a website dedicated to selling \"murderabilia\" (collectibles related to murders, murderers or other violent crimes) and serial killer art, interviewing convicted serial killers and also serves as a meeting place for those interested or involved in the murderabilia industry. Are we justified in saying that \"Serial Killers Ink sells murder memorabilia.\"? Yes, no, or maybe? Yes\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. Are we justified in saying that \"Ra\u00fal Alberto Osella played FIFA U-21 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina.\"? Yes, no, or maybe? No\n###\nOpal Koboi is a fictional character from the Artemis Fowl series by Eoin Colfer. After the character's introduction in in the series as a supporting antagonist, Colfer again used Koboi as the main antagonist of the fourth, sixth, and eighth books in the series, giving her the status of archenemy to Artemis Fowl II. Are we justified in saying that \"Koboi and Fowl are friends in the 8th book.\"? Yes, no, or maybe? No\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region. Are we justified in saying that \"The station has a lot of phone-in shows.\"? Yes, no, or maybe? Maybe\n###\nA cardigan is a type of knitted garment that has an open front. Commonly cardigans have buttons: a garment that is tied is instead considered a robe. A more modern version of the garment has no buttons and hangs open by design. By contrast, a pullover does not open in front but must be \"pulled over\" the head to be worn. It may be machine- or hand-knitted. Are we justified in saying that \"Cardigans has buttons however it can have buttons as well no buttons.\"? Yes, no, or maybe?", "doc_id": 952, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14331, 4393, 30563, 40957], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Europrop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas. Are we justified in saying that \"EPI has many products.\"? 
Yes, no, or maybe? No\n###\nCairn Energy PLC is one of Europe's leading independent oil and gas exploration and development companies and is listed on the London Stock Exchange. Cairn has discovered and developed oil and gas reserves in a variety of locations around the world. Cairn Energy has a primary listing on the London Stock Exchange and is a constituent of the FTSE 250 Index. Are we justified in saying that \"Cairn is shrinking its operations\"? Yes, no, or maybe? Maybe\n###\nFor Screening Purposes Only is the debut album by UK dance-punk trio Test Icicles. After being released in 2005, the album was critically praised for being unique and compelling in an increasingly homogenous indie music scene. Following the group's split in February 2006, the album remains Test Icicles' only LP. Are we justified in saying that \"The album was praised as unique\"? Yes, no, or maybe? Yes\n###\nHyde, Jekyll, Me () is a 2015 South Korean television series starring Hyun Bin and Han Ji-min. It is based on Lee Choong-ho's webtoon \"Dr. Jekyll Is Mr. Hyde\" (), which gave a romantic comedy spin on the literary character. The series aired on SBS from January 21 to March 26, 2015 on Wednesdays and Thursdays at 21:55 for 20 episodes. Are we justified in saying that \"Dr. Jekyll Is Mr. Hyde is a popular south korean movie\"? Yes, no, or maybe? Maybe\n###\nFrench opera is one of Europe's most important operatic traditions, containing works by composers of the stature of Rameau, Berlioz, Bizet, Debussy, Poulenc and Messiaen. Many foreign-born composers have played a part in the French tradition as well, including Lully, Gluck, Salieri, Cherubini, Rossini, Meyerbeer, Offenbach and Verdi. Are we justified in saying that \"Rameau was from Spain\"? Yes, no, or maybe?", "doc_id": 99, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7179, 4916, 10013, 40928], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ripponlea is an inner suburb of Melbourne, Victoria, Australia, named after the adjoining Rippon Lea Estate. It is 7\u00a0km south east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, Ripponlea had a population of 1,478. Are we justified in saying that \"Melbourne is very windy.\"? Yes, no, or maybe? Maybe\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles. Are we justified in saying that \"The 1970 Swedish open was held in 1969\"? Yes, no, or maybe? No\n###\nPangani Falls Dam is a dam in Tanzania, which is part of the Pangani Hydro Systems. The dam is located at Koani in the Muheza District of the Tanga Region, about 8\u00a0km south of another power station at Hale. The Pangani falls power station has two turbines and has an installed capacity of 68 MW . 
Are we justified in saying that \"The Pangani Falls power station produces all of the electricity for the Tanga Region.\"? Yes, no, or maybe? Maybe\n###\nUna questione privata is a 1993 Italian film directed by Alberto Negrin with a screenplay based on the WWII partisan novel of the same name by Beppe Fenoglio (1963) adapted by Raffaele La Capria. The film stars the young British actor Rupert Graves as Milton, C\u00e9line Beauvallet, and Claudio Mazzenga. Are we justified in saying that \"Una questione privata was a 1993 film based on a novel about WWII.\"? Yes, no, or maybe? Yes\n###\nEdmund Quincy III ( ; 1681\u20131737) was an American merchant and judge. He was the son of Col. Edmund Quincy II (1627-1698) II and his second wife, Elizabeth Gookin. He married Dorothy Flynt and had 7 children. Four lived to adulthood, including Edmund Quincy IV and Dorothy Quincy, who was the topic of a famous poem by Oliver Wendell Holmes, Sr. Are we justified in saying that \"Edmund Quincy III married his second wife Dorothy Flynt.\"? Yes, no, or maybe?", "doc_id": 525, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15994, 14343, 12645, 16848], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph Ernst Friedrich von Forcade de Biaix didn't live to see his 70th birthday.\"? Yes, no, or maybe? Yes\n###\nHarry Brand (October 20, 1895 \u2013 February 22, 1989) was an American press agent. Described as \"the mastermind who made Shirley Temple the most famous child star in history, Betty Grable a GI Joe pinup girl and Marilyn Monroe a sex goddess,\" Brand was the head of publicity at 20th Century Fox from 1935 until 1962. Are we justified in saying that \"Shirley Temple knew Harry Brand \"? Yes, no, or maybe? Yes\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\". Are we justified in saying that \"Antonio Lewis was the highest paying member of Flatbush ZOMBIES\"? Yes, no, or maybe? Maybe\n###\nThe Zurich derby is a football match between rivals FC Z\u00fcrich and Grasshopper Club Z\u00fcrich. The two teams were founded in Z\u00fcrich, Switzerland. Grasshopper in 1886 and Z\u00fcrich in 1896. Grasshoppers are known as the club of the elite and FCZ are known as the club of the workers. The derby is unique in Switzerland as it is the only rivalry between two teams from the same city. Are we justified in saying that \"These two teams do not like each other. \"? Yes, no, or maybe? 
Maybe\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards. Are we justified in saying that \"Jake Deckard was a part of at least one R rated movie\"? Yes, no, or maybe?", "doc_id": 289, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35049, 4708, 27403, 7939], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Letter Black, formerly known as Breaking the Silence, is a Christian rock band that was formed in 2006 in Uniontown, Pennsylvania. The band consists of lead vocalist Sarah Anthony; her husband, lead guitarist and vocalist Mark Anthony; and drummer Justin Brown. Are we justified in saying that \"The Letter Black have popular music videos\"? Yes, no, or maybe? Maybe\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts. Are we justified in saying that \"Phacelia mutabilis is pink.\"? Yes, no, or maybe? Maybe\n###\nKew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge. Are we justified in saying that \"Kew Bridge station is named for the borough it is located in.\"? Yes, no, or maybe? No\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75. Are we justified in saying that \"Interstate 29 has cars with Canadian license plates on it.\"? Yes, no, or maybe? Maybe\n###\nFranco Mari (Born 23 January 1947) is an Italian actor and comedian. Better known as Rupert Sciamenna, his best known character, he is famous for his participation in television programs such as Mai dire... on Italia 1 in many sketches with Marcello Macchia. Are we justified in saying that \"Rupert Sciamenna is a fictional character.\"? Yes, no, or maybe?", "doc_id": 44, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4173, 20284, 13382, 19204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Animation Domination was an animated programming block which originally aired from May 1, 2005, until September 21, 2014, on the Fox network. The block aired on Sunday evenings through the entirety of that night's primetime schedule (unless preempted, usually by sports telecasts). Are we justified in saying that \"Animation Domination was geared to teenagers during the 2006 season.\"? Yes, no, or maybe? Maybe\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) . Are we justified in saying that \"Sabanc\u0131 University is located more than 10 km away from Istandbul's city center. \"? Yes, no, or maybe? Yes\n###\nRobert Mills Delaney, sometimes incorrectly spelled Delany (1903-1956) was an American composer. He studied with Nadia Boulanger and Arthur Honegger in Paris, and was best known for his 1928 choral symphony, John Brown's Song, based on Stephen Benet's Pulitzer Prize winning poem \"John Brown's Body\". Are we justified in saying that \"Robert Delaney was at least 52 years of age at the time of his death.\"? Yes, no, or maybe? Yes\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign. Are we justified in saying that \"23rd Infrantry was in both World Wars.\"? Yes, no, or maybe? Yes\n###\n\"Up All Night\" is an American television sitcom created by Emily Spivey that airs on NBC. The series stars Christina Applegate and Will Arnett as Regan and Chris Brinkley, a couple who struggle to balance their home lives (especially with their newborn child, Amy) and their work lives. Are we justified in saying that \"Christina Applegate and Will Arnett have starred together previously.\"? Yes, no, or maybe?", "doc_id": 789, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5039, 23406, 14293, 7912], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Peter Joseph Wysocki (October 3, 1948 \u2013 June 14, 2003) was an American football linebacker who played his entire six-year career with the Washington Redskins from 1975 to 1980 in the National Football League (NFL). Wysocki previously played four seasons in the Canadian Football League (CFL) for the Hamilton Tiger-Cats, Toronto Argonauts and Saskatchewan Roughriders. 
Are we justified in saying that \"Peter Joseph Wysocki ended up dying from CTE due to injuries sustained during his playing career.\"? Yes, no, or maybe? Maybe\n###\nGreat Balls of Fire! is a 1989 American biographical film directed by Jim McBride and starring Dennis Quaid as rockabilly pioneer Jerry Lee Lewis. Based on a biography by Myra Lewis and Murray M. Silver Jr., the screenplay is written by McBride and Jack Baran. The film is produced by Adam Fields, with executive producers credited as Michael Grais, Mark Victor, and Art Levinson. Are we justified in saying that \"Great Balls of Fire took a few years to write.\"? Yes, no, or maybe? Maybe\n###\nGlobacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide. Are we justified in saying that \"GLO started operating later than 1996.\"? Yes, no, or maybe? Yes\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Destiny can be written in Arabic.\"? Yes, no, or maybe? Yes\n###\nAleksandr Danilovich Aleksandrov (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0414\u0430\u043d\u0438\u0301\u043b\u043e\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 , alternative transliterations: \"Alexandr\" or \"Alexander\" (first name), and \"Alexandrov\" (last name)) (August 4, 1912 \u2013 July 27, 1999), was a Soviet/Russian mathematician, physicist, philosopher and mountaineer. Are we justified in saying that \"Aleksandr was an intelligent person, and also adventurous. \"? Yes, no, or maybe?", "doc_id": 750, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23706, 18945, 38146, 13493], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"The final of the 1983 Prudential Cup was a popular event.\"? Yes, no, or maybe? Maybe\n###\nMargaret Mary \"Maggie\" Nichols (born September 12, 1997, in Little Canada, Minnesota) is an American collegiate artistic gymnast for the University of Oklahoma. 
She is one of only nine NCAA gymnasts to have scored a perfect 10 on all four events, and the first to do so for Oklahoma. Are we justified in saying that \"Nichols is also an avid bowler.\"? Yes, no, or maybe? Maybe\n###\nIntermountain Healthcare is a not-for-profit healthcare system and is the largest healthcare provider in the Intermountain West. Intermountain Healthcare provides hospital and other medical services in Utah and Idaho and also offers integrated managed care under the insurance brand SelectHealth. Intermountain Healthcare is headquartered in Salt Lake City, Utah, and has some 37,000 employees. Are we justified in saying that \"Intermountain Healthcare is a not-for-profit healthcare system and is the largest healthcare provider in the Intermountain West. Intermountain Healthcare provides hospital and other medical services in Utah and Idaho and also offers integrated managed care under the insurance brand SelectHealth. Intermountain Healthcare is headquartered in Salt Lake City, Utah, and has 5 employees.\"? Yes, no, or maybe? No\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6. Are we justified in saying that \"Nydala Abbey belongs to church of England.\"? Yes, no, or maybe? No\n###\nCruel Intentions: The Musical is a jukebox musical adapted from the film \"Cruel Intentions\" by Jordan Ross and Lindsey Rosin with music direction and arrangements by Zach Spound. After two sold-out engagements in Los Angeles, the show made its New York City debut at Le Poisson Rouge in 2017. Are we justified in saying that \"Jordan Ross wrote the music for Cruel Intentions.\"? Yes, no, or maybe?", "doc_id": 169, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14357, 8223, 25392, 21436], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kate Kendall (born 27 July 1973) is an Australian actress best known for her role in the long-running Nine Network Australian drama \"Stingers\". She joined the cast of long-running television soap opera \"Neighbours\" in 2013 as the established character Lauren Turner. Are we justified in saying that \"Kate Kendall has only acted on television.\"? Yes, no, or maybe? Maybe\n###\nChristmas Bounty is a 2013 television film directed by Gil Junger. It was produced by WWE Studios and stars Francia Raisa, Mike \"The Miz\" Mizanin and Will Greenberg. It premiered on ABC Family during their 25 Days of Christmas block on November 26, 2013. Are we justified in saying that \"Christmas Bounty is one of the most famous WWE wrestling movies ever made.\"? Yes, no, or maybe? Maybe\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. 
The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Broadway Rose was released in the 1920's\"? Yes, no, or maybe? Yes\n###\nNelson is an American rock band founded by singer/songwriters Matthew and Gunnar Nelson (twin sons of Ricky Nelson and Kristin Nelson). The band achieved success during the early 1990s with their double platinum debut album \"After the Rain\", which featured the number-one hit \"(Can't Live Without Your) Love and Affection\". Are we justified in saying that \"Nelson sold albums.\"? Yes, no, or maybe? Yes\n###\nFrankenstein Castle (German: \"Burg Frankenstein\" ) is a hilltop castle in the Odenwald overlooking the city of Darmstadt in Germany. It is thought that this castle may have been an inspiration for Mary Shelley when she wrote her 1818 Gothic novel \"Frankenstein\". Are we justified in saying that \"Frankenstein Castle was scene by Clinton.\"? Yes, no, or maybe?", "doc_id": 357, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14168, 9250, 34345, 27966], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o emigrated to Japan when he was 19\"? Yes, no, or maybe? No\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75. Are we justified in saying that \"Interstate 29 is under construction at this time.\"? Yes, no, or maybe? Maybe\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. Are we justified in saying that \"He used to play soccer with his childhood friends\"? Yes, no, or maybe? Maybe\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949. Are we justified in saying that \"CUHK is a co-ed institution. \"? Yes, no, or maybe? 
Maybe\n###\nAloe ( or ), also written \"Alo\u00eb\", is a genus containing over 500 species of flowering succulent plants. The most widely known species is \"Aloe vera\", or \"true aloe\", so called because it is cultivated as the standard source of so-called \"aloe vera\" for assorted pharmaceutical purposes. Other species, such as \"Aloe ferox\", also are cultivated or harvested from the wild for similar applications. Are we justified in saying that \"Aloe is used mostly for the lips\"? Yes, no, or maybe?", "doc_id": 16, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18749, 13778, 18665, 6838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USNS \"Lone Jack\" (T-AO-161) was a type Type T2-SE-A1 tanker laid down under Maritime Commission contract (USMC number 1783) by the Sun Shipbuilding & Dry Dock Co. of Chester, Pennsylvania (hull number 450) on 11 July 1944. The ship was launched on 21 October 1944, sponsored by Mrs. Julia W. Bruwier, and delivered to Cities Service Oil Co. of New York City on 31 October 1944. Are we justified in saying that \"USNS \"Lone Jack was made in PA\"? Yes, no, or maybe? Yes\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home. Are we justified in saying that \"Coriolano: eroe senza patria was a drama film with some romantic elements\"? Yes, no, or maybe? Maybe\n###\nThe third season of \"Gossip Girl,\" an American teen drama based upon the book series by Cecily von Ziegesar. Developed for television by Josh Schwartz and Stephanie Savage. Airing on The CW from September 14, 2009 to May 17, 2010 with 22 episodes. The season premiered 2.55 million viewers and a 1.4 Adults 18-49 rating, up 14% in viewers from its season two finale. Are we justified in saying that \"Gossip Girl, is still found on tv\"? Yes, no, or maybe? Maybe\n###\nThe Probert-Price Collection is a collection of items from the Probert-Price estate, primarily hundreds of vintage dresses which belonged to Renee Probert-Price, original It girl and well-known London socialite of her time (1917-2013). Renee died in 2013 aged 96, and left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great niece and goddaughter. Are we justified in saying that \"Renee Probert-Price left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great grand daughter and her husband\"? Yes, no, or maybe? No\n###\nUdinese Calcio sensationally finished third in Serie A, much due to Oliver Bierhoff being in the form of his life, scoring 27 goals in a league season consisting of just 34 matches. Bierhoff, coach Alberto Zaccheroni and winger Thomas Helveg all left for Milan at the end of the season, ensuring Udinese had lots of work to do to stay at the level it was. 
Are we justified in saying that \"Biefhoff has won a championship.\"? Yes, no, or maybe?", "doc_id": 904, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24126, 41788, 40987, 13836], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Recently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct. Are we justified in saying that \"Several hundred mammals have gone extinct since 1500\"? Yes, no, or maybe? No\n###\nBright Lights, Big City is a rock musical with music, lyrics and book written by Scottish composer Paul Scott Goodman based on the 1984 novel by Jay McInerney. It follows a week in the life of Jamie, a successful young writer who loses himself in the chaos of 1980s New York City. The piece premiered Off-Broadway in New York City in 1999 and was revived in a small London production in 2010. Are we justified in saying that \"Goodman wrote Bright Lights, Big City based on his own experiences. \"? Yes, no, or maybe? No\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles. Are we justified in saying that \"Poor Pretty Eddie is a 1975 American film, the cast included Leslie Uggams and Shelly Winters as side characters, with Michael Christian with the starring roll.\"? Yes, no, or maybe? Maybe\n###\nAn election campaign was held ahead of a general election for the 54th Parliament of New South Wales on Saturday, 24 March 2007. The result\u2014a win for the social-democratic Australian Labor Party and its new leader Morris Iemma\u2014was widely perceived as a foregone conclusion, with opposition leader Peter Debnam conceding as much the week before the poll. Are we justified in saying that \"Australian Labor Party is a party in Japan.\"? Yes, no, or maybe? No\n###\nMosiula Mea'alofa \"Lofa\" Tatupu (born November 15, 1982) is a former American football linebacker who played six seasons in the National Football League (NFL). He was an assistant linebackers coach with the Seattle Seahawks. He played college football for the University of Southern California (USC). Tatupu was drafted by the Seattle Seahawks in the second round of the 2005 NFL Draft. Are we justified in saying that \"Tatupu retired before his 28th birthday. \"? Yes, no, or maybe?", "doc_id": 853, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13683, 18761, 16937, 27052], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Patti Clare (born 3 March 1976) is an English actress, known for playing the character of Mary Taylor in the ITV soap opera \"Coronation Street\" since 2008. She is a three-time winner of the British Soap Award for Best Comedy Performance (2011, 2013, 2016). Are we justified in saying that \"Patti Clare will receive a role in an upcoming Disney film\"? Yes, no, or maybe? Maybe\n###\nZale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990). Are we justified in saying that \"Zale Dalen is a Canadian. He is proud of his film the hounds of Notre Dame\"? Yes, no, or maybe? Maybe\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race. Are we justified in saying that \"Tomas is a cyclist from brazil\"? Yes, no, or maybe? No\n###\nFrank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ. Are we justified in saying that \"Frank is an African American author and blogger. \"? Yes, no, or maybe? Maybe\n###\nEucommia jeffersonensis is an extinct species of flowering plant in the family Eucommiaceae. It is known from a fossil fruit found in latest Eocene deposits of Oregon, United States. \"E.\u00a0jeffersonensis\" is one of five described fossil species from North America assigned to the modern genus \"Eucommia\". The other species are \"E.\u00a0constans\", \"E.\u00a0eocenica\", \"E.\u00a0montana\", and \"E.\u00a0rolandii\". Are we justified in saying that \"The letter \"E\" in E. montana stands for \"Eucommia\".\"? Yes, no, or maybe?", "doc_id": 917, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10212, 17027, 24468, 26586], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "George Montgomery (born April 26, 1962) is a retired American basketball player. He played basketball at Illinois, and was a second-round draft selection of the Portland Trail Blazers in the 1985 NBA Draft, though he never played in the NBA. He is the biological father of Warriors center JaVale McGee, but did not raise his son. Are we justified in saying that \"George Montgomery died in Alabama.\"? Yes, no, or maybe? 
Maybe\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"\"I'm So Sorry\" is the 2nd promotional single from the album Smoke + Mirrors\"? Yes, no, or maybe? Yes\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"I'm So Sorry is an epic ballad.\"? Yes, no, or maybe? Maybe\n###\nGirilal Jain (1924 \u2013 19 July 1993), was an Indian journalist. He served as the editor of The Times of India from 1978 till 1988. He was sympathetic to Hindu nationalism and authored books on the subject, the best known of which, \"The Hindu Phenomenon\", was published posthumously. The Government of India awarded him the civilian honour of the Padma Bhushan in 1989. Are we justified in saying that \"Girilal Jain was born in1924.\"? Yes, no, or maybe? Yes\n###\nLakeshore Technical College (or LTC) is a technical and community college located in Cleveland, Wisconsin, which is centrally located between the lakeshore cities of Sheboygan and Manitowoc. It is a member of the 16 schools in the Wisconsin Technical College System. The college was originally named Lakeshore Technical Institute (LTI). Are we justified in saying that \"It is the only technical college in wisconsin\"? Yes, no, or maybe?", "doc_id": 895, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27771, 2144, 39845, 28012], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Kingfisher Ultra Indian Derby, or simply the Indian Derby, is an Indian annual Thoroughbred horse race. It is a 2,400-metre race held on the first Sunday of February on the Mahalaxmi Racecourse in Mumbai and is one of the premier sporting activities in the city. Are we justified in saying that \"Kingfisher Ultra Indian Derby is named after Kingfisher beer.\"? Yes, no, or maybe? Maybe\n###\nWake Up, Ron Burgundy: The Lost Movie (also known as Anchorman: The Adventure Continues) is the 2004 counterpart film to the film \"\", which was also released in the same year. Directed by Adam McKay and written by McKay and Will Ferrell, it stars Ferrell, Christina Applegate, David Koechner, Steve Carell, and Paul Rudd. Are we justified in saying that \"Steve Carell wanted to play the main character\"? Yes, no, or maybe? 
Maybe\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\". Are we justified in saying that \"The students who write the journal are heavily involved in politics\"? Yes, no, or maybe? Maybe\n###\nThe Tsavo Man-Eaters were a pair of man-eating Tsavo lions responsible for the deaths of a number of construction workers on the Kenya-Uganda Railway from March through December 1898. The significance of this pair of lions was their unusual behavior, such as the number of men killed and the manner of the attacks. Are we justified in saying that \"The Tsavo Man-Eaters were a pair of man-eating Tsavo lions responsible for the deaths of a number of construction workers on the Kenya-Uganda Railway from March through December of the 98th year in the eighteenth century.\"? Yes, no, or maybe? No\n###\nNo Said Date is the debut studio album by American rapper and Wu-Tang Clan member Masta Killa. The album was released on June 1, 2004, by Nature Sounds. The album features guest appearances from Raekwon, Ghostface Killah, Streetlife, Prodigal Sunn, Killah Priest, Method Man, Ol' Dirty Bastard, Allah Real, Inspectah Deck and GZA. Are we justified in saying that \"Wu-Tang Clan, as a rap group, released the album No Said Date in June 2004.\"? Yes, no, or maybe?", "doc_id": 650, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18885, 29549, 35949, 23824], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House. Are we justified in saying that \"Kapil Batra Films Production House was initially reluctant to release the film because of its long title. \"? Yes, no, or maybe? Maybe\n###\nUpper Grosvenor Street is a historic street in Mayfair, London, United Kingdom. It runs from the Grosvenor House Hotel off Park Lane to the Embassy of the United States off Grosvenor Square. The Embassy of Monaco is located at number 7. Odey Asset Management, a hedge fund run by Crispin Odey, is located at number 12. Are we justified in saying that \"Upper Grosvenor Street has a z.\"? Yes, no, or maybe? No\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. 
Are we justified in saying that \"Adriano Correia Claro likes baseball\"? Yes, no, or maybe? Maybe\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups. Are we justified in saying that \"Mohamed Izzadeen Mohamed Naufer was born in the morning hours of January 17th, 1981.\"? Yes, no, or maybe? Maybe\n###\nO lieb, so lang du lieben kannst is a poem written by Ferdinand Freiligrath, a 19th-century German writer. In 1847, Hungarian composer Franz Liszt set the poem to music (soprano voice and piano), and eventually adapted it into his famous Liebestr\u00e4ume No. 3. The work is one of Liszt's most famous and poignant. \"Liebestr\u00e4ume\" in German means \"Dreams of Love\". Are we justified in saying that \"Dreams of Love showcased more than one musical number\"? Yes, no, or maybe?", "doc_id": 414, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36703, 41534, 14881, 4948], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Faer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting. Are we justified in saying that \"Dungeons and dragons is very cool these days\"? Yes, no, or maybe? Maybe\n###\nThe 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau. Are we justified in saying that \"The 3rd Macau International Movie Festival ceremony has an E in it.\"? Yes, no, or maybe? Yes\n###\nClear Hearts Grey Flowers is the second full-length and final album by Jack Off Jill. Produced by Chris Vrenna of Nine Inch Nails/Tweaker, it was released in July 2000 on the now-defunct label Risk Records. After \"Clear Hearts, Grey Flowers\" the band formally split up and moved on to establish other projects. Are we justified in saying that \"Jack Off Jill produced no albums after July 2000.\"? Yes, no, or maybe? Yes\n###\nLex Talionis Fraternitas, Inc. Sodalitas Ducum Futurorum is an exclusive fraternal organization of Filipino jurists, legal practitioners and law students founded on September 29, 1969 at the San Beda College of Law. A chapter in the Ateneo de Davao University School of Law was established in 1974. In 1983, the Securities and Exchange Commission granted the incorporation of the fraternity. 
Are we justified in saying that \"Lex Talionis Fraternitas' existence dates back to the 1960s.\"? Yes, no, or maybe? Yes\n###\nThe Hyundai Genesis Coup\u00e9 is a rear-wheel drive sports coupe from Hyundai Motor Company, released on October 13, 2008 for the Korean market. It is Hyundai's first rear-wheel drive sports coupe, and shares its basic platform with the Hyundai Genesis luxury sedan. Are we justified in saying that \"The Hyundai Genesis Coup\u00e9 was released to America.\"? Yes, no, or maybe?", "doc_id": 164, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39757, 14323, 14358, 25349], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places. Are we justified in saying that \"Doing well in advertising will get your home in the U.S. National Register of Historic Places.\"? Yes, no, or maybe? Maybe\n###\nCoraz\u00f3n Valiente (\"Fearless Heart\"), originally known as \"Ca\u00eddas del Cielo\", is a Spanish-language telenovela produced by United States-based television network Telemundo Studios, Miami, featuring an ensemble cast. Adriana Fonseca, Ximena Duque, Jos\u00e9 Luis Res\u00e9ndez and Fabi\u00e1n R\u00edos starred as the main protagonists, with Aylin Mujica and Manuel Landeta starred as the main antagonists. Are we justified in saying that \"The Caidas del Cielo cast had some English born actors.\"? Yes, no, or maybe? Maybe\n###\nD.A.R.Y.L. is a 1985 American science fiction film written by David Ambrose, Allan Scott and Jeffrey Ellis. It was directed by Simon Wincer and stars Barret Oliver, Mary Beth Hurt, Michael McKean, Danny Corkill, and Josef Sommer. The original music score was composed by Marvin Hamlisch. Are we justified in saying that \"D.A.R.Y.L. is a 90's American science fiction film written by David Ambros, Allan Scott and Jeffrey Ellis.\"? Yes, no, or maybe? No\n###\nDemoniac were a heavy metal band from New Zealand formed in Wellington in 1993 by singer and bass player Lindsay Dawson, guitarist Sam Totman and Drummer Steve Francis. They later moved to London, UK. Three of the members went on to form DragonForce. Their rather unusual musical style has often been labeled as \"blackened power metal\". Are we justified in saying that \"Demoniac were formed over 10 years ago\"? Yes, no, or maybe? Yes\n###\nNate Albert (born 1970) is an American music executive, songwriter, producer and guitar player. He is currently the Executive Vice President of A&R at Capitol Records a division of Universal Music Group. He was formerly Senior Vice President of A&R at Republic Records, where he worked with such artists as The Weeknd, Florence & the Machine, Phantogram and the Lonely Island. Are we justified in saying that \"Nate Albert sings in Phantogram\"? 
Yes, no, or maybe?", "doc_id": 915, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19207, 27968, 41112, 1052], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Metal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed. Are we justified in saying that \"The PlayStation is still used today.\"? Yes, no, or maybe? Maybe\n###\nHard Landing is a 2004 thriller novel by British author Stephen Leather. Published in 2004 by Hodder & Stoughton, it is the first book in the Dan \u2018Spider\u2019 Shepherd series. \"Hard Landing\" is an international bestseller and is available in ebook and paperback. Are we justified in saying that \"Hard Landing was originally written as a comedy. \"? Yes, no, or maybe? Maybe\n###\nClub Deportivo D\u00e9nia is a Spanish football team based in D\u00e9nia, in the autonomous community of Valencia. Founded in 1927 it plays in Divisiones Regionales de F\u00fatbol in the Valencian Community, holding home games at \"Estadio Diego Mena Cuesta\", with a capacity of 3.000 seats. Are we justified in saying that \"Club Deportivo Denia was founded in 1927 its stadium holds 3,000 balls.\"? Yes, no, or maybe? Maybe\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced. Are we justified in saying that \"Splice is a horror scifi movie that was filmed in Paris.\"? Yes, no, or maybe? Maybe\n###\nVinar\u00f2s Club de F\u00fatbol is a football team based in Vinar\u00f2s, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1965, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"El Cervol\", which has a capacity of 9,600 seats. Are we justified in saying that \"Vinar\u00f2s Club de F\u00fatbol was founded in 1965 and was speculated to not have been made in the Valencian Community, but another community.\"? Yes, no, or maybe?", "doc_id": 769, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11215, 4951, 26430, 12892], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "My Famous Family is a British television programme on genealogy, co-hosted by Bill Oddie and Guy de la B\u00e9doy\u00e8re. Each episode shows an ordinary member of the public with a famous ancestor: Queen Victoria, Florence Nightingale, George Stephenson, Lawrence of Arabia, or the Duke of Wellington. Are we justified in saying that \"Guy de la B\u00e9doy\u00e8re hosts My Famous Family by himself.\"? Yes, no, or maybe? No\n###\nAndr\u00e9 Olbrich (born 3 May 1967, in D\u00fcsseldorf, Germany) is a German guitarist, composer and backing vocalist, most known as the co-founder and lead guitarist of power metal band Blind Guardian, in which he serves as one of the main composers with other co-founder Hansi K\u00fcrsch. Are we justified in saying that \"Hansi K\u00fcrsch is a German guitarist, composer and backing vocalist, most known as the co-founder and lead guitarist of power metal band Blind Guardian\"? Yes, no, or maybe? No\n###\nThe American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada. Are we justified in saying that \"The ACT is based in the northeastern United States, Quebec, and Canada.\"? Yes, no, or maybe? Yes\n###\nCaddyshack is a 1980 American comedy film directed by Harold Ramis and written by Brian Doyle-Murray, Ramis and Douglas Kenney. It stars Michael O'Keefe, Chevy Chase, Rodney Dangerfield, Ted Knight, and Bill Murray. Doyle-Murray also has a supporting role. The film was later dedicated to producer Douglas Kenney, who died shortly after the film's release. Are we justified in saying that \"Caddyshak, a comedy film was released in 1980 right after the death of 1/3 writers.\"? Yes, no, or maybe? Yes\n###\nJohn Davison Rockefeller III (March 21, 1906 \u2013 July 10, 1978) was a philanthropist and third-generation member of the prominent Rockefeller family. He was the eldest son of philanthropists John D. Rockefeller Jr. and Abby Aldrich Rockefeller. His siblings were Abby, Nelson, Laurance, Winthrop, and David. Are we justified in saying that \"John Rockefeller was seventy-two when he passed away. \"? Yes, no, or maybe?", "doc_id": 562, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41381, 35027, 5972, 36672], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Oleg Smirnov (born April 8, 1980) is a Russian professional ice hockey right winger currently playing for HC Ryazan in the Russian Major League. 
He played in the Russian Superleague for Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk. He was drafted 144th overall in the 1998 NHL Entry Draft by the Edmonton Oilers. Are we justified in saying that \"oleg smirnov won the super league with Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk\"? Yes, no, or maybe? Maybe\n###\nThe 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns. Are we justified in saying that \"The 8.8 cm Flak was the gun of choice for female soldiers\"? Yes, no, or maybe? Maybe\n###\nFoaly is a fictional character in the Artemis Fowl series written by Eoin Colfer. He is the technical consultant to the Lower Elements Police (LEP). He is the most intelligent centaur on and under the Earth, considers himself to be an unappreciated genius, and is the inventor of most of the advanced technology possessed by the fairy world, rivaled only by Opal Koboi. Are we justified in saying that \"Eoin Colfer never wrote a book.\"? Yes, no, or maybe? No\n###\nRelient K is the debut studio album by American rock band Relient K. Many of the tracks are newer versions of those found on their 1998 demo \"All Work & No Play\". Typical of early Relient K albums, the lyrics use pop culture references for teaching and to illustrate Biblical principles. As of late 2006/early 2007, this album has sold around 400,000 copies. Are we justified in saying that \"Relient K band is from America.\"? Yes, no, or maybe? Yes\n###\nStanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele. Are we justified in saying that \"Stanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward but he is retired from football now.\"? Yes, no, or maybe?", "doc_id": 185, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1796, 42279, 34736, 37412], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Anne Frank: The Diary of a Young Girl is an original radio play by author Meyer Levin (1905\u20131981). It was adapted from Levin\u2019s original stage dramatization of the same name, adapted from \"The Diary of a Young Girl\", Anne Frank's diary. It aired on CBS on September 18, 1952, the eve of Rosh Hashanah, to critical acclaim, and again in November 1952. Are we justified in saying that \"The radio play has the same name as Anne Frank's diary. \"? Yes, no, or maybe? Yes\n###\nHow Murray Saved Christmas is a 2014 animated musical television special, directed by Peter Avanzino and written by Mike Reiss. 
The voice actors include Jerry Stiller, Sean Hayes, Kevin Michael Richardson, Jason Alexander, John Ratzenberger and Dennis Haysbert. Are we justified in saying that \" How Murry Saved Christmas was produced by Peter Avanzino. \"? Yes, no, or maybe? Maybe\n###\nAucuba chinensis is a shrub or small tree, native to southern China, Taiwan, Burma and northern Vietnam. Typically it grows to 6 meters tall, though it can be larger. The leaves are thick, dark green above and light green below, sometimes with teeth along the margins. Are we justified in saying that \"The teeth only grow on the lower leaves.\"? Yes, no, or maybe? Maybe\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"Idris was a no nonsense leader.\"? Yes, no, or maybe? Maybe\n###\nMaya & Marty was an American television variety show that premiered on NBC on May 31, 2016 and lasted one season. The series was co-hosted by comedians Maya Rudolph and Martin Short, and produced by Lorne Michaels. The show features various comedy sketches, musical performances, and celebrity guests. Are we justified in saying that \"Maya & Marty was a variety show that ended on May 31, 2016, after just one season.\"? Yes, no, or maybe?", "doc_id": 417, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1649, 27390, 25627, 31626], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Whitechapel is a British television drama series produced by Carnival Films, in which detectives in London's Whitechapel district dealt with murders which replicated historical crimes. The first series was first broadcast in the UK on 2 February 2009 and depicted the search for a modern copycat killer replicating the murders of Jack the Ripper. Are we justified in saying that \"Whitechapel premiered on late 2009.\"? Yes, no, or maybe? No\n###\nThe Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park. Are we justified in saying that \"Newnes railway line reopened last year.\"? Yes, no, or maybe? No\n###\nThe Big Cube is a 1969 American thriller film directed by Tito Davison and starring Lana Turner, Karin Mossberg, George Chakiris, Daniel O'Herlihy and Richard Egan; it was one of Lana Turner's last movies. It is notable for its aggressive portrayal of LSD use and the 1960s youth counterculture as vicious evils. Are we justified in saying that \"Lana Turner did not appear in many movies after 1969\"? Yes, no, or maybe? 
Yes\n###\nKathleen Delaney is an American actress, voice actress, singer, and dancer who works on Broadway and on the properties of 4Kids Entertainment. She is best known as the voice of Hina in the 4Kids dub of One Piece, Mai Valentine in uncut versions of \"Yu-Gi-Oh!\" and Rouge in \"Sonic X\" and the succeeding games until 2010, when she was replaced by Karen Strassman. Are we justified in saying that \"Kathleen Delaney is an American actress who has often worked in children's films. \"? Yes, no, or maybe? Maybe\n###\nDave Dennis (born 20 January 1986 in Sydney) is a national representative rugby union footballer who plays professionally for the Exeter Chiefs He was educated at Richmond High School in Sydney, when he played in the Australian Schoolboys Rugby team in 2004. His primary position is blindside flanker. He can also play No.8. Are we justified in saying that \"Dave Dennis was born 19 days after New Years holiday\"? Yes, no, or maybe?", "doc_id": 798, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25579, 27623, 3492, 34671], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1972 Grantland Rice Bowl was an NCAA College Division game following the 1972 season, between the Louisiana Tech Bulldogs and the Tennessee Tech Golden Eagles. Louisiana Tech quarterback Denny Duron was named outstanding offensive player, while his teammate linebacker Joe McNeely was named outstanding defensive player. Are we justified in saying that \"The 1972 Grantland Rice Bowl took place in the year of the rat in the chinese zodiac.\"? Yes, no, or maybe? Yes\n###\nKimberly Ane Peirce (born September 8, 1967) is an American feature film director, best known for her debut feature film, \"Boys Don't Cry\" (1999). Her second feature, \"Stop-Loss\", was released by Paramount Pictures in 2008. Her most recent feature film, \"Carrie\", was released on October 18, 2013. Are we justified in saying that \"Kimberly Ane Peirce is 40 plus\"? Yes, no, or maybe? Yes\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points. Are we justified in saying that \"The 1994 Nebraska Cornhuskers football team was coached by Tom Osborne\"? Yes, no, or maybe? Yes\n###\nLast Place is the fifth studio album by the American indie rock band Grandaddy, released on March 3, 2017 on 30th Century Records. Self-produced and recorded by the band's frontman and primary recording artist Jason Lytle, the album is the first by Grandaddy since \"Just Like the Fambly Cat\" (2006) and the band's prior break-up. Are we justified in saying that \"Last Place was released in the 19th century.\"? Yes, no, or maybe? No\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. 
Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015. Are we justified in saying that \"The famous politician Okezie Ikpeazu is the current governor of Abia State, he is the first ever person to be elected as the governor of the large Abia State\"? Yes, no, or maybe?", "doc_id": 509, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2139, 18639, 18832, 6884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rampage is an upcoming American action adventure monster film directed by Brad Peyton and written by Ryan Engle. It is based on the 1980s arcade video game of the same name. The film stars Dwayne Johnson, Naomie Harris, Malin \u00c5kerman, Joe Manganiello, Jake Lacy, Marley Shelton, and Jeffrey Dean Morgan. New Line Cinema will release the film on April 20, 2018 in 3D and IMAX. Are we justified in saying that \"The movie will bomb at the box office.\"? Yes, no, or maybe? Maybe\n###\nMemento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano. Are we justified in saying that \"Suzanna Todd was involved in Memento\"? Yes, no, or maybe? Yes\n###\nPata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House. Are we justified in saying that \"The film Punjabi was released to critical and box office success in 2012. \"? Yes, no, or maybe? Maybe\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland. Are we justified in saying that \"Phacelia pedicellata is not native to the United States\"? Yes, no, or maybe? No\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line. Are we justified in saying that \"Takoma is in the countryside of Maryland. \"? 
Yes, no, or maybe?", "doc_id": 227, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44585, 31928, 22579, 21185], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1947 KLM Douglas DC-3 Copenhagen accident was the crash of a KLM Royal Dutch Airlines flight from Amsterdam to Stockholm via Copenhagen on 26 January. The accident occurred shortly after the Douglas DC-3 took off from Kastrup in Denmark. All 22 passengers and crew on board were killed in the accident. Are we justified in saying that \"The capacity of the Douglas DC-3 was more than 25.\"? Yes, no, or maybe? Maybe\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"Campbell, a character based on Reid himself, was a boy in 1865 in the book New Day.\"? Yes, no, or maybe? Maybe\n###\nBrushstrokes in Flight is a 1984 sculpture by Roy Lichtenstein, installed at the John Glenn Columbus International Airport in Columbus, Ohio. It is part of the \"Brushstrokes\" series of artworks that includes several paintings and sculptures whose subject is the actions made with a house-painter's brush. Are we justified in saying that \"The John Glenn Columbus International Airport in Columbus, Ohio has had other artwork installed besides the series Brushstrokes.\"? Yes, no, or maybe? Maybe\n###\nUSS \"Fletcher\" (DD/DDE-445), named for Admiral Frank Friday Fletcher, was the lead \"Fletcher\"-class destroyer , and served in the Pacific during World War II. She received fifteen battle stars for World War II service, and five for Korean War service. Are we justified in saying that \"USS \"Fletcher\" got 15 battle stars for its WWII service\"? Yes, no, or maybe? Yes\n###\nChief Crazy Horse is a 1955 American CinemaScope Technicolor Western film directed by George Sherman starring Victor Mature, Suzan Ball and John Lund. The film is a fictionalized biography of the Lakota Sioux Chief Crazy Horse. It was also known as \"Valley of Fury\". Are we justified in saying that \"Chief Crazy Horse was not a real person\"? Yes, no, or maybe?", "doc_id": 257, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19882, 21147, 19862, 34869], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sasco is a ghost town located in Pinal County, Arizona, west of Red Rock. 
Sasco, which is an acronym for the Southern Arizona Smelter Company, was a company town with a large smelter that served several mines. Once an impressive and little-known ghost town, today Sasco is a common sporting destination with shotgun shells, airsoft bb's, paintball splatter, and litter in the area. Are we justified in saying that \"There is a large Airsoft tournament held every year at the old Southern Arizona Smelter Company site.\"? Yes, no, or maybe? Maybe\n###\nGreat Balls of Fire! is a 1989 American biographical film directed by Jim McBride and starring Dennis Quaid as rockabilly pioneer Jerry Lee Lewis. Based on a biography by Myra Lewis and Murray M. Silver Jr., the screenplay is written by McBride and Jack Baran. The film is produced by Adam Fields, with executive producers credited as Michael Grais, Mark Victor, and Art Levinson. Are we justified in saying that \"Great Balls of Fire! came out in the 20th century.\"? Yes, no, or maybe? Yes\n###\nThe Lawrence Brown House, better known as the L.B. Brown House is the home of Lawrence Bernard Brown a self-made businessman, community leader, and master carpenter. The importance of the L.B. Brown House is that it may be the only home built by a former enslaved person, left in Florida. The house \"stands as a living testimony to one person's triumph over adversity.\" Are we justified in saying that \"L.B. Brown House has a moat.\"? Yes, no, or maybe? Maybe\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex. Are we justified in saying that \"Muccan Station is a large railway station\"? Yes, no, or maybe? No\n###\nBaya M. Harrison, Jr. (1912 in Tampa, Florida \u2013 1975) was a politician and an attorney in Florida. He served as Chairman of the Florida Board of Control from 1960\u20131964. Harrison greatly impacted the State University System of Florida and helped desegregate Florida colleges and universities. He served as President of the Florida Bar in 1957. Are we justified in saying that \"Baya M. Harrison, Jr. was born after 1913\"? Yes, no, or maybe?", "doc_id": 442, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18602, 30473, 36791, 38655], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Memento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano. Are we justified in saying that \"Memento was released in 2000\"? Yes, no, or maybe? Yes\n###\nJohn Ormsby Vandeleur (1765 \u2013 28 November 1828) was an Irish barrister, landowner and politician from Kilrush in County Clare. 
He sat in the House of Commons of Ireland from 1790 to 1800, and then in the House of Commons of the United Kingdom from 1801 to 1802. Are we justified in saying that \"John has a rough 60 something years \"? Yes, no, or maybe? Maybe\n###\n\"Don't Look Back\" is a song by British pop-rock band Fine Young Cannibals. It was released as the third single from the band's 1988 album \"The Raw & the Cooked\". The song reached the top 40 charts in the United Kingdom, United States, Canada, Australia, and New Zealand. Are we justified in saying that \"Fine Young Cannibals have released an album.\"? Yes, no, or maybe? Yes\n###\nForever the Moment () is a 2008 South Korean film. It is a fictionalized account of the South Korea women's handball team which competed in the 2004 Summer Olympics. The Korean title translates as \"The Best Moment in Our Lives,\" and it is believed to be the first film that revolves around the sport of handball. Are we justified in saying that \"Forever the moment is a movie that my grandchildren will be able to see.\"? Yes, no, or maybe? Maybe\n###\nLatin Jam Workout is a Latin Dance Fitness Program created by professional athlete and choreographer JP Santana. Founded in 2007 in Los Angeles, California, Latin Jam Workout combines techno and Latin music with dance and aerobic movements. It is a fusion of Latin dance steps such as Salsa, Merengue, Raeggaeton, Cumbia, Samba, Soca, Belly-Dancing and the faster-paced rhythms of Pop and Techno. Are we justified in saying that \"Latin Jam Workout has existed for twenty-five years.\"? Yes, no, or maybe?", "doc_id": 833, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [826, 21844, 36072, 19989], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Accelrys is a software company headquartered in the United States, with representation in Europe and Asia. It provides software for chemical, materials and bioscience research for the pharmaceutical, biotechnology, consumer packaged goods, aerospace, energy and chemical industries. Are we justified in saying that \"accelrys was profitable the past 7 years\"? Yes, no, or maybe? Maybe\n###\nShould the World Fail to Fall Apart is the first album by the British solo artist Peter Murphy, formerly of the gothic rock band Bauhaus. The album contains Murphy's covers of Magazine's \"The Light Pours Out of Me\" and Pere Ubu's \"Final Solution.\" It was released in 1986. Are we justified in saying that \"It was released in 1986 and sold very well.\"? Yes, no, or maybe? Maybe\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015. Are we justified in saying that \"Sebo is a professional skateboarder and artist.\"? Yes, no, or maybe? Yes\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. 
Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"There is more than 1 junction\"? Yes, no, or maybe? Maybe\n###\nLady Pamela Carmen Louise Hicks (\"n\u00e9e\" Mountbatten; born 19 April 1929) is a British aristocrat. She is the younger daughter of the 1st Earl Mountbatten of Burma by his wife, Edwina Mountbatten. Through her father, Lady Pamela is a first cousin of Prince Philip, Duke of Edinburgh and a great niece of the last Empress of Russia, Alexandra Feodorovna. Are we justified in saying that \"Hicks is the oldest child\"? Yes, no, or maybe?", "doc_id": 55, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10161, 40916, 41485, 11149], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"Zuikaku took part in the Battle of Leyte Gulf.\"? Yes, no, or maybe? Yes\n###\nFan and Mortar Geysers are two geysers in the Upper Geyser Basin in Yellowstone National Park. For the past several decades, they have erupted in concert with one another and are generally talked about together. The records detailing these geysers' known eruptive history shows that they have been infrequent and irregular performers. Are we justified in saying that \"Sanse plays home games at multiple stadiums.\"? Yes, no, or maybe? Maybe\n###\nMaria Ho (born March 6, 1983 in Taipei, Taiwan) is a Taiwanese American poker player, television personality and host. She is known as one of the top ranked female poker players in the world; a 3-time Bluff Reader's Choice Awards nominee for Favorite Female Poker Player and a World Series of Poker record-breaker, and for competing on the 15th season of \"The Amazing Race\". Are we justified in saying that \"Mario Ho became one of the top ranked female poker players in the world by becoming a World Series of Poker record-breaker.\"? Yes, no, or maybe? Yes\n###\nAlexander Stewart Jolly (1887\u20131957) was a Sydney-based architect, published poet and children\u2019s author in the early 20th century. His buildings are primarily in Sydney's northern suburbs and the north coast of New South Wales. His architectural work was strongly influenced by Frank Lloyd Wright\u2019s School in Chicago, as well as the Arts and Crafts movement of the time. Are we justified in saying that \"Alexander Stewart Jolly enjoyed arts and crafts \"? Yes, no, or maybe? Maybe\n###\nPeter John \"P. 
J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\". Are we justified in saying that \"Peter John \"P. J.\" Carlesimo has a M.\"? Yes, no, or maybe?", "doc_id": 133, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34829, 4922, 13010, 44882], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Eternally\" is a song with music by Charles Chaplin, and words by the English lyricists Geoff Parsons and John Turner. The music was initially composed for Charles Chaplin's film \"Limelight\" (1952) titled \"Terry's Theme\"; the film won an Oscar for \"Best Original Dramatic Score\" at the Are we justified in saying that \"\"Eternally\" was written by Charlies Chapin\"? Yes, no, or maybe? No\n###\nJordan Klepper (born March 9, 1979) is an American comedian, writer, producer, political commentator, actor and television host. He is best known for being a correspondent on \"The Daily Show\" for 170 episodes between 2014-2017, and after his departure therefrom for hosting the satirical Comedy Central program \"The Opposition with Jordan Klepper\" beginning in the fall of 2017. Are we justified in saying that \"Jordan Klepper worked for The Daily Show.\"? Yes, no, or maybe? Yes\n###\nJohn Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s. Are we justified in saying that \"John Henry Newman died in the Fall of 1890.\"? Yes, no, or maybe? Yes\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer. Are we justified in saying that \"Loui Jover washes paintings\"? Yes, no, or maybe? No\n###\nJames Proud is a British entrepreneur, and former CEO of Hello, a technology company that created the personal sleep tracker Hello Sense. Founded in 2012, Hello raised over $30 million in venture capital funding and $2.4 million from a Kickstarter campaign for Hello Sense before ultimately shutting down in June 2017. James Proud received an inaugural Thiel Fellowship in 2011. Are we justified in saying that \"Hello Sense was shut down by July 2017.\"? Yes, no, or maybe?", "doc_id": 428, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29076, 6972, 43390, 26967], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Exergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid. Are we justified in saying that \"Exergonix Inc was once a part of another company.\"? Yes, no, or maybe? Yes\n###\nMary Pierce (born 15 January 1975) is a French retired tennis professional who played on the Women's Tennis Association (WTA) tour. Born in Canada, she is a citizen of Canada, and the United States. Pierce played for France in team competitions and in the Olympics. Are we justified in saying that \"Born in United States, she is a citizen of Canada, and the United States\"? Yes, no, or maybe? No\n###\nAn Act for naturalizing Louis Sekeyhaye, George Frederick Handel, and others (13 Geo. I), later given the short title of Handel's Naturalisation Act 1727, was a 1727 Act of the Parliament of Great Britain with the intent of naturalising and granting British citizenship to German-born composer George Frideric Handel and other foreign citizens. Are we justified in saying that \"Handel's Naturalisation Act 1727 included many other people in the title in the longer version\"? Yes, no, or maybe? Yes\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night. Are we justified in saying that \"Sally Blane made more money than Wallace Ford.\"? Yes, no, or maybe? Maybe\n###\nIra Heiden (born September 22, 1966) is an American television and film actor, perhaps best known for his role in the 1987 horror film \"\" as Will Stanton. Ira's other film roles include the 1988 film \"Elvira, Mistress of the Dark\" and his most recent film is the 1996 film \"Timelock\". Are we justified in saying that \"Heiden was in Forrest Gump.\"? Yes, no, or maybe?", "doc_id": 855, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4384, 38397, 32604, 45339], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "State Route 360 (SR 360) is a state highway in the southern portion of Mineral County, Nevada, United States. The route connects the former town of Basalt to the rest of Mineral County. A road has been in the place of SR 360 since 1919, and became State Route 10 by 1929. Are we justified in saying that \"State Route 360 can be found in Wisconsin.\"? Yes, no, or maybe? 
No\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines is not motivated by profit.\"? Yes, no, or maybe? Yes\n###\nThe Legend of Paradise Island is a Hawaiian Musical Fantasy in two acts, with book, music, and lyrics by Carmen Lombardo and John Jacob Loeb. The book was adapted by Francis Swann. The show was produced by Guy Lombardo at the Jones Beach Marine Theater. The show opened on June 22, 1961. Are we justified in saying that \"The Legend of Paradise Island featured singing.\"? Yes, no, or maybe? Yes\n###\nClaus Biederstaedt (born 28 June 1928 in Stargard, today Poland) is a German actor and voice actor. He studied in Hamburg and began his career working with Joseph Offenbach. Among the actors for whom he has dubbed have been Yves Montand, Peter Falk, Marlon Brando, Vittorio Gassman, and James Garner. Are we justified in saying that \"Claus Biederstaedt is not necessarily always born in Stargard.\"? Yes, no, or maybe? No\n###\nAlrifai is a Lebanese multinational nut retailing company headquartered in Beirut, Lebanon, and a wholly owned subsidiary of Alrifai International Holding Ltd. It is the largest nut retailing chain in the Middle East and the company with the biggest market share in Lebanon. Are we justified in saying that \"Alrifal holds the second biggest share of the nut market in Lebanon\"? Yes, no, or maybe?", "doc_id": 863, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14060, 36521, 23572, 32985], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o was born in Germany\"? Yes, no, or maybe? No\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines is based in Puerto Rico.\"? Yes, no, or maybe? Yes\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"He was co director of the stanford program\"? Yes, no, or maybe? 
Yes\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate. Are we justified in saying that \"The company will make a bar with no toffee.\"? Yes, no, or maybe? Maybe\n###\nIn the Ugric mythology, Kaltes-Ekwa (Khanty, Kaltes Ankw) was the mother of the hero Mir-Susne-Hum and the wife of the god Num-Torum, who defeated her in heaven. She was also a goddess of the moon associated with the month April; a birth giving goddess (she is called upon by women in child-birth); goddess of fate; goddess of dawn and a shape-shifter, often shown manifested as a hare. Are we justified in saying that \"Num-Torum is the god of June.\"? Yes, no, or maybe?", "doc_id": 830, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11597, 38787, 3055, 24472], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael George Stroka (May 9, 1938 in Passaic, New Jersey \u2013 April 14, 1997) was an American actor on soap operas like ABC-TV's \"Dark Shadows\", in which he played Aristede, Bruno Hess, and Laszlo Ferrari from 1969 to 1970. In addition, he made a cameo appearance as a pallbearer in the MGM film, \"House of Dark Shadows\", the first of two feature films based on the ABC soap opera. Are we justified in saying that \"Michael George Stroka was born in the 30's.\"? Yes, no, or maybe? Yes\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984 on time, and is 1412 m in length.\"? Yes, no, or maybe? Maybe\n###\nThe Monument to Vasil Levski (Bulgarian: \u041f\u0430\u043c\u0435\u0442\u043d\u0438\u043a \u043d\u0430 \u0412\u0430\u0441\u0438\u043b \u041b\u0435\u0432\u0441\u043a\u0438 , \"Pametnik na Vasil Levski\") in the centre of Sofia, the capital of Bulgaria, is one of the first monuments to be built in the then newly liberated Principality of Bulgaria. It commemorates the hanging of Bulgarian national hero and major revolutionary figure Vasil Levski on the same spot on 18 February 1873. Are we justified in saying that \"It commemorates the accomplishment of Vasil\"? Yes, no, or maybe? 
No\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Tillya tepe is worth millions of dollars.\"? Yes, no, or maybe? Maybe\n###\nMajor General Sen\u00e9n Casas Regueiro (30 July 1934 in Bomb\u00ed - 6 August 1996) was a Cuban politician. He was the Cuban Minister of Transportation from 1989 till his death. He was also the First Deputy Minister of Defence and the Chief of Staff of the Cuban Army. He was a brother of another Cuban politician Julio Casas Regueiro. Are we justified in saying that \"Julio Casas Regueiro had two brothers.\"? Yes, no, or maybe?", "doc_id": 938, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30545, 35244, 28651, 14038], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Artur Edler von Mecenseffy (23 June 1865, Vienna \u2014 6 October 1917, Asiago) was an Austro-Hungarian Army officer who held the rank of \"Feldmarschall-leutnant\" (\"lieutenant field marshal\") and served during World War I, becoming the highest ranking officer of Austria-Hungary to be killed on the battlefield. Are we justified in saying that \"Artur did not die before 6 October 1917.\"? Yes, no, or maybe? Yes\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"Lourdes Ver\u00f3nica Ar\u00e9valos Elias is a highly paid model\"? Yes, no, or maybe? Maybe\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality. Are we justified in saying that \"Marcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) married Richard B. Wall and was an American artist and voice actress.\"? Yes, no, or maybe? Maybe\n###\nThe Mission Viejo Vigilantes were a minor league baseball team located in Mission Viejo, California. The team played in the independent Western Baseball League, and was not affiliated with any Major League Baseball team. Their home stadium was Mission Viejo Stadium near Saddleback College. Are we justified in saying that \"There aren't any Major League Baseball teams based in California.\"? Yes, no, or maybe? 
Maybe\n###\nHaverstraw is a village incorporated in 1854 in the town of Haverstraw in Rockland County, New York, United States. It is located north of Congers, southeast of West Haverstraw, east of Garnerville, northeast of New City, and west of the Hudson River at its widest point. According to the 2013 U.S. Census estimate, the population was 12,102, an increase from the 2010 Census population of 11,910. Are we justified in saying that \"The population for Haverstraw had a higher census estimate in 2011 than the year before, 2010. \"? Yes, no, or maybe?", "doc_id": 20, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12782, 4802, 45074, 10309], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stainer & Bell Limited is a British publisher of classical sheet music and books, based in London. Stainer, founded in 1907, publish the works of a number of significant twentieth-century composers, including Charles Villiers Stanford, Gustav Holst, Ralph Vaughan Williams, and Herbert Howells. They also publish a number of earlier composers, including Henry VIII, William Byrd, and Henry Purcell. Are we justified in saying that \"Stainer & Bell Limited was founded in the 20th century\"? Yes, no, or maybe? Yes\n###\nLorca F\u00fatbol Club, S.A.D. is a Spanish football team based in Lorca, in the autonomous community of the Region of Murcia. Founded in 2003, it currently plays in Segunda Divisi\u00f3n, holding home games at Estadio Francisco Art\u00e9s Carrasco, which has a capacity of 8,120. Are we justified in saying that \"They do not participate in the Segunda Division.\"? Yes, no, or maybe? No\n###\nThe Texas A&M Aggie baseball team represents Texas A&M University in NCAA Division I college baseball. The Aggies have competed in the Southeastern Conference since 2013. The Aggies play home games at Olsen Field at Blue Bell Park. The team is led by head coach Rob Childress. Are we justified in saying that \"The Aggies play all games at Olsen Field.\"? Yes, no, or maybe? No\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh. Are we justified in saying that \"TSS was once owned by a government agency\"? Yes, no, or maybe? Yes\n###\nWestbury Senior High School is a ninth-through-12th-grade school of the Westbury Union Free School District (USFD No. 1), the district covering the village of Westbury, New York, USA. Its current building, located on Post Road at Jericho Turnpike in Old Westbury, New York (just north of Westbury Village), reached its 50th anniversary in 2008. Are we justified in saying that \"Westbury Senior High School has a pre-k.\"? 
Yes, no, or maybe?", "doc_id": 533, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41729, 29545, 27296, 7659], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mary Eliza Mahoney (May 7, 1845 \u2013 January 4, 1926) was the first African American to study and work as a professionally trained nurse in the United States, graduating in 1879. Mahoney was one of the first African Americans to graduate from a nursing school, and she prospered in a predominantly white society. She also challenged discrimination against African Americans in nursing. Are we justified in saying that \"Mahoney was an avid supporter of all prejudice.\"? Yes, no, or maybe? No\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"He was very strict\"? Yes, no, or maybe? Maybe\n###\nInnyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169. Are we justified in saying that \"Innylay is a well known locality. \"? Yes, no, or maybe? Maybe\n###\nRodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets. Are we justified in saying that \"Georgia bears many players for the Redskins.\"? Yes, no, or maybe? Maybe\n###\nTodd Wash (born July 19, 1968) is an American football coach who is the defensive coordinator for the Jacksonville Jaguars of the National Football League (NFL). From 2013 to 2015 he was the defensive line coach and run game coordinator for the Jacksonville Jaguars. Are we justified in saying that \"If one subtracts the \"t\" from the beginning of Todd's first name, one is left with a word that can mean \"strange\".\"? Yes, no, or maybe?", "doc_id": 634, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35755, 20104, 31829, 23254], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough. Are we justified in saying that \"Merry Christmas, Charlie Manson! aired over 7 years ago\"? Yes, no, or maybe? Yes\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\". Are we justified in saying that \"Terry Jones was paid 3,000,000 dollars to sing at the Hollywood Bowl.\"? Yes, no, or maybe? Maybe\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"Sheree Victoria Murphy was born in July\"? Yes, no, or maybe? No\n###\nJersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide. Are we justified in saying that \"Chinese critics had mixed reactions about Jersey Boys.\"? Yes, no, or maybe? Maybe\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Sushil Kumar Shinde to become the 11th Vice President of India\"? Yes, no, or maybe?", "doc_id": 719, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21381, 41524, 33914, 25597], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. 
In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue. Are we justified in saying that \"the drake hotel is a venue\"? Yes, no, or maybe? Yes\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"Allen S Weiner is currently a professor at Stanford\"? Yes, no, or maybe? No\n###\nMaricopa County ( ) is a county in the south-central part of the U.S. state of Arizona. As of the 2010 census, its population was 3,817,117, making it the state's most populous county, and the fourth-most populous in the United States. It is more populous than 23 states. The county seat is Phoenix, the state capital and fifth-most populous city in the country. Are we justified in saying that \"The state capital starts with a P\"? Yes, no, or maybe? Yes\n###\nThe Little Girl Next Door is a 1912 American silent short drama directed by Lucius Henderson and written by Philip Lonergan. The film starred William Garwood and Marguerite Snow in the lead roles. Prints of the film are in the Library of Congress and other collections. Are we justified in saying that \"The Little Girl Next Door is a 1807 American silent short drama directed by Lucius Henderson and written by Philip Lonergan. \"? Yes, no, or maybe? No\n###\nFire!! was an African-American literary magazine published in New York City in 1926 during the Harlem Renaissance. The publication was started by Wallace Thurman, Zora Neale Hurston, Aaron Douglas, John P. Davis, Richard Bruce Nugent, Gwendolyn Bennett, Lewis Grandison Alexander, Countee Cullen, and Langston Hughes. After it published one issue, its quarters burned down, and the magazine ended. Are we justified in saying that \"Thousands of issues were published.\"? Yes, no, or maybe?", "doc_id": 485, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36549, 39627, 17514, 18049], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines covers news related to design.\"? Yes, no, or maybe? Yes\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. 
Are we justified in saying that \"Foals have been covered by tool.\"? Yes, no, or maybe? Maybe\n###\nConnacht Rugby (Irish: \"Rugba\u00ed Connachta\" ) is one of the four professional provincial rugby teams from the island of Ireland. Connacht competes in the Pro14 and the European Rugby Challenge Cup. The team represents the IRFU Connacht Branch, which is one of four primary branches of the IRFU, and is responsible for rugby union throughout the geographical Irish province of Connacht. Are we justified in saying that \"Connacht Rugby is well-known.\"? Yes, no, or maybe? Maybe\n###\nAfter the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal. Are we justified in saying that \"After the Empire of Japan invaded and occupied the Northeast in 1931, the Japanese Communist Party organized small anti-Japanese guerrilla units\"? Yes, no, or maybe? No\n###\n\"Touch Me With Your Love\" is a song by Beth Orton, released as the fourth single from 1996 album \"Trailer Park\". It contains 4 songs, and was released on C.D. and vinyl. The release peaked at #60 in the UK official singles chart. It was also released in Australia with a different track listing, and was the first release by Orton to have a promotional video made for it. Are we justified in saying that \"\"Touch Me With Your Love\" achieved a ranking higher than #60 at certain points after its release in the UK official singles chart.\"? Yes, no, or maybe?", "doc_id": 733, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17985, 7654, 3263, 27218], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bruno Mingeon (born September 7, 1967 in Bourg-Saint-Maurice, Savoie) is a French bobsledder who competed from 1988 to 2006. Competing in five Winter Olympics, he won a bronze medal in the four-man event (tied with Great Britain) at Nagano in 1998. He was born in Bourg-Saint-Maurice. Are we justified in saying that \"Bruno Mingeon was born in Savoie in 1988.\"? Yes, no, or maybe? No\n###\nThe Inter-American Peace Force (IAPF) was established, by the Organization of American States, on 23 May 1965, after the United States's intervention in the Dominican Republic. It largely consisted of over 42,600 United States military personnel, plus the following troops were sent by each country; Are we justified in saying that \"The Inter-American Peace Force has more than American troops\"? Yes, no, or maybe? Yes\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C. 
Are we justified in saying that \"Luton Town Ladies Football Club was founded in an even numbered year.\"? Yes, no, or maybe? No\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\". Are we justified in saying that \"Kidsty Pike has in recent years flowed away from the English Lake DIstrict\"? Yes, no, or maybe? No\n###\nBremen ( ) is a small town in Lincoln County, Maine, United States. The population was 806 at the 2010 census. Located on Muscongus Bay and the Gulf of Maine, it includes the villages of Broad Cove, Turners Corner, Bremen, Medomak and Muscongus. Hog Island is a center and camp for the Maine chapter of the National Audubon Society. Are we justified in saying that \"Maine is smaller than Bremen\"? Yes, no, or maybe?", "doc_id": 364, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14592, 10904, 17344, 22060], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "High Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX. Are we justified in saying that \"Hugh Noon Toons was terrible at ratings.\"? Yes, no, or maybe? Maybe\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium. Are we justified in saying that \"moulton loves phoenix capital\"? Yes, no, or maybe? Maybe\n###\nCarlyle Eubank is an American writer and screenwriter. His 2014 film \"The Signal\", starring Laurence Fishburne, Brenton Thwaites, and Olivia Cooke, premiered at the 2014 Sundance Film Festival and was released in US theaters on June 13 by Focus Features. Are we justified in saying that \"A spring U.S. theater release is what awaited this film after Sundance.\"? Yes, no, or maybe? Yes\n###\nState Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long. Are we justified in saying that \"The route is barely used \"? Yes, no, or maybe? Maybe\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. 
It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles. Are we justified in saying that \"Dick Crealy and Peaches Bartkowicz beat eachother in the 1970 Swedish Open.\"? Yes, no, or maybe?", "doc_id": 81, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39879, 13757, 30986, 29083], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Donaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville. Are we justified in saying that \"There is another airport in Greenville County.\"? Yes, no, or maybe? Maybe\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home. Are we justified in saying that \"Coriolano: eroe senza patria inspired many future films\"? Yes, no, or maybe? Maybe\n###\nDance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster. Are we justified in saying that \"Mithun Chakraborty has been a Grandmaster at least two times.\"? Yes, no, or maybe? Yes\n###\nThe Red Hill Fire Observation Station consists of a fire lookout tower, cabin and pit privy located on the summit of Red Hill, a 2990 ft Catskill Mountain peak in Denning, New York, United States. It is the southernmost fire tower in the Catskill Park. Are we justified in saying that \"The Red Hill Fire Observation Station is located in Catskill Park.\"? Yes, no, or maybe? Yes\n###\nRobert Paul Irvine (born 24 September 1965) is an English celebrity chef who has appeared on and hosted a variety of Food Network programs including \"\", \"Worst Cooks in America\", \"\", \"A Hero's Welcome, Operation Restaurant, All-Star Academy, Guy's Grocery Games, Chopped: Impossible\" and \"Restaurant Express\". Are we justified in saying that \"Irvine was born in the year after 1963.\"? Yes, no, or maybe?", "doc_id": 139, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40928, 33595, 11862, 10699], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Una questione privata is a 1993 Italian film directed by Alberto Negrin with a screenplay based on the WWII partisan novel of the same name by Beppe Fenoglio (1963) adapted by Raffaele La Capria. The film stars the young British actor Rupert Graves as Milton, C\u00e9line Beauvallet, and Claudio Mazzenga. Are we justified in saying that \"Una questione privata was a 1993 film based on a novel about WWII.\"? Yes, no, or maybe? Yes\n###\nThe Icelandic national under-18 basketball team is the representative for Iceland in international Under-18 age basketball competitions, and it is organized and run by the Icelandic Basketball Federation. The team represents Iceland at the FIBA Europe Under-18 Championship. It is coached by Fri\u00f0rik Ingi R\u00fanarsson. Are we justified in saying that \"The team is representing Iceland. \"? Yes, no, or maybe? Yes\n###\nKhan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht. Are we justified in saying that \"Khan Kluay 2 has a short ending.\"? Yes, no, or maybe? Maybe\n###\nCamping 3 is a 2016 French comedy film directed by Fabien Onteniente. It is a sequel to the 2010 film \"Camping 2\". The film was a box office success, having grossed over US$24.2 million in France, becoming the second highest-grossing domestic film in 2016, with 3,228,313 tickets sold. Are we justified in saying that \"Fabien Onteniente directed a film in 2016 that sold over 3 million tickets.\"? Yes, no, or maybe? Yes\n###\nDave Ward, born 12 July 1959, is a British Trade Unionist and General Secretary of the Communication Workers\u2019 Union (CWU), which was formed through the merger of the Union of Communication Workers and the National Communications Union in 1995. The CWU is the largest Trade Union in the United Kingdom for people working in the Postal and Telecommunications industry with over 200,000 members. Are we justified in saying that \"Trade Unions are good for zebras.\"? Yes, no, or maybe?", "doc_id": 801, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3133, 27334, 701, 32295], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Giovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg. Are we justified in saying that \"Ferrero SpA is worth $24.2 billion. \"? Yes, no, or maybe? 
Maybe\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997. Are we justified in saying that \"The album Goodnight Sweetheart has more then three songs.\"? Yes, no, or maybe? Yes\n###\nThe Attorney () is a 2013 South Korean courtroom drama film directed and co-written by Yang Woo-suk, in his directorial debut (Yang was previously a film producer and webtoon author). With 11,375,954 tickets sold and a revenue of \u20a982.9 billion , \"The Attorney\" became the 8th best-selling Korean film of all time, and the second highest-grossing Korean film of 2013. Are we justified in saying that \"Only Korean people saw The Attorney.\"? Yes, no, or maybe? Maybe\n###\nStanley Elphinstone Kerr (March 30, 1894 \u2013 December 14, 1976) was an American humanitarian, clinical biochemist and educator. He was the father of Malcolm H. Kerr, former president of the American University of Beirut, and the grandfather of NBA player, general manager, broadcaster, and coach Steve Kerr. Are we justified in saying that \"Kerr had an unknown impact on the game of basketball\"? Yes, no, or maybe? Maybe\n###\nThe 2017\u201318 Puebla season is the 70th professional season of Mexico's top-flight football league. The season is split into two tournaments\u2014the Torneo Apertura and the Torneo Clausura\u2014each with identical formats and each contested by the same eighteen teams.The Club will also play Copa MX.Rafael Garc\u00eda Torres was named the club head coach on June 5, 2017, taking over for sacked coach Jos\u00e9 Cardozo. Are we justified in saying that \"The 2017\u201318 Puebla season was unsuccessful\"? Yes, no, or maybe?", "doc_id": 160, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20024, 34274, 34242, 44170], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Erik \"The Red\" Denmark (born about 1980) is an American competitive eater and a member of the International Federation of Competitive Eating. He currently lives in Seattle, Washington and is nicknamed after Erik the Red, who was a Viking that founded the first Nordic settlement in Greenland. Are we justified in saying that \"He has been to Greenland.\"? Yes, no, or maybe? Maybe\n###\nMatthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season. Are we justified in saying that \"Matthew Mansfield was born within the last 9876 days.\"? Yes, no, or maybe? No\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. 
The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"They started making music in their basement\"? Yes, no, or maybe? Maybe\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\". Are we justified in saying that \"Terry Jones was born in nineteen hundred fifty three.\"? Yes, no, or maybe? Maybe\n###\nThe Veterinary Medical College Application Service (VMCAS) is a centralized application service for students applying to veterinary school. Created by the Association of American Veterinary Medical Colleges (AAVMC) in 1995, VMCAS handles applications for most of the veterinary schools in the United States, as well as several in Canada, the United Kingdom, New Zealand and Australia. Are we justified in saying that \"prior to 1995 the United States, as well as Canada, the United Kingdom, New Zealand and Australia.did not have a centralized application service for veterinary students\n\"? Yes, no, or maybe?", "doc_id": 924, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24636, 24396, 16382, 14685], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute. Are we justified in saying that \"54-40 plays metal rock.\"? Yes, no, or maybe? No\n###\nSusan Peters (born Suzanne Carnahan; July 3, 1921 \u2013 October 23, 1952) was an American film, stage, and television actress. After studying acting with Austrian theatre director Max Reinhardt, she appeared in several uncredited bit parts before earning a minor supporting part in \"Santa Fe Trail\" (1940). Her supporting role in \"Tish\" led to Peters signing a contract with Metro-Goldwyn-Mayer in 1942. Are we justified in saying that \"Susan Peters left a large legacy after she died.\"? Yes, no, or maybe? Maybe\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Ralph D. Malone left Miami Dolphins to join another team\"? Yes, no, or maybe? Maybe\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. 
She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award. Are we justified in saying that \"Rachel Brosnahan is an American actress who played her roles in only three films.\"? Yes, no, or maybe? Maybe\n###\nAnn Hui On-Wah, MBE (; Hepburn: \"Kyo Anka\"; born 23 May 1947) is a Hong Kong film director, producer, screenwriter and actress. She is one of the most critically acclaimed Hong Kong New Wave filmmakers. She is known for her films about social issues in Hong Kong. Are we justified in saying that \"Ann Hui On-Wah was born in the winter.\"? Yes, no, or maybe?", "doc_id": 27, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17760, 17244, 21750, 16187], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat. Are we justified in saying that \"Spanish Mastiff is very friendly towards children\"? Yes, no, or maybe? Maybe\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015. Are we justified in saying that \"Tom Osborne coached the Arkansas team in the 1985 season.\"? Yes, no, or maybe? No\n###\nLive in Concert is the second live music video title by singer and actress Cher. Released by HBO in 1999, it contained footage from Cher's Do You Believe? Tour specials filmed at the MGM Grand Garden Arena in Paradise, Nevada in 1999. It featured tracks from the Gypsys, Tramps & Thieves album to the Believe album, alongside various covers. She has 7 costume changes by stylist Bob Mackie. Are we justified in saying that \"Cher performs song from other artists.\"? Yes, no, or maybe? Yes\n###\n\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012. Are we justified in saying that \"Anna Sun is a song that was released in 2019\"? Yes, no, or maybe? No\n###\nThe Borrowers is a Hallmark Hall of Fame TV special first broadcast in 1973 on NBC. This made for television special is adapted from the 1952 Carnegie Medal-winning first novel of author Mary Norton's \"The Borrowers\" series: \"The Borrowers\". 
The film stars Eddie Albert, Tammy Grimes and Judith Anderson and was directed by Walter C. Miller. Are we justified in saying that \"Tammy Grimes is a television actress\"? Yes, no, or maybe?", "doc_id": 992, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35352, 41565, 39082, 26185], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ. Are we justified in saying that \"Frank Viola is European\"? Yes, no, or maybe? No\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison. Are we justified in saying that \"Alix Bancourt is blind.\"? Yes, no, or maybe? Maybe\n###\nThe East\u2013West Shrine Game is an annual postseason college football all-star game played each January since 1925. The game is sponsored by the fraternal group Shriners International, and the net proceeds are earmarked to some of the Shrine's charitable works, most notably the Shriners Hospitals for Children. The game's slogan is \"Strong Legs Run That Weak Legs May Walk\". Are we justified in saying that \"The East-West Shrine Game is an annual postseason college football all-star game.\"? Yes, no, or maybe? Yes\n###\nLarry Ruvo (1946) is the VP/GM of Southern Wine and Spirits of Nevada. He is a wine expert and philanthropist. He founded the Keep Memory Alive foundation and The Lou Ruvo Brain Institute. He serves on the American Gaming Association Board of Directors and is a member of the Gaming Hall of Fame (2005). He is married and has three daughters. Are we justified in saying that \"Larry Ruvo is not the VP/GM of Southern Wine and Spirits of Nevada.\"? Yes, no, or maybe? No\n###\nKimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996). Are we justified in saying that \"Beck worked with Alfred Hitchcock before she worked on her most famous movie role.\"? Yes, no, or maybe?", "doc_id": 873, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41899, 1460, 40329, 37735], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Insiders is a news and talk show produced by ABC News hosted by veteran political journalist Barrie Cassidy. Similarly to the Sunday morning talk shows in the United States, it analyses and discusses Australian politics with the use of a panel of political journalists and columnists and interviews with prominent politicians and commentators. Are we justified in saying that \"Insiders mainly focus on Australian politics.\"? Yes, no, or maybe? Yes\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg. Are we justified in saying that \"Marie Hedwig Auguste of Sulzbach was a man.\"? Yes, no, or maybe? No\n###\nSwinburne Online is the online arm of Swinburne University of Technology which is an Australian university based in Melbourne, Victoria. Swinburne Online was founded in 2011 after a 50-50 joint venture between Swinburne University of Technology and SEEK Learning seeking to capitalise on increasing demand for off-campus education. Are we justified in saying that \"Swingborn Online is an online university that is partnered with Swinburne University of Technology.\"? Yes, no, or maybe? Yes\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"Ticino is the swiss' most traveled canton\"? Yes, no, or maybe? Maybe\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"Lourdes Ver\u00f3nica Ar\u00e9valos Elias was born on a cold winter day.\"? Yes, no, or maybe?", "doc_id": 675, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31537, 37196, 3537, 41261], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Babar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. 
Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\". Are we justified in saying that \"Nelvana Limited is a Canadian based company\"? Yes, no, or maybe? Maybe\n###\nHistory of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. Are we justified in saying that \"Lucy's son has a book written about him.\"? Yes, no, or maybe? Yes\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird. Are we justified in saying that \"The Barbary ostrich was also referred to as The North African ostrich by scientists.\"? Yes, no, or maybe? Maybe\n###\nLe roi malgr\u00e9 lui (\"King in Spite of Himself\" or \"The reluctant king\") is an op\u00e9ra-comique in three acts by Emmanuel Chabrier of 1887 with an original libretto by Emile de Najac and Paul Burani. The opera is revived occasionally, but has not yet found a place in repertory. Are we justified in saying that \"It was written by Emmanuel Chabrier in 1887\"? Yes, no, or maybe? Yes\n###\nFrank John Gorshin, Jr. (April 5, 1933 \u2013 May 17, 2005) was an American character actor, impressionist, and comedian. He was perhaps best known as an impressionist, with many guest appearances on \"The Ed Sullivan Show\" and \"Tonight Starring Steve Allen\". His most famous acting role was as the Riddler on the live-action television series \"Batman\". Are we justified in saying that \"Frank John Gorshin, Jr. created \"The Ed Sullivan Show\"\"? Yes, no, or maybe?", "doc_id": 841, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18668, 39583, 30765, 26636], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The third season of \"Gossip Girl,\" an American teen drama based upon the book series by Cecily von Ziegesar. Developed for television by Josh Schwartz and Stephanie Savage. Airing on The CW from September 14, 2009 to May 17, 2010 with 22 episodes. The season premiered 2.55 million viewers and a 1.4 Adults 18-49 rating, up 14% in viewers from its season two finale. Are we justified in saying that \"Gossip Girl, is found on streaming services\"? Yes, no, or maybe? Maybe\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\". Are we justified in saying that \"Brown University should encourage more unique projects.\"? Yes, no, or maybe? 
Maybe\n###\nThe Stanchester Hoard is a hoard of 1,166 Roman coins dating from the fourth to early fifth century found at Wilcot, in the Vale of Pewsey, Wiltshire, England in 2000. The find was considered important because of the large quantity of unclipped silver coins contained within. It was also the latest dated example of Roman coins found in Wiltshire. Are we justified in saying that \"The Stanchester Hoard contains gold Roman coins\"? Yes, no, or maybe? Maybe\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"Johan Martin Schr\u00f6der was born before 1990\"? Yes, no, or maybe? Yes\n###\nKalavu (Kannada: \u0c95\u0cb3\u0cb5\u0cc1) is a 2013 Kannada movie based on Dr KY Narayanaswamy's novel of the same title. The movie is the directorial debut of Ravi M who has worked with the production of the Hollywood film Inferno . Two French films, \"Blue Mountains\" and \"Child in Pondicherry\", launched his career in art direction. The film stars Umashree, Kari Subbu, Hulagappa Kattimani and others. Are we justified in saying that \"Ravi M worked on movie sets.\"? Yes, no, or maybe?", "doc_id": 244, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14946, 27965, 38100, 17151], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues. Are we justified in saying that \"In May 2006, Ethereal changed it's name to Wireshark.\"? Yes, no, or maybe? Yes\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia. Are we justified in saying that \"There are 12 schools in the Big 12 Conference.\"? Yes, no, or maybe? No\n###\nThe Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis. Are we justified in saying that \"Jose Celestino Mutis was founded in the summer of 1955.\"? Yes, no, or maybe? 
Maybe\n###\nJack Thomas Chick (April 13, 1924 \u2013 October 23, 2016) was an American cartoonist and publisher, best known for his evangelical fundamentalist Christian \"Chick tracts\", which presented his perspective on a variety of issues through sequential-art morality plays. Are we justified in saying that \"Jack Thomas Chick was born in April\"? Yes, no, or maybe? Yes\n###\nJaeden Wesley Lieberher (born January 4, 2003) is an American actor. He is known for starring as Bill Denbrough in the horror film \"It\" (2017), and for his leading roles in the films \"St. Vincent\", as Oliver Bronstein, \"Midnight Special\", as Alton Meyer, \"The Confirmation\", as Anthony, \"The Book of Henry\", as Henry Carpenter. Are we justified in saying that \"Jaeden Wesley Lieberher was born after the 20th century\"? Yes, no, or maybe?", "doc_id": 519, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28492, 36, 42813, 23742], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bang the Drum Slowly is a 1973 American sports drama film directed by John D. Hancock, about a baseball player of limited intellect who has a terminal illness, and his brainier, more skilled teammate. It is film adaptation of the 1956 baseball novel of the same name by Mark Harris. It was previously dramatized in 1956 on the \"U.S. Steel Hour\" with Paul Newman, Albert Salmi and George Peppard. Are we justified in saying that \"Bang the Drum slowly tells the story of 2 baseball players.\"? Yes, no, or maybe? Yes\n###\nThe Joan Ganz Cooney Center (informally, the Cooney Center) is an independent, non-profit, non-partisan research and innovation group founded by Sesame Workshop in order to advance children\u2019s literacy skills and foster innovation in children\u2019s learning through digital media. Are we justified in saying that \"The Joan Ganz Cooney Center is no longer run by the Sesame Workshop.\"? Yes, no, or maybe? Maybe\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"V.S. Reid is a popular Jamaican author.\"? Yes, no, or maybe? Maybe\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) . Are we justified in saying that \"The first academic lesson started in 1994.\"? Yes, no, or maybe? No\n###\nIn theoretical physics, particularly in discussions of , Mach's principle (or Mach's conjecture) is the name given by Einstein to an imprecise hypothesis often credited to the physicist and philosopher Ernst Mach. 
The idea is that local inertial frames are determined by the large scale distribution of matter, as exemplified by this anecdote: Are we justified in saying that \"Mach did not name Mach's principle.\"? Yes, no, or maybe?", "doc_id": 200, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33514, 29924, 8806, 37197], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "This is a list of English and Spanish language placenames in the Falkland Islands. Most of the Spanish language names are quite different in origin to their English equivalents, and many have religious resonances. Some names were given by the Spanish \"conquistadores\", while others were given later by the Argentine government. Are we justified in saying that \"This is a list of English and Spanish language placenames in the United States\"? Yes, no, or maybe? No\n###\nThameslink and Great Northern are the brand names used by the Govia Thameslink Railway train operating company on the Thameslink and Great Northern routes of the Thameslink, Southern and Great Northern franchise, previously operated by First Capital Connect. Are we justified in saying that \"It was previously operated by a company that starts with an F\"? Yes, no, or maybe? Yes\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521. Are we justified in saying that \"The census occurred four years prior to 2015.\"? Yes, no, or maybe? Yes\n###\nHannah Kate Whelan (born 1 July 1992) is a retired British artistic gymnast who competed at the 2008 Summer Olympics and the 2012 Summer Olympics. Whelan won three European Championships medals and four British senior national titles, and was the bronze medallist in the all-around at the 2014 Commonwealth Games. Are we justified in saying that \"Hannah Kate Whelan is born in the year of the Monkey, according to the chinese zodiac\"? Yes, no, or maybe? Yes\n###\nUncommon Danger is the second novel by British thriller writer Eric Ambler, published in 1937. In his autobiography, \"Here Lies\", Ambler explains the original title was \"Background To Danger\", but his British publisher disliked the word 'background', so it was published in all English-speaking countries except the US as \"Uncommon Danger\". Are we justified in saying that \"Eric Ambler would have preferred his second novel to be titled \"Background To Danger\".\"? Yes, no, or maybe?", "doc_id": 399, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10289, 44537, 17264, 29046], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hideki Kamiya (\u795e\u8c37 \u82f1\u6a39 , Kamiya Hideki , born December 19, 1970) is a video game designer and director working for PlatinumGames. Kamiya was formerly employed by Capcom and Clover Studio, and founded PlatinumGames in 2006, along with other former Capcom staff. Are we justified in saying that \"Hideki Kamiya does not like being a game designer\"? Yes, no, or maybe? Maybe\n###\nHoodlum is a 1997 American crime drama film that gives a fictionalized account of the gang war between the Italian/Jewish mafia alliance and the Black gangsters of Harlem that took place in the late 1920s and early 1930s. The film concentrated on Ellsworth \"Bumpy\" Johnson (Laurence Fishburne), Dutch Schultz (Tim Roth), and Lucky Luciano (Andy Garc\u00eda). Are we justified in saying that \"Hoodlum is a crime drama film from 1997 and is about a fictional gang war between the Italian/Jewish mafia alliance and the black gangsters in Harlem, and is set in the late \"? Yes, no, or maybe? Maybe\n###\nThe Eolica Sarichioi Wind Farm is a proposed wind power project in Sarichioi, Tulcea County, Romania. It will consist of eight individual wind farms connected together. It will have 51 individual wind turbines with a nominal output of around 2 MW which will deliver up to 102 MW of power, enough to power over 66,700 homes, with a capital investment required of approximately US$110 million. Are we justified in saying that \"Over 70,000 homes will benefit from the Eolica Sarichioi Wind Farm project.\"? Yes, no, or maybe? No\n###\nNational Security is a 2003 action comedy film, directed by Dennis Dugan, starring Martin Lawrence and Steve Zahn. In addition to Lawrence and Zahn, \"National Security\" boasts an additional cast of Bill Duke, Eric Roberts, Colm Feore, Matt McCoy, and others. Are we justified in saying that \"Dennis Dugan was very proud of the movie National Security.\"? Yes, no, or maybe? Maybe\n###\n\"Toi, la musique et moi\" (English translation: \"You, the Music and I\") was the Monegasque entry in the Eurovision Song Contest 1976, performed in French by French singer Mary Christy. Christy recorded the song in five languages; French, Italian (as \"La musica e noi due\"), Spanish (\"La m\u00fasica, t\u00fa y yo\"), German (\"Die Musik und ich\") and English (\"Thank You for Rushing into My Life\"). Are we justified in saying that \"Christy performed a song in German for Eurovision 1976.\"? Yes, no, or maybe?", "doc_id": 13, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4683, 9046, 38601, 4916], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Corey Gibson, known professionally as Corey Chorus, is an American songwriter, record producer, vocal producer, sound engineer and publisher, known for having written songs such as Cheers (Drink to That) of Rihanna, Chica Bomb by Dan Balan, Made in the USA by Demi Lovato. Are we justified in saying that \"He is know as Corey Chorus because he is an American songwriter.\"? Yes, no, or maybe? Maybe\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts. Are we justified in saying that \"changeable phacelia is the most common borage family plant in Baja California\"? Yes, no, or maybe? Maybe\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label. Are we justified in saying that \"Bad Company later made music that was slightly pop influenced\"? Yes, no, or maybe? Maybe\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles. Are we justified in saying that \"The 1970 Swedish open was held in 1969\"? Yes, no, or maybe? No\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band. Are we justified in saying that \"Metropolis Records is the only record label to release their album.\"? Yes, no, or maybe?", "doc_id": 738, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16571, 32520, 9838, 12235], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Left Hand Spring was a well-known watering stop on the old Chisholm Trail in present-day Blaine County, Oklahoma. The spring was named for \"Left Hand\", an Arapaho chief. Jesse Chisholm died there in 1868 and is buried nearby. His grave is marked with a granite historical marker. Are we justified in saying that \"Left Hand Spring isn't located in Canada.\"? Yes, no, or maybe? Yes\n###\nResil B. Mojares is a Filipino ambassador, historian, and critic of Philippine literature. 
He has a Ph.D. in Literature from the University of the Philippines, Diliman. A retired Professor at the University of San Carlos (USC) in Cebu City, he was a founding director (1975\u201396) of USC's Cebuano Studies Center, a pioneering local studies center in the Philippines. Are we justified in saying that \"Resil B. Mojares got his degree from a prestigious university\"? Yes, no, or maybe? Maybe\n###\nThe New York Red Bulls II are an American professional soccer team based in Harrison, New Jersey. They are the New York Red Bulls reserve team that plays in the United Soccer League (USL), one of two second-tier leagues in the American soccer pyramid. Are we justified in saying that \"The Bulls sometimes play in the minor leagues\"? Yes, no, or maybe? Maybe\n###\nHedera helix (common ivy, English ivy, European ivy, or just ivy) is a species of flowering plant in the family Araliaceae, native to most of Europe and western Asia. A rampant, clinging evergreen vine, it is a familiar sight in gardens, waste spaces, on house walls, tree trunks and in wild areas across its native habitat. Are we justified in saying that \"Hedera helix is seen in parts on europe\"? Yes, no, or maybe? Yes\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"A model who represented Prague for Miss Universe 2006 didn't win the title.\"? Yes, no, or maybe?", "doc_id": 433, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26309, 3919, 21476, 37363], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Die Antwoord (] , Afrikaans for \"The Answer\") is a South African hip hop group formed in Cape Town in 2008. It comprises rappers Ninja and Yolandi Visser and producer God (formerly DJ Hi-Tek). Their image revolves around the South African counterculture movement known as zef and has incorporated work by other artists associated with the movement, such as photographer Roger Ballen. Are we justified in saying that \"Die Antwoord is a South African hip hop group formed in Cape Town in 2002\"? Yes, no, or maybe? No\n###\nEmmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\". Are we justified in saying that \"Two Lovers and a Bear won an award for Emmanuel. He was also at the first awards ceremony and picked up his first win. He won Best Art Direction or Production Design in 2017 and 2013 which was the award ceremonies first season.\"? Yes, no, or maybe? 
Yes\n###\nAodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\". Are we justified in saying that \" Aodh Mac Cathmhaoil disliked writing immensely\"? Yes, no, or maybe? Maybe\n###\nLike the Roman: The Life of Enoch Powell is a 1998 book by the English writer Simon Heffer. It is a biography of the politician Enoch Powell. The title is taken from Powell's 1968 Rivers of Blood speech when Powell quoted Virgil's \"Aeneid\": \"As I look ahead, I am filled with foreboding; like the Roman, I seem to see the River Tiber foaming with much blood\". Are we justified in saying that \"Enoch Powell was a great man.\"? Yes, no, or maybe? Maybe\n###\nPeeya Rai Chowdhary is an Indian actress. Peeya Rai was married to model Shayan Munshi in 2006, but separated from him in 2010. She played Lakhi in Gurinder Chadha's \"Bride and Prejudice,\" Rita in the movie \"The Bong Connection\" (where she worked with husband Munshi) and played \"Kiran\" in the TV show \"Hip Hip Hurray\". She studied at National College, Mumbai. Are we justified in saying that \"Peeya Rai was not married to Munshi while she was in the TV show Hip Hip Hurray.\"? Yes, no, or maybe?", "doc_id": 237, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5973, 36706, 39445, 37706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Middlewich Folk and Boat Festival takes place in June in Middlewich, Cheshire, England. The festival builds on the town's industrial heritage in which canal boats were used to move coal and other raw materials in the town for the production of salt, and then move the salt out of town, either for use directly, or as a raw material in the manufacture of chemicals such as chlorine and soda ash. Are we justified in saying that \"The Middlewich Folk and Boat Festival began after the Second World War\"? Yes, no, or maybe? Maybe\n###\nFaer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting. Are we justified in saying that \"Dungeons and dragons is the best game ever\"? Yes, no, or maybe? Maybe\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. 
Are we justified in saying that \"Staunton Mall is the most popular mall in Augusta County.\"? Yes, no, or maybe? Maybe\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class. Are we justified in saying that \"Young people are happy in Guatemala. \"? Yes, no, or maybe? Maybe\n###\nThe Mannlicher\u2013Sch\u00f6nauer (sometimes Anglicized as \"Mannlicher Schoenauer,\" Hellenized as \u03a4\u03c5\u03c6\u03ad\u03ba\u03b9\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1 or \u038c\u03c0\u03bb\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1-\u03a3\u03b5\u03bd\u03ac\u03bf\u03c5\u03b5\u03c1) is a type of rotary-magazine bolt-action rifle produced by Steyr Mannlicher for the Greek Army in 1903 and later was also used in small numbers by the Austro-Hungarian armies. Post war use was for civilian use such as hunting and target practice. Are we justified in saying that \"The Mannlicher\u2013Sch\u00f6nauer rifle is a popular rifle, used by modern military forces.\"? Yes, no, or maybe?", "doc_id": 183, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36827, 35375, 43416, 19614], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Inbetweener\" is a song by English Britpop band Sleeper, written by the band's vocalist and guitarist Louise Wener. It was the third single to be released from their debut album \"Smart\" in 1995 (see 1995 in British music). It was their breakthrough single, Are we justified in saying that \"The song Inbetweener by English Britpop band Sleeper is over 3 years old\"? Yes, no, or maybe? Yes\n###\nGuy Nicholson Turnbow (March 28, 1908 \u2013 October 4, 1975) was an American football tackle who played two seasons with the Philadelphia Eagles of the National Football League. He played college football at the University of Mississippi and attended Brookhaven High School in Brookhaven, Mississippi. Are we justified in saying that \"Guy played football at Brookhaven High School.\"? Yes, no, or maybe? Maybe\n###\nSystem of a Down is the debut studio album by Armenian-American metal band System of a Down, released on June 30, 1998, by American Recordings and Columbia Records. The album was later certified gold by the Recording Industry Association of America on February 2, 2000. Two years later, after the success of System of a Down's next album, \"Toxicity\", the album was certified platinum. Are we justified in saying that \"The album was released in the sixth month.\"? Yes, no, or maybe? Yes\n###\nGeorge Corrie (born 16 September 1973) is an English footballer, born in Workington, who played for ten years as a midfielder for American USL Second Division side Wilmington Hammerheads, of which he was the captain. 
He joined the Hammerheads in 1999 after six seasons with Conference North team Workington A.F.C.. Are we justified in saying that \"George Corrie is from Europe\"? Yes, no, or maybe? Yes\n###\nThe Basketbowl was a college basketball game between Michigan State University and the University of Kentucky held on December 13, 2003 at Ford Field, a domed American football stadium in Detroit, Michigan. Kentucky won the game 79\u201374, never trailing throughout the contest. Are we justified in saying that \"Kentucky won the game with a 5 point lead \"? Yes, no, or maybe?", "doc_id": 576, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17279, 16594, 11192, 40285], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Little Casterton is a small village and civil parish in Rutland, England. The population of the civil parish at the 2001 census was 148, increasing to 218 at the 2011 census. It is about two miles (3 km) north of Stamford on a minor road that runs to the south of the River Gwash between Great Casterton and Ryhall. Are we justified in saying that \"Little Casterton's population increased due to natural births.\"? Yes, no, or maybe? Maybe\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents. Are we justified in saying that \"The population has since increased due to the government trying to settle the people there\"? Yes, no, or maybe? Maybe\n###\nThe Holiday Bowl is a post-season NCAA Division I Football Bowl Subdivision college football bowl game that has been played annually since 1978 at Qualcomm Stadium in San Diego, California, United States. Since the 2014 edition, it has featured a matchup of Pac-12 and Big Ten teams. Are we justified in saying that \"The Pac-12 team has always won the Holiday Bowl.\"? Yes, no, or maybe? Maybe\n###\n\"Girl in a Country Song\" is the debut single by American country music duo Maddie & Tae, co-written with Aaron Scherz and released in July 2014. The song is an answer to the \"bro-country\" subgenre in contemporary country music, specifically in how women are portrayed by men, with lyrics containing references to a variety of popular recent country songs. Are we justified in saying that \"The bro-country sub genre of country music is sexist.\"? Yes, no, or maybe? Maybe\n###\nGerard A. \"Gerry\" Salton (8 March 1927 in Nuremberg \u2013 28 August 1995), was a Professor of Computer Science at Cornell University. Salton was perhaps the leading computer scientist working in the field of information retrieval during his time, and \"the father of information retrieval\". His group at Cornell developed the SMART Information Retrieval System, which he initiated when he was at Harvard. Are we justified in saying that \"Salton was the only professor at Cornell.\"? 
Yes, no, or maybe?", "doc_id": 512, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13586, 39872, 19270, 30091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Ligier JS17 was a Formula One car designed by G\u00e9rard Ducarouge and Michel Beaujon for use by the Ligier team during the season. Powered by a Talbot-badged Matra V12, the JS17 was driven to two Grand Prix wins by Jacques Laffite. It was updated to JS17B specification for the season until it was replaced later that year by the JS19. Are we justified in saying that \"Jacques Laffite was a dual champion.\"? Yes, no, or maybe? Yes\n###\nBusby is a census-designated place (CDP) in Big Horn County, Montana, United States. It is on the Northern Cheyenne reservation. The population was 745 at the 2010 census. The town is near the site of the Battle of the Rosebud and the associated Rosebud Battlefield State Park, where General George Custer forces encountered Sioux and Cheyenne forces led by Crazy Horse. Are we justified in saying that \"rosebud battlefield state park is big\"? Yes, no, or maybe? Maybe\n###\nFuhrmann & Schmidt Brewing Company was formed in 1906 and was located at Commerce and Washington Streets in Shamokin, Pennsylvania. Fuhrmann & Schmidt was the successor company to the Eagle Brewing Company (1854 \u2013 1878), the M. Markel & Company (1878 \u2013 1893) and Phillip H Fuhrmann (1893 \u2013 1906). Are we justified in saying that \"Fuhrmann & Schmidt Brewing Company has a tasty selection of beer.\"? Yes, no, or maybe? Maybe\n###\nSanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\"). Are we justified in saying that \"Sanation was a french political movement\"? Yes, no, or maybe? No\n###\nRobert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983. Are we justified in saying that \"Robert L. \"Rusty\" White was born less than 6666 days ago.\"? Yes, no, or maybe?", "doc_id": 956, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23544, 40833, 4610, 9024], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Petasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Petasites is the least well known genus of plants in the sunflower family.\"? Yes, no, or maybe? Maybe\n###\nSalli Elise Richardson (born November 23, 1967) is an American television and film actress and director. Richardson is known for her role as Angela on the 1994 hit comedy/action film \"A Low Down Dirty Shame\" and for her role as Dr. Allison Blake on the Syfy comedy-drama series \"Eureka\" (2006\u20132012). Are we justified in saying that \"Salli Elise Richardson starred in the Syfy comedy-drama series \"Eureka\" (2006\u20132012)\"? Yes, no, or maybe? Yes\n###\nDanville is an unincorporated community and census-designated place (CDP) in Ferry County, Washington, United States. Danville is located on Washington State Route 21 near the Canada\u2013United States border, 31 mi north-northeast of Republic, the Ferry County seat. Danville has a post office with ZIP code 99121. The population at the 2010 census was 34. Are we justified in saying that \"Danville is an incorporated community and census-designated place in the US. \"? Yes, no, or maybe? No\n###\n1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire. Are we justified in saying that \"The American version of the show is the most popular. \"? Yes, no, or maybe? Maybe\n###\nThe Sheshan Basilica, officially the National Shrine and Minor Basilica of Our Lady of Sheshan () and also known as Basilica of Mary, Help of Christians is a prominent Roman Catholic church in Shanghai, China. Its common name comes from its location on the western peak of Sheshan Hill, located in Songjiang District, to the west of Shanghai's metropolitan area. Are we justified in saying that \"The Sheshan Basilica is located in a country next to India.\"? Yes, no, or maybe?", "doc_id": 46, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31563, 15691, 30700, 22342], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Circus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. 
The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Critics felt the movie deserved the two Israeli Film Academy Awards it did not win.\"? Yes, no, or maybe? Maybe\n###\nPrinceton Junction is a railroad station in Princeton Junction, New Jersey, located in West Windsor Township. It serves NJ Transit (NJT) and Amtrak on the Northeast Corridor (NEC), and NJ Transit on the Princeton Branch. The station's Amtrak station code is PJC. Are we justified in saying that \"Princeton Junction has a railroad station called Princeton Junction in New Jersey that goes to Princeton.\"? Yes, no, or maybe? Maybe\n###\nFraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator. Are we justified in saying that \"Wishart received a scholarship to play football.\"? Yes, no, or maybe? Maybe\n###\nCari Elizabeth Roccaro (born July 18, 1994) is an American soccer defender from East Islip, New York. She currently plays for the United States under-20 women's national soccer team and helped the team win the 2012 FIFA Under-20 Women's World Cup held in Tokyo, Japan. She previously played for the New York Fury in the WPSL Elite. Are we justified in saying that \"She has no friends\"? Yes, no, or maybe? Maybe\n###\nSilver Bow County is a county in the State of Montana. As of the 2010 census, the population was 34,200. Its county seat is Butte. In 1977, the city and county governments consolidated to form the single entity of Butte-Silver Bow. Additionally, the town of Walkerville is a separate municipality from Butte and is within the county. Are we justified in saying that \"Montana is made up of Walkerville, along with additional entities.\"? Yes, no, or maybe?", "doc_id": 921, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21970, 9258, 38092, 32694], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Penthouse is a 1933 American Pre-Code crime film starring Warner Baxter as a lawyer and Myrna Loy, as a call girl who helps him with a murder case. It was directed by W. S. Van Dyke and written by Frances Goodrich and Albert Hackett, based on a novel by Arthur Somers Roche. The film was later remade as the more sanitized \"Society Lawyer\" (1939), without the risqu\u00e9 pre-Code dialogue. Are we justified in saying that \"Penthouse was later released on dvd.\"? Yes, no, or maybe? Maybe\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. 
Are we justified in saying that \"Keystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States in the central time zone.\"? Yes, no, or maybe? Maybe\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\". Are we justified in saying that \"Paint It Black is the first song on the album.\"? Yes, no, or maybe? Yes\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line. Are we justified in saying that \"The Takoma Langle Crossroads Transit Center is located in Maryland, which is in the East of the US. It is a very large bus transfer.\"? Yes, no, or maybe? Maybe\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse. Are we justified in saying that \"You can visit Storybrooke, Maine \"? Yes, no, or maybe?", "doc_id": 188, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16907, 8443, 37808, 44104], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ashcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"New York's most populated county is Ontario County\"? Yes, no, or maybe? Maybe\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals. Are we justified in saying that \"Udinese Calcio won the finals on Serie A\"? Yes, no, or maybe? No\n###\nThe Sisters of Mercy are an English gothic rock band, formed in 1980 in Leeds, United Kingdom (UK). After achieving early underground fame there, the band had their commercial breakthrough in mid-1980s and sustained it until the early 1990s, when they stopped releasing new recorded output in protest against their record company Time Warner. Currently, the band is a touring outfit only. 
Are we justified in saying that \"After achieving early underground fame,the Sisters of Mercy band is a touring outfit only.\"? Yes, no, or maybe? Yes\n###\nThe Cabinet of Dr. Caligari is a 2005 American independent film, and a remake of the 1920 silent film of the same name. It was directed by David Lee Fisher and released in the U.S. at the ScreamFest Film Festival on October 22, where it won three prizes: the Audience Choice Award, Best Cinematography and Best Special Effects. Are we justified in saying that \"ScreamFest is an independent film festival.\"? Yes, no, or maybe? Maybe\n###\nWarriors of Virtue is a 1997 Chinese-American martial arts fantasy film directed by Ronny Yu and starring Angus Macfadyen, Mario Yedidia, and Marley Shelton. It was released in English, Mandarin and Cantonese-language versions. The creature effects were designed by Academy Award-nominated special effect production house Alterian, Inc. Are we justified in saying that \"Alterian Inc have been nominated for many awards in the past.\"? Yes, no, or maybe?", "doc_id": 250, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32206, 1878, 43650, 22888], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mystery-Bouffe (Russian: \u041c\u0438\u0441\u0442\u0435\u0440\u0438\u044f-\u0411\u0443\u0444\u0444 ; Misteriya-Buff) is a socialist dramatic play written by Vladimir Mayakovsky in 1918/1921. Mayakovsky stated in a preface to the 1921 edition that \"in the future, all persons performing, presenting, reading or publishing \"Mystery-Bouffe\" should change the content, making it contemporary, immediate, up-to-the-minute.\" Are we justified in saying that \"Mystery Bouffe will be made into a movie in 2021\"? Yes, no, or maybe? Maybe\n###\nSaat Din Mohabbat In (English: \"Seven days in love\" ) is an upcoming Pakistani romantic drama film directed by Meenu-Farjad, produced by Dawn Films and IMGC Global Entertainment and written by Fasih Bari Khan. The film features Mahira Khan and Sheheryar Munawar in lead roles and is also their second mutual film after \"Ho Mann Jahaan\". Are we justified in saying that \"Khan and Munawar have been in more than one film together.\"? Yes, no, or maybe? Yes\n###\nCarolyn Keene is the pseudonym of the authors of the Nancy Drew mystery stories and The Dana Girls mystery stories, both produced by the Stratemeyer Syndicate. In addition, the Keene pen name is credited with the Nancy Drew spin-off, \"River Heights and the Nancy Drew Notebooks. Are we justified in saying that \"The Nancy Drew series were not written by a woman named Carolyn Keene\"? Yes, no, or maybe? Yes\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015. Are we justified in saying that \"The 1985 Nebraska Cornhuskers played only in Texas.\"? Yes, no, or maybe? No\n###\nThe Dr. 
Samuel D. Mercer House was built in 1885 at 3920 Cuming Street in the historic Walnut Hill neighborhood of Omaha, Nebraska, United States. Samuel Mercer was the chief surgeon of the Union Pacific Railroad, and the founder of Omaha's first hospital. Are we justified in saying that \"Construction of the Mercer house involved people who build things.\"? Yes, no, or maybe?", "doc_id": 131, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40479, 6990, 39153, 15788], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amanda Knox is a 2016 American documentary film about Amanda Knox, twice convicted and later acquitted of the 2007 murder of Meredith Kercher, directed by Rod Blackhurst and Brian McGinn. It premiered at the Toronto International Film Festival on September 10, 2016 and on Netflix on September 30, 2016. Are we justified in saying that \"Amanda Knox killed Meredith Kercher.\"? Yes, no, or maybe? Maybe\n###\nOgallala is a city in Keith County, Nebraska, United States. The population was 4,737 at the 2010 census. It is the county seat of Keith County. In the days of the Nebraska Territory, the city was a stop on the Pony Express and later along the transcontinental railroad. The Ogallala Aquifer was named after the city. Are we justified in saying that \"Ogallala was founded over a hundred years ago.\"? Yes, no, or maybe? Maybe\n###\n\"May the Bird of Paradise Fly Up Your Nose\" is a 1965 novelty song performed by Little Jimmy Dickens. It was Dickens' most successful single on the U.S. country music chart. It spent two weeks at No. 1 that November, and stayed on the chart for a total of 18 weeks. On the overall \"Billboard\" Hot 100 the song peaked at No. 15. Are we justified in saying that \"\"May the Bird of Paradise Fly Up Your Nose\" never made the top 10 of \"Billboard\" Hot 100\"? Yes, no, or maybe? Yes\n###\nCardinal Newman College is a Catholic sixth form college close to the centre of Preston. The college was graded \"\"outstanding\"\" by Ofsted in May 2009. The college was then granted \"\"Beacon college\"\" status by the Learning and Skills Improvement Service in November 2010. Are we justified in saying that \"Cardinal Newman College was recognized for being outstanding in 2009.\"? Yes, no, or maybe? Yes\n###\nMelbourne Heart FC Futsal was a futsal club based in Melbourne, Victoria, founded in 2012. They played in the F-League, the top tier of Australian Futsal. The club was disbanded before the start of the 2014 season after the A-League team were bought by Manchester City FC. Are we justified in saying that \"Melbourne Heart FC Futsal was created before the A-League team.\"? Yes, no, or maybe?", "doc_id": 899, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35981, 27132, 15519, 38194], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Regent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day. Are we justified in saying that \"Regent Power plans to expand its production in 2021.\"? Yes, no, or maybe? Maybe\n###\nLiberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD). Are we justified in saying that \"Liberal Citizens Action emerged from the Liberal Federation.\"? Yes, no, or maybe? Yes\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles. Are we justified in saying that \"The movie was a box office success\"? Yes, no, or maybe? Maybe\n###\nShadowgun Legends is an upcoming first-person shooter video game developed and published by Madfinger Games for Android and iOS devices. It is the 3rd primary installment of the Shadowgun series, a sequel to the original Shadowgun and Shadowgun Deadzone, both being multiple award-winning games from 2011 and 2012 respectively. Are we justified in saying that \"Shadowgun Legends is a famous video game. \"? Yes, no, or maybe? Maybe\n###\nEnrique Leff (born Mexico, 1946) is a Mexican economist, who defines himself today as an environmental sociologist and environmentalist. He has written 25 books and 180 articles on political ecology, environmental sociology, environmental economics, environmental epistemology and environmental education. He is regarded as one of the key environmental thinkers in Latin America. Are we justified in saying that \"Enrique Leff created Latin America with his 25 books that he wrote\"? Yes, no, or maybe?", "doc_id": 748, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [45187, 1150, 19565, 23813], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fernande Olivier (born Am\u00e9lie Lang; 6 June 1881\u201326 January 1966) was a French artist and model known primarily for having been the model of painter Pablo Picasso, and for her written accounts of her relationship with him. 
Picasso painted over 60 portraits of Olivier. Are we justified in saying that \"Pablo Picassa had a relationship with Fernande Olivier and was said to have painted over 90 portraits of her.\"? Yes, no, or maybe? No\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall. Are we justified in saying that \"People protested turning the fortress into a national museum. \"? Yes, no, or maybe? Maybe\n###\nHarbour Place Shopping Centre (Irish: \"An Chuain Pl\u00e1s Ionad Siopad\u00f3ireachta\" ) is a shopping centre located in Mullingar, Ireland. The centre is anchored by Dunnes Stores, and it is overall the largest store in the shopping centre. It is one of the most well-known shopping centres in Mullingar, and one of the busiest in the town. Are we justified in saying that \"Harbour Place Shopping Centre is a shopping centre.\"? Yes, no, or maybe? Yes\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups. Are we justified in saying that \"Mohamed Izzadeen Mohamed Naufer plays left wing roughly twice as much as he plays right wing.\"? Yes, no, or maybe? Maybe\n###\nIn the fall of 1997, Elton John set out on tour to promote his latest album \"The Big Picture\" with the Big Picture Tour. The album was a commercial success reaching No. 9 on the US \"Billboard\" 200 and No. 3 on the UK Albums Chart. The 1997 tour started off in North America and ended in Europe. Are we justified in saying that \"Elton John starts with a B.\"? Yes, no, or maybe?", "doc_id": 608, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27090, 20955, 8324, 22687], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Yahoo Serious (born 27 July 1953), born Greg Gomez Pead (name-change by deed poll in 1980), is an Australian film actor, director, and score composer. He is best known for his 1988 comedy film \"Young Einstein\". He also created \"Reckless Kelly\" in 1993 and \"Mr. Accident\" in 2000. Serious writes, directs, produces, stars in, and has composed the scores for his movies. Are we justified in saying that \"Yahoo Serious is a bit of a silly man\"? Yes, no, or maybe? Maybe\n###\nPaul Revere ( ; December 21, 1734 O.S.May 10, 1818) was an American silversmith, engraver, early industrialist, and Patriot in the American Revolution. 
He is best known for his midnight ride to alert the colonial militia in April 1775 to the approach of British forces before the battles of Lexington and Concord, as dramatized in Henry Wadsworth Longfellow's poem, \"Paul Revere's Ride\" (1861). Are we justified in saying that \"Paul Revere never wore pajamas.\"? Yes, no, or maybe? Maybe\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999. Are we justified in saying that \"Daoud Abdel Sayed is not translated to \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\"? Yes, no, or maybe? No\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty. Are we justified in saying that \"Brown tried to cover up the crime.\"? Yes, no, or maybe? Maybe\n###\nCofield Mundi is a South African singer and songwriter born in Johannesburg, South Africa. Raised in a musical family, she began singing and performing from a young age and wrote her first song at the age of 12. Her aunt is South African born actress and singer Jill Kirkland, famous for her role in the movie \"Katrina\". Are we justified in saying that \"South Africa was too confining for Mundi\"? Yes, no, or maybe?", "doc_id": 324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27678, 20212, 3300, 8648], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wonders of the Universe is a 2011 book by the theoretical physicists Brian Cox and Andrew Cohen. The book is about cosmology and the universe, and is explained in a way that is accessible to a general reader. The book is based on a series with the same name \"Wonders of the Universe\". Are we justified in saying that \"Wonders of The Universe had many seasons\"? Yes, no, or maybe? Maybe\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) . Are we justified in saying that \"Sabanc\u0131 University is more the 20 KM away from the Istanbul's city center.\"? Yes, no, or maybe? Yes\n###\nHarold Buttleman, Daredevil Stuntman (also known as Buttleman) is a 2003 film written and directed by Francis Stokes; the only movie he has directed. It won the Jury Prize at the Deep Ellum Film Festival in 2003. It was awarded the audience award in the Had to Be Made Film Festival in 2005. 
Are we justified in saying that \"Harold Buttleman, Daredevil Stuntman won awards two years apart.\"? Yes, no, or maybe? Yes\n###\nStephen Tyrone Colbert ( , ; born May 13, 1964) is an American comedian, television host, actor, and writer. He is best known for hosting the satirical Comedy Central program \"The Colbert Report\" from 2005 to 2014, and hosting the CBS talk program \"The Late Show with Stephen Colbert\" beginning in September 2015. Are we justified in saying that \"Tyrone is the middle name of Colbert.\"? Yes, no, or maybe? Yes\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery. Are we justified in saying that \"The Anchor Bankside has been a pub for 800 years.\"? Yes, no, or maybe?", "doc_id": 999, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7711, 9492, 32137, 33695], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Uni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats. Are we justified in saying that \"Uni\u00f3n Deportiva Vall de Ux\u00f3 is a popular team in Spain\"? Yes, no, or maybe? Maybe\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley. Are we justified in saying that \"Bernard often wrote tragedies \"? Yes, no, or maybe? Maybe\n###\nPrincess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement. Are we justified in saying that \"Princess Caroline died when she was 1 year old\"? Yes, no, or maybe? No\n###\nIn the United States, a utilities commission, utility regulatory commission (URC), public utilities commission (PUC) or public service commission (PSC) is a governing body that regulates the rates and services of a public utility. In some cases, government bodies with the title \"public service commission\" may be civil service oversight bodies, rather than utilities regulators. Are we justified in saying that \"The utilities feature will continue to be upgraded to better suit the needs of everyone.\"? Yes, no, or maybe? 
Maybe\n###\nJames Montgomery (born May 12, 1949) is an American blues musician, best known as the lead singer, blues harp player, frontman, and bandleader of The James Montgomery Blues Band (a.k.a. The James Montgomery Band). Montgomery collaborates with many star performers and recording artists. He is also the past President of The New England Blues Society. Are we justified in saying that \"James Montgomery was born on an odd day\"? Yes, no, or maybe?", "doc_id": 726, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36778, 22728, 25045, 28336], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Westbury Senior High School is a ninth-through-12th-grade school of the Westbury Union Free School District (USFD No. 1), the district covering the village of Westbury, New York, USA. Its current building, located on Post Road at Jericho Turnpike in Old Westbury, New York (just north of Westbury Village), reached its 50th anniversary in 2008. Are we justified in saying that \"Westbury Senior High School is a very dirtyschool\"? Yes, no, or maybe? Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"The Six-Day War resulted in a lot of ammo used.\"? Yes, no, or maybe? Maybe\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University. Are we justified in saying that \"Falwell was a member of The Old Time Gospel Hour Quartet.\"? Yes, no, or maybe? No\n###\nGlenn Martin Christopher Francis Quinn (May 28, 1970 \u2013 December 3, 2002) was an Irish actor in television and film, known for playing Mark Healy in the American sitcom \"Roseanne\", and Doyle, a half-demon, on \"Angel\", a spin-off series of \"Buffy the Vampire Slayer\". Are we justified in saying that \"Glenn Martin Christopher Francis Quinn is an Irish actor known for playing a half-demon, half-angel.\"? Yes, no, or maybe? Maybe\n###\nThe 1999 Acura Classic \u2013 Doubles was the doubles event of the twentieth edition of the third tournament in the US Open Series. Martina Hingis and Natasha Zvereva were the defending champions but Hingis did not compete this year. Zvereva played with Mary Pierce, and they were defeated in the first time by Cara Black and Irina Selyutina. 
Are we justified in saying that \"Irina Selyutina won the doubles\"? Yes, no, or maybe?", "doc_id": 347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35520, 27209, 11956, 41435], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USS \"Christopher\" (DE-100) was a Cannon class destroyer escort built for the U.S. Navy during World War II. She served in the Atlantic Ocean and provided escort service against submarine and air attack for Navy vessels and convoys. She was named for a Navy Cross recipient, Harold Jensen Christopher, who was killed at Pearl Harbor aboard on 7 December 1941. Are we justified in saying that \"Christopher was killed at Pearl Harbor in Nov 1941\"? Yes, no, or maybe? No\n###\nPiedmont Avenue is a major thoroughfare in Atlanta, beginning in Downtown Atlanta and ending at its continuation as Piedmont Road (Georgia State Route 237) just before crossing under Interstate 85. Along the way, Piedmont Avenue passes through Midtown Atlanta where several historic properties are located on the street. Are we justified in saying that \"Piedmont Avenue passes through all areas of Atlanta.\"? Yes, no, or maybe? Maybe\n###\nDavid Krakauer (born September 22, 1956) is an American clarinetist raised and based in New York, NY. He is known for his work in klezmer music as well as classical music and avant-garde improvisation. He is also considered an accomplished jazz player. Are we justified in saying that \"David Krakauer starts with D.\"? Yes, no, or maybe? Yes\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008. Are we justified in saying that \"Laura Elena Z\u00fa\u00f1iga Huizar was born in the same decade as \"Miss Bala\" was released.\"? Yes, no, or maybe? No\n###\nWilliam V. Bidwill Sr. (born July 31, 1931) is the principal owner and chairman of the board of the Arizona Cardinals of the National Football League (NFL). He was co-owner from 1962 for ten seasons with his brother Charles Jr. and has been sole owner since 1972. Are we justified in saying that \"William V. Bidwill Sr. had more than one brother\"? Yes, no, or maybe?", "doc_id": 10, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14741, 8612, 3949, 19599], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards. Are we justified in saying that \"Jake Deckard not only won Performer of the Year in 2008, he also got married.\"? Yes, no, or maybe? Maybe\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts. Are we justified in saying that \"There were plans to broadcast the British sitcom Drifters in 2012 but it was delayed until 2013.\"? Yes, no, or maybe? Maybe\n###\nThe Arboretum Curie, also known as the Arboretum du Col des Trois Soeurs, is a small arboretum located at 1470 metres altitude in the Col des Trois Soeurs near La Panouse, Loz\u00e8re, Languedoc-Roussillon, France. It was created circa 1975 to study conifers suitable for reforestation, and according to Arbez et al., now contains 77 taxa (primarily conifers). Are we justified in saying that \"The Arboretum Curie is a very popular tourist attraction in France.\"? Yes, no, or maybe? Maybe\n###\nOtard, also known as Chateau de Cognac, is a French cognac house founded in 1795 by Jean-Baptiste Antoine Otard. The company has remained in the hands of the same family since its establishment. The firm is based in the Ch\u00e2teau des Valois (Ch\u00e2teau de Cognac), Cognac, Charente, its home since 1796. Are we justified in saying that \"Cognac was started in 1794\"? Yes, no, or maybe? No\n###\nGiovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg. Are we justified in saying that \"Giovanni Ferrero is still alive to this day.\"? Yes, no, or maybe?", "doc_id": 886, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33755, 22601, 20572, 18108], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You'll Be Back\" is the seventh song from Act 1 of the musical \"Hamilton\", based on the life of Alexander Hamilton, which premiered on Broadway in 2015. Lin-Manuel Miranda wrote both the music and lyrics to the song. It is sung by Jonathan Groff in the show's original cast recording. Are we justified in saying that \"Hamilton is based on the life of Alexa Hamilton.\"? Yes, no, or maybe? No\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. 
Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th). Are we justified in saying that \"Alice Claeys children compete in figure skating like their mother.\"? Yes, no, or maybe? Maybe\n###\nThe final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"the Lord's Cricket Ground could only fit 2,000 spectators at a time. \"? Yes, no, or maybe? Maybe\n###\nThe Lord of the Rings: The Fellowship of the Ring is a 2001 New Zealand-American epic high fantasy adventure film directed by Peter Jackson based on the first volume of J. R. R. Tolkien's \"The Lord of the Rings\" (1954\u20131955). It is the first installment in \"The Lord of the Rings series\", and was followed by \"\" (2002) and \"\" (2003), based on the second and third volumes of \"The Lord of the Rings\". Are we justified in saying that \"The Lord of the Rings: The Fellowship of the Ring is one of the greatest and most influential fantasy films ever made.\"? Yes, no, or maybe? Maybe\n###\nRecurring was the fourth and final Spacemen 3 studio album, finally released (after considerable delay) in February 1991, some time after the band had broken up. By the time the album was recorded, relations between the band had soured to the extent that the record is in 2 parts - the first side by Peter Kember, and the second by Jason Pierce. Are we justified in saying that \"Recurring was released in nineteen hundred ninety nine.\"? Yes, no, or maybe?", "doc_id": 827, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37833, 35950, 29492, 29128], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Ellens dritter Gesang \" (\"Ellens Gesang III \", D. 839, Op. 52, No. 6, 1825), in English: \"Ellen's Third Song\", was composed by Franz Schubert in 1825 as part of his Opus 52, a setting of seven songs from Walter Scott's popular epic poem \"The Lady of the Lake\", loosely translated into German. Are we justified in saying that \"The song was inspired by a poem.\"? Yes, no, or maybe? Yes\n###\n\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode. Are we justified in saying that \"\"Fight or Flight\" is still running.\"? Yes, no, or maybe? Maybe\n###\nA Bhumka is the term for a traditional herbal healer in the valley of Patalkot, India. 
The valley is mainly home to members of the Bharia and Gond tribes, with 2,000 residents scattered between various villages and hamlets. Tribes people traditionally use herbal medicine, under the auspices of a herbal expert and holy man known as a Bhumka. Are we justified in saying that \"There are only about 2 dealers in the valley\"? Yes, no, or maybe? Maybe\n###\nSophie Tucker (January 13, 1887 \u2013 February 9, 1966) was a Ukrainian-born American singer, comedian, actress, and radio personality. Known for her stentorian delivery of comical and risqu\u00e9 songs, she was one of the most popular entertainers in America during the first half of the 20th century. She was widely known by the nickname \"The Last of the Red Hot Mamas\". Are we justified in saying that \"Ms. Tucker's favorite way to entertain her fans was through her comedy acts.\"? Yes, no, or maybe? Maybe\n###\nBarbro Martinsson (born 16 August 1935) is a former Swedish cross country skier who competed during the 1960s. Born in Valbo, she won two silver medals in the 3 x 5 km at the 1964 Winter Olympics and the 1968 Winter Olympics. Martinsson finished 4th in the 1968 Winter Olympics in both 5 km and 10 km. Are we justified in saying that \"Martinsson no longer skis. \"? Yes, no, or maybe?", "doc_id": 681, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7085, 18222, 21732, 23145], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Phacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland. Are we justified in saying that \"The Joshua tree woodland habitat is an area where a certain species of plant that can be found, the same kind of plant that is in creosote bush scrub.\"? Yes, no, or maybe? Yes\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty. Are we justified in saying that \"A former Detroit Police Chief got involved in a text-messaging scandal.\"? Yes, no, or maybe? Yes\n###\nThe Greensboro Swarm are an American professional basketball team of the NBA G League and an affiliate of the Charlotte Hornets of the National Basketball Association. Based in Greensboro, North Carolina, the team plays their home games at the Greensboro Coliseum Fieldhouse. The team became the eleventh D-League team to be owned by an NBA team. Are we justified in saying that \"The Greensboro Swarm has never been a part of the NBA.\"? Yes, no, or maybe? 
No\n###\nThe interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH). Are we justified in saying that \"IDH gets a lot of criticism\"? Yes, no, or maybe? Maybe\n###\nAdrienne Maloof (born September 4, 1961) is an American businesswoman, television personality, shoe designer and co-owner of the various business holdings of Maloof Companies, which include a 2% stake in the Palms Casino Resort in Las Vegas, Nevada; Maloof Productions, Maloof Music and the annual Maloof Money Cup skateboarding event. Are we justified in saying that \"Adrienne Maloof was born in America.\"? Yes, no, or maybe?", "doc_id": 723, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42814, 11460, 36673, 3945], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "SFU Exchange is a bus terminus for TransLink located on the campus of Simon Fraser University in Burnaby, British Columbia. Opened on September 3, 1965, it serves primarily students, staff, and faculty of Simon Fraser University and residents of UniverCity. Are we justified in saying that \"SFU Exchange is regularly used by members of the Simon Fraser University though it is open to everyone.\"? Yes, no, or maybe? Maybe\n###\nThe 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012. Are we justified in saying that \"Kaley Cuoco is an actress\"? Yes, no, or maybe? Maybe\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles. Are we justified in saying that \"Suntaragaali was 3 hours long.\"? Yes, no, or maybe? Maybe\n###\nCharlotte Marie Pomeline Casiraghi (born 3 August 1986) is the second child of Caroline, Princess of Hanover, and Stefano Casiraghi, an Italian industrialist. She is ninth in line to the throne of Monaco. Her maternal grandparents were Rainier III, Prince of Monaco, and American actress Grace Kelly. She is named after her maternal great-grandmother, Princess Charlotte, Duchess of Valentinois. Are we justified in saying that \"Charlotte Marie Pomeline was born on 3 August 1985\"? Yes, no, or maybe? 
No\n###\nSydney-Denison was an electoral district of the Legislative Assembly in the Australian state of New South Wales, created in 1894 from part of the electoral district of West Sydney in the Ultimo area and named after Governor Denison. It was abolished in 1904 and absorbed into the electoral district of Pyrmont. Are we justified in saying that \"Sydney-Denison was named after the District of West Sydney.\"? Yes, no, or maybe?", "doc_id": 943, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14922, 16365, 482, 2530], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Santa Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"The bricks outside of Santa Lucia are semi-ruinous and grey.\"? Yes, no, or maybe? Maybe\n###\nSvensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord. Are we justified in saying that \"Taylor Swift was one of the musicians who performed at the Svensk Hyllningsfest in 2015.\"? Yes, no, or maybe? Maybe\n###\nThe Mini Hatch, stylized as MINI hatch or MINI Hardtop in the US, also known as Mini Cooper or Mini One or simply the Mini, is a three-door hatchback first introduced in late 2000, with a second generation launched in 2006 and a third generation model launched in 2014. A convertible version was introduced in 2004, with the second generation following in 2008. Are we justified in saying that \"A Mini Cooper and Mini One are the same vehicle as the Mini Hatch.\"? Yes, no, or maybe? Yes\n###\n\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"Bosch will only run for 2 more seasons.\"? Yes, no, or maybe? Maybe\n###\nNosopsyllus fasciatus, the northern rat flea, is a species of flea found on domestic rats and house mice. Northern rat fleas are external parasites, living by hematophagy off the blood of rodents. It is the most widely spread of its genus, having originated in Europe, but has been transported to temperate regions all over the world. Are we justified in saying that \"Nosopsyllus fasciatus is part of domestic rats.\"? 
Yes, no, or maybe?", "doc_id": 422, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29416, 19970, 29066, 22402], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m). Are we justified in saying that \"Sonnette is an unincorporated community in west central Powder River County, state of Montana, United States\"? Yes, no, or maybe? Yes\n###\nHere is a list of all of KF Tirana's Cup seasons from 1939 till end of most recent season. This list shows where they finished the season, how many ties won or lost, how many goals they scored and conceded, how many wins draws and losses they had throughout the season, goal difference, winning difference and number of matches played. Are we justified in saying that \"This list shows where they finished the season, how many time they tied with other teams, how many times they won or how many times they lost from the late nineteen thirties on.\"? Yes, no, or maybe? Yes\n###\nPerformance Car, commonly abbreviated to PC, was an automobile magazine from the United Kingdom published by EMAP between October 1983 and July 1998. As suggested by the title, the magazine focussed on the high performance sector of the car market, from hot hatches through to supercars. Are we justified in saying that \" the United Kingdom had a big automobile subculture.\"? Yes, no, or maybe? Maybe\n###\n\"I Never Picked Cotton\" is a song made famous by country music singer Roy Clark. Written by Bobby George and Charles Williams, the song was released in 1970 as the title track to the album released that same year. The song peaked at No. 5 on the \"Billboard magazine\" Hot Country Singles chart that summer. Are we justified in saying that \"I Never Picked Cotton peaked at No. 2\"? Yes, no, or maybe? No\n###\nCape Verde is a volcanic archipelago situated above an oceanic rise that puts the base of the islands 2 km above the rest of the seafloor. Cape Verde has been identified as a hotspot and it has been argued that a mantle plume might be underneath it causing the volcanic activity and associated geothermal anomalies. Are we justified in saying that \"Scientists understand everything about Cape Verde.\"? Yes, no, or maybe?", "doc_id": 219, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3056, 15777, 1017, 32173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Samson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation. Are we justified in saying that \"Samson and Delilah has been performed in at least two languages.\"? Yes, no, or maybe? Yes\n###\nThe Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II. Are we justified in saying that \"The Spanish Army would have carried the Campo-Giro before the year 1917,\"? Yes, no, or maybe? Yes\n###\nLloyd Newton Morrisett, Jr. (born November 2, 1929) is an American experimental psychologist with a career in education, communications, and philanthropy. He is one of the founders of the Sesame Workshop, the organization famous for the creation of the children's television shows \"Sesame Street\" which was also co-created by him, \"The Electric Company\", and many others. Are we justified in saying that \"Lloyd Newton Morrisett, Jr was solely responsible for the creation of Sesame street.\"? Yes, no, or maybe? No\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"Mentha diemenica, which could be known as slender mint, grows to about 3 feet tall in 4 months and is ready shortly after that.\"? Yes, no, or maybe? Maybe\n###\nVitacost.com, Inc is an American e-commerce company based in Boca Raton, Florida, that sells vitamins, supplements and organic grocery products. The company was bought by Kroger, in 2014. Vitacost was inducted into Inc Magazine's \"Inc. 500 Lifetime Hall of Fame,\" in 2006 as one of the US's 500 fastest-growing privately held businesses for five consecutive years (2001\u20132005). Are we justified in saying that \"Boca Raton was one of the US's 500 fastest-growing privately held businesses for five consecutive years (2001\u20132005).\"? Yes, no, or maybe?", "doc_id": 389, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38257, 26506, 2727, 35343], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries. Are we justified in saying that \"In 2007 David Beckham won the Ballon d'Or\"? Yes, no, or maybe? No\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Smithereens was from UK.\"? Yes, no, or maybe? Yes\n###\nErnest R. Kroeger (August 10, 1862 \u2013 April 7, 1934) was an American composer. He is mainly known for the pedagogical works he composed for piano; he also taught music in St. Louis, Missouri. Today his papers are held at the Missouri Historical Society. Are we justified in saying that \"Ernest R. Kroeger was an American composer famous for the works he composed for the violin and cello. \"? Yes, no, or maybe? No\n###\nEllon Castle is a scheduled monument within the town of Ellon, Aberdeenshire. Only ruins survive of the 16th-century structure that may incorporate sections from the 15th century together with 18th-century renovations. The ruins form a focal point in a formal 6 acre garden planted in 1745; an older Category A listed sundial dating from c. 1700 forms the centrepiece to the garden. Are we justified in saying that \"Ellon Castle was build in 1745\"? Yes, no, or maybe? No\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg. Are we justified in saying that \"In 2015, Lil Dicky released his debut album with Snoop Dogg\"? Yes, no, or maybe?", "doc_id": 868, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16636, 14630, 33419, 22142], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Timber Mountain Log Ride is a log flume water ride at Knott's Berry Farm in Buena Park, California, United States. The ride is one of the oldest log flumes in the United States and is the most popular ride at Knott's Berry Farm. The ride is one of the few log flumes that is themed in the world. Are we justified in saying that \"Knot's Berry Farm sells jams and jellies near the log flume rides.\"? Yes, no, or maybe? 
Maybe\n###\nNativity in Black is the name of two Black Sabbath tribute albums that came out in the 1990s and 2000s. The albums were recorded with various heavy metal bands paying tribute to Black Sabbath for their influence on the heavy metal genre of rock music. Are we justified in saying that \"Black Sabbath was the inspiration for the band The Black Keys.\"? Yes, no, or maybe? Maybe\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University. Are we justified in saying that \"Liberty University officially endorsed The Old Time Gospel Hour Quartet.\"? Yes, no, or maybe? Maybe\n###\nThe Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s. Are we justified in saying that \"The album is over 15 years old\"? Yes, no, or maybe? Yes\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area. Are we justified in saying that \"The Pikes Peak Center is sometimes abbreviated as PPC.\"? Yes, no, or maybe?", "doc_id": 587, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4646, 1870, 39694, 8794], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The third season of \"Next Great Baker\" aired from November 26, 2012 to February 11, 2013. Like the previous season, this season was set at the Carlo's Bake Shop facility at Lackawanna Center in Jersey City, New Jersey. Unlike the previous two seasons, the finale for this season took place outside of the greater New York City area \u2013 in this case, in Las Vegas, Nevada at The Venetian Las Vegas. Are we justified in saying that \"It was controversial to move the finale out of the original state\"? Yes, no, or maybe? Maybe\n###\nHakea gibbosa, commonly known as hairy hakea or rock hakea, is a shrub of the family Proteaceae native to southeastern Australia. It has become an environmental weed in South Africa and New Zealand, where it had been introduced for use as a hedge plant. Are we justified in saying that \"Hakea gibbosa is the most concerning for New Zealand.\"? Yes, no, or maybe? Maybe\n###\nBugger or \"buggar\" is a slang word. In the United Kingdom, the term is a general-purpose expletive, used to imply dissatisfaction, or to refer to someone or something whose behaviour is in some way displeasing or perhaps surprising. 
In the US, particularly in the Midwest and South, it is a slang but not offensive noun meaning \"small critter.\" Are we justified in saying that \"Bugger is offensive to old.\"? Yes, no, or maybe? Maybe\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"Although designed to be comedic, the film was deemed unfunny by parts of the population.\"? Yes, no, or maybe? Maybe\n###\nBest of 4Minute is the first Japanese compilation album by the South Korean girl group 4Minute. It is composed of all the Japanese tracks released by the group since their debut in Japan. It was released on September 26, 2012 in three different editions: 2 limited CD+DVD (Type A with a live event and Type B with all Japanese music videos) and a Regular edition. Are we justified in saying that \"Best of 4Minute is a two language album.\"? Yes, no, or maybe?", "doc_id": 236, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23271, 8219, 7281, 29978], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia. Are we justified in saying that \"He became a woman\"? Yes, no, or maybe? Maybe\n###\nThe 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017. Are we justified in saying that \"The professional tennis players in the 2017 ATP Challenger were happy to play the tournament on hard courts. \"? Yes, no, or maybe? Maybe\n###\nMichiko (\u7f8e\u667a\u5b50 ) , born Michiko Sh\u014dda (\u6b63\u7530\u7f8e\u667a\u5b50 , Sh\u014dda Michiko ) on 20 October 1934, is the Empress of Japan as the wife of Akihito, the current Emperor of Japan reigning from 7 January 1989. She succeeded her mother-in-law, Empress Nagako (K\u014djun), consort of Emperor Hirohito (Sh\u014dwa). Are we justified in saying that \"Michiko was alive in 1934\"? Yes, no, or maybe? Yes\n###\nHomebrew is a free and open-source software package management system that simplifies the installation of software on Apple's macOS operating system. Originally written by Max Howell, the package manager has gained popularity in the Ruby on Rails community and earned praise for its extensibility. Homebrew has been recommended for its ease of use as well as its integration into the command line. Are we justified in saying that \"It was written over many years\"? Yes, no, or maybe? 
Maybe\n###\nThomas \"Tommy\" Lucchese (pronounced ] ; born Gaetano Lucchese, December 1, 1899 \u2013 July 13, 1967) was a Sicilian-born American gangster and founding member of the Mafia in the United States, an offshoot of the \"Cosa Nostra\" in Sicily. From 1951 until 1967, he was the boss of the Lucchese crime family, one of the Five Families that dominates organized crime in New York City. Are we justified in saying that \"Thomas \"Tommy\" Lucchese once had a boss.\"? Yes, no, or maybe?", "doc_id": 638, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1608, 14295, 44972, 22730], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Southern Methodist University (SMU) is a private research university in Dallas, University Park, and Highland Park, Texas. Founded in 1911 by the Methodist Episcopal Church, South, SMU operates satellite campuses in Plano, Texas, and Taos, New Mexico. SMU is owned by the South Central Jurisdiction of the United Methodist Church. Of the university's 11,643 students, 6,411 are undergraduates. Are we justified in saying that \"SMU has less than 20,000 students in it.\"? Yes, no, or maybe? Yes\n###\nPeter Billingsley (born April 16, 1971), also known as Peter Michaelsen and Peter Billingsley-Michaelsen, is an American actor, director, and producer, known for his role as Ralphie in the 1983 movie \"A Christmas Story\" and as \"Messy Marvin\" in the Hershey's Chocolate Syrup commercials during the 1970s. He began his career as an infant in television commercials. Are we justified in saying that \"Peter Billingsley was 12 years old when he started acting\"? Yes, no, or maybe? No\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Weltenbrand was formed in nineteen hundred ninety six.\"? Yes, no, or maybe? No\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison. Are we justified in saying that \"Victoria Morrison rewrites some blog work into English\"? Yes, no, or maybe? Yes\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland. Are we justified in saying that \"Phacelia pedicellata is a poisonous flower native to the southwestern United States and Baja California\"? 
Yes, no, or maybe?", "doc_id": 742, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11753, 16410, 28698, 12354], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Take Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\". Are we justified in saying that \"Kim Weston joined the Motown label after Marvin Gaye.\"? Yes, no, or maybe? Maybe\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis. Are we justified in saying that \"Escape from Suburbia: Beyond the American Dream made trillions.\"? Yes, no, or maybe? Maybe\n###\nThe UK Parliament constituency of County Galway was an historic Irish constituency, comprised the whole of County Galway, except for the Borough of Galway. It replaced the pre-Act of Union Parliament of Ireland constituency. Its representatives sat in the British House of Commons. Are we justified in saying that \"The Parliament of Galway was in Ireland county.\"? Yes, no, or maybe? No\n###\nThe Board of Directors Annual Report is an album by vocal group The Mills Brothers with pianist and bandleader Count Basie and His Orchestra featuring performances recorded in 1968 and released on the Dot label. The album follows Basie's 1967 collaboration with The Mills Brothers \"The Board of Directors\". Are we justified in saying that \"There were several women in Count Basie's Orchestra.\"? Yes, no, or maybe? Maybe\n###\nDiorama is the fourth studio album by Australian alternative rock band Silverchair. Released on 31 March 2002 by Atlantic/. It won the 2002 ARIA Music Award for Best Group and Best Rock Album. The album was co-produced by Daniel Johns and David Bottrill. While Bottrill had worked on albums for a variety of other bands, \"Diorama\" marked the first production credit for lead singer Johns. Are we justified in saying that \"Daniel Johns and David Bottrill have spoken to each other\"? Yes, no, or maybe?", "doc_id": 890, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44745, 18092, 38121, 8912], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The MAV-1 (Maneuvering Air Vehicle) is a low observable Unmanned Air Vehicle prototype developed between ST Aerospace and Defence Science and Technology Agency for its swarming unmanned air vehicle research programme. The prototype was unveiled in Asian Aerospace 2004 and the first test flight was reported in 2005. Are we justified in saying that \"The first test flight was reported less than 10 years ago.\"? Yes, no, or maybe? No\n###\nBarry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions. Are we justified in saying that \"Barry and Stuart are very famous.\"? Yes, no, or maybe? Maybe\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996. Are we justified in saying that \"Trainspotting was very well written.\"? Yes, no, or maybe? Maybe\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"Olive Cooper wrote Three Little Sisters.\"? Yes, no, or maybe? Yes\n###\nAlexander Stewart Jolly (1887\u20131957) was a Sydney-based architect, published poet and children\u2019s author in the early 20th century. His buildings are primarily in Sydney's northern suburbs and the north coast of New South Wales. His architectural work was strongly influenced by Frank Lloyd Wright\u2019s School in Chicago, as well as the Arts and Crafts movement of the time. Are we justified in saying that \"Alexander Stewart Jolly was born in Australia.\"? Yes, no, or maybe?", "doc_id": 517, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17636, 34698, 16524, 6640], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "There Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif. 
Are we justified in saying that \"There Is a Man in Our House was released before 1960\"? Yes, no, or maybe? No\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"The Commotions was released on February 21, 1990\"? Yes, no, or maybe? No\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister. Are we justified in saying that \"The script writer got the idea for the film while drinking Johnnie Walker.\"? Yes, no, or maybe? Maybe\n###\nResorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming. Are we justified in saying that \"Resorts Casino Tunica is the most expensive casino in Mississippi \"? Yes, no, or maybe? Maybe\n###\nHonest Ed's was a landmark discount store located in Toronto, Ontario, Canada. It was named for its proprietor, Ed Mirvish, who opened the store in 1948 and oversaw its operations for almost 60 years, until his death in 2007. The store continued in operation until it was permanently closed on December 31, 2016. Are we justified in saying that \"Ed Mirvish was known as \"Honest Ed\".\"? Yes, no, or maybe?", "doc_id": 662, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8219, 39571, 39058, 10408], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017. Are we justified in saying that \"The professional tennis players in the 2017 ATP Challenger were happy to play the tournament on hard courts. \"? Yes, no, or maybe? Maybe\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position. Are we justified in saying that \"A dog could possibly be employed by the CTA\"? Yes, no, or maybe? 
Maybe\n###\nCinnaholic is a vegan bakery franchise that started in 2010 and currently operates in eight states. The company's owners appeared on the television show Shark Tank in 2014, which ended with them ultimately turning down a $200,000 investment offer from Robert Herjavec. The company has adopted a franchise business model and has plans to open 100 locations by 2020. Are we justified in saying that \"Cinnaholic has plans to open 100 locations by 2020.\"? Yes, no, or maybe? Yes\n###\nThe Enlistment Act 1970 is a statute of the Parliament of Singapore that caters for the enlistment of persons in the Singapore Armed Forces. The law repeals the Singapore Army Act and People\u2019s Defence Force Act of 1965 and is designed specifically to subject enlisted personnel under military law during the period of enlistment and service. Are we justified in saying that \"Parliament passed a law for Singapore to have military personal subject to it's own laws.\"? Yes, no, or maybe? Yes\n###\nWind power in Montana is a growing industry. At a nameplate capacity of 210 megawatts (MW), the $500 million Glacier Wind Farm, which is located in Toole and Glacier counties, became Montana's largest in October 2009, surpassing the 135 MW Judith Gap Wind Farm in Wheatland County. Are we justified in saying that \"Wheatland County borders Toole and Glacier County's in Montana\"? Yes, no, or maybe?", "doc_id": 310, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22138, 22125, 34109, 27854], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records. Are we justified in saying that \"The album was released in 2016\"? Yes, no, or maybe? Yes\n###\nThe Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat. Are we justified in saying that \"The dog is very big\"? Yes, no, or maybe? Yes\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name. Are we justified in saying that \"Benoit means \"blessed\".\"? Yes, no, or maybe? Yes\n###\nAmerican Motors Incorporated (AMI) designed, manufactured, and sold a mini-van for commercial delivery use. This company was not related to the American Motors Corporation (AMC), a major automaker formed in 1954 by the merger of the Nash-Kelvinator Corporation and the Hudson Motor Car Company. 
Are we justified in saying that \"American Motors Incorporated was part of the Auto Industry.\"? Yes, no, or maybe? Yes\n###\n\"I'd Be Lost\" and \"Only One\" are two songs recorded by Australian singer-songwriter Sarah Blasko for her fifth studio album \"Eternal Return\". Both songs premiered on 13 September 2015 during Richard Kingsmill's new music segment on Triple J and were released as a double A-side on 18 September 2015. Are we justified in saying that \"Both songs premiered during Richard Kingsmill's new music segment on Triple J, which was more than 100 days ago.\"? Yes, no, or maybe?", "doc_id": 309, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23855, 44696, 877, 17220], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "FS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani. Are we justified in saying that \"FS Kozani plays against Arsenal on occasion.\"? Yes, no, or maybe? Maybe\n###\nListennn... the Album is the debut studio album by American hip hop disc jockey DJ Khaled. It was released on June 6, 2006. by Terror Squad Entertainment and Koch Records. The album features guest appearances from Young Jeezy, Bun B, Birdman, Juelz Santana, Slim Thug, Krayzie Bone, Chamillionaire, Trina, Twista, Freeway, Jadakiss, Beanie Sigel, Styles P and Lil Scrappy, among others. Are we justified in saying that \"The album was released the year after 2004.\"? Yes, no, or maybe? No\n###\nThe Doberman Gang is a 1972 film about a talented animal trainer who uses a pack of Dobermans to commit a bank robbery. The six dogs were all named after famous bank robbers. Their names were Dillinger (John Dillinger), Bonnie (Bonnie Parker), Clyde (Clyde Barrow), Pretty Boy Floyd, Baby Face Nelson, and Ma Barker. Are we justified in saying that \"The Doberman Gang was released more than 100 years ago.\"? Yes, no, or maybe? No\n###\nState Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long. Are we justified in saying that \"State Route 204 is a popular road to drive in\"? Yes, no, or maybe? Maybe\n###\nTo Drown A Rose is a single by Death in June. Additional music performers include: Christ 777, Douglas P., Gary Carey, Jan O', John Balance, Rose McDowall. The vinyl has the phrases \"Our time has been...\" and \"...and will be again\" scratched into it. The test pressing for this release was done on 12\" vinyl as opposed to the finalized 10\" format. Are we justified in saying that \"Death in June had to scrap their single because it was done on 12' vinyl.\"? 
Yes, no, or maybe?", "doc_id": 387, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41679, 26235, 16985, 24388], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat. Are we justified in saying that \"The Mast\u00edn Espa\u00f1ol needs to eat a lot of food.\"? Yes, no, or maybe? Maybe\n###\nLudovic (Ludo) Coeck (25 September 1955 \u2013 9 October 1985) was a Flemish-Belgian footballer who played as left winger or central midfielder. His clubs included Berchem Sport, Anderlecht, Internazionale and Ascoli Calcio. He was capped for the Belgian national team 46 times. Are we justified in saying that \"He was capped for the Belgian national team 48 times.\n\"? Yes, no, or maybe? No\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer. Are we justified in saying that \"Joseph Maurice Ravel ends with a L.\"? Yes, no, or maybe? Yes\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani. Are we justified in saying that \"There was another movie made by the same name which was a comedy drama years later\"? Yes, no, or maybe? Maybe\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation. Are we justified in saying that \"Paul Hausser was one of multiple Waffen-SS members who survived World War II \"? Yes, no, or maybe?", "doc_id": 825, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26657, 1112, 12895, 10721], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. 
A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson. Are we justified in saying that \"Evel Knievel's museum cost $3.50 to enter.\"? Yes, no, or maybe? Maybe\n###\nDiablo is a 2015 Canadian-American psychological western film co-written and directed by Lawrence Roeck and starring Scott Eastwood, Walton Goggins, Camilla Belle and Danny Glover. It was the first Western starring Eastwood, the son of Western icon Clint Eastwood. Are we justified in saying that \"Diablo was the first Western that starred Clint Eastwood's nephew Scott.\"? Yes, no, or maybe? No\n###\nImpatiens sakeriana is a species of flowering plant in the family Balsaminaceae. It is native to Cameroon and Equatorial Guinea. It grows in mountain forest understory habitat at altitudes up to 2000 meters. It occurs on Mount Cameroon. It can be locally common in parts of its range, but its habitat is threatened by agriculture. Are we justified in saying that \"Impatiens sakeriana grows in rain forests.\"? Yes, no, or maybe? No\n###\nZambia Sugar Plc, is a company based in Mazabuka, Southern Province, Zambia and is the largest sugar producer in Zambia. The company is listed on the Lusaka Stock Exchange (symbol: ZSUG) with 82% of the shares held by Illovo Sugar Limited of South Africa (a subsidiary of Associated British Foods) and the balance by institutional and private shareholders in Zambia. Are we justified in saying that \"Illovo Sugar Limited of South Africa holds less than 83% of the shares for Zambia Sugar Plc.\"? Yes, no, or maybe? Yes\n###\nGeneo Grissom (born June 4, 1992) is an American football defensive end for the New England Patriots. He played college football at Oklahoma. He was drafted by the New England Patriots in the third round with the 97th overall pick of the 2015 NFL Draft. Are we justified in saying that \"During the NFL draft in 2015, Geneo was drafted by the Patriots in the third round as the 96th pick and played football in college.\"? Yes, no, or maybe?", "doc_id": 963, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6062, 24999, 4516, 1152], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Seton Abele (born January 28, 1967) is an American businessman and Democratic Party politician. He is the current Milwaukee County Executive. Abele is the son of American businessman John Abele, the co-founder of Boston Scientific. Abele serves as a trustee of the Argosy Foundation, a charitable trust established with an endowment from his father. Are we justified in saying that \"John Abele is the current Milwaukee County Executive.\"? Yes, no, or maybe? No\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98. 
Are we justified in saying that \"Death Race was released in different countries.\"? Yes, no, or maybe? Maybe\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Sushil Kumar Shinde was an amazing vice president of India. \"? Yes, no, or maybe? Maybe\n###\nDuke is a fictional character from the \"\" toyline, comic books, and cartoon series. He is the G.I. Joe Team's First Sergeant, and debuted in 1983. The character is also featured in both the \"\" animated series and comic books. Channing Tatum portrays Duke in the 2009 live-action film, \"\", and the 2013 sequel \"\". Are we justified in saying that \"Duke was played by Tatum.\"? Yes, no, or maybe? Yes\n###\nJack Frost is the name of two unrelated fictional characters appearing in American comic books published by Marvel Comics. The first Jack Frost was published by Marvel's 1940s forerunner Timely Comics during the period fans and historians call the Golden Age of comic books. Are we justified in saying that \"Jack Frost, the first instance, was published by Golden Age Comics during a timely period in the 1940s\"? Yes, no, or maybe?", "doc_id": 678, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39284, 23315, 5324, 11875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maxillaria rufescens, the Light Fox-red Maxillaria, is a species of orchid native to Trinidad and the Amazon Basin in Colombia, Ecuador, Peru, Bolivia, Venezuela, The Guianas and Brazil. The plant grows at eleveations of 200 to 2000 meters, and grows up to 1 inches (3 to 4 centimeters). Are we justified in saying that \"Maxillaria rufescens grow more heavily in Trinidad than in other areas.\"? Yes, no, or maybe? Maybe\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents. Are we justified in saying that \"It is located in western pennsylvania\"? Yes, no, or maybe? Maybe\n###\nAlexander Vincent LoScialpo (born April 29, 1981) is an American actor. He is known for his role as Andy Barclay in the \"Child's Play\" franchise. He has appeared in \"Child's Play\" (1988), \"Child's Play 2\" (1990), \"Curse of Chucky\" (2013), and \"Cult of Chucky\" (2017). Are we justified in saying that \"Andy Barclay is a character in the \"Child's Play\" movie franchise.\"? Yes, no, or maybe? Yes\n###\nThe Mnet Asian Music Award for Best Collaboration is an award presented annually by CJ E&M Pictures (Mnet). 
It was first awarded at the 12th Mnet Asian Music Awards ceremony held in 2010; singers Ga-in & Jo Kwon won the award for their song \"We Fell in Love\", and it is given in honor for the artists with the most artistic achievement in collaboration performances in the music industry. Are we justified in saying that \"The Mnet Asian Music Award is a disliked show in South korea\"? Yes, no, or maybe? Maybe\n###\nThe New York Blade was a free weekly newspaper focusing on lesbian, gay, bisexual and transgender (LGBT) issues in New York City, New York. The \"Blade\" was a member of the National Gay Newspaper Guild, and contained news, entertainment, classified ads, and free personals for men and women. Are we justified in saying that \"The New York Blade is now paid\"? Yes, no, or maybe?", "doc_id": 972, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24673, 29955, 43606, 10893], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kalavu (Kannada: \u0c95\u0cb3\u0cb5\u0cc1) is a 2013 Kannada movie based on Dr KY Narayanaswamy's novel of the same title. The movie is the directorial debut of Ravi M who has worked with the production of the Hollywood film Inferno . Two French films, \"Blue Mountains\" and \"Child in Pondicherry\", launched his career in art direction. The film stars Umashree, Kari Subbu, Hulagappa Kattimani and others. Are we justified in saying that \"Umashree, Kari Subbu, and Hulagappa Kattimani all played the same character in Kalavu.\"? Yes, no, or maybe? Maybe\n###\nRobert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983. Are we justified in saying that \"Robert L. \"Rusty\" White's father was born July 1905 in Newton, Mississippi.\"? Yes, no, or maybe? Maybe\n###\nDie Antwoord (] , Afrikaans for \"The Answer\") is a South African hip hop group formed in Cape Town in 2008. It comprises rappers Ninja and Yolandi Visser and producer God (formerly DJ Hi-Tek). Their image revolves around the South African counterculture movement known as zef and has incorporated work by other artists associated with the movement, such as photographer Roger Ballen. Are we justified in saying that \"Die Antwoord is comprised of two men and one woman.\"? Yes, no, or maybe? Yes\n###\nRana amurensis (Khabarovsk frog, Siberian wood frog, Heilongjiang brown frog or Amur brown frog) is a species of true frog found in northern Asia. It ranges across western Siberia, as well as northeastern China, northeastern Mongolia, and on the northern Korean Peninsula and on Sakhalin. \"Rana coreana\" was previously included in this species as a subspecies. Are we justified in saying that \"Rana amurensis can be found in northern Korean Peninsula. \"? Yes, no, or maybe? 
Yes\n###\nPlainfield South High School, or PSHS, is a four-year public high school located in Joliet, a southwest suburb of Chicago, Illinois, in the United States. It is part of the Plainfield Community Consolidated School District 202, which also includes three other high schools: Plainfield Central High School, Plainfield North High School and Plainfield East High School. Are we justified in saying that \"PSHS is in south west of Chicago\"? Yes, no, or maybe?", "doc_id": 494, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1172, 29394, 2174, 31574], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "There Was a Crooked Man... is a 1970 western starring Kirk Douglas and Henry Fonda and directed by Joseph L. Mankiewicz. This was the only western made by Mankiewicz, director of such notable films as \"All About Eve\", \"Guys and Dolls\" and \"Cleopatra\". It was written by David Newman and Robert Benton, their first script after \"Bonnie and Clyde\". Are we justified in saying that \"There Was a Crooked Man was released after the year 1970 ended.\"? Yes, no, or maybe? No\n###\nCattle Decapitation is an American extreme metal band from San Diego, California, formed in 1996. The band's current line-up includes vocalist Travis Ryan, guitarist Josh Elmore, drummer Dave McGraw, and bassist Derek Engemann. Cattle Decapitation have released seven albums, the most recent being \"The Anthropocene Extinction\" in 2015. Are we justified in saying that \"The band has released seven albums.\"? Yes, no, or maybe? Yes\n###\nThe Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States. Are we justified in saying that \"The Forum Shops at Caesars has the largest gross income.\"? Yes, no, or maybe? Yes\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Political satire is a popular film genre in Israel.\"? Yes, no, or maybe? Maybe\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\". Are we justified in saying that \"\"Crawling\" was written by Linkin Park for the DLC pack in \"Rock Band 3\".\"? 
Yes, no, or maybe?", "doc_id": 12, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31744, 32661, 9504, 43225], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time. Are we justified in saying that \"Jefferson County was named because Thomas Jefferson was going to be president.\"? Yes, no, or maybe? Maybe\n###\nAndrea von Habsburg (\"Andrea Maria von Habsburg-Lothringen\") Archduchess of Austria, Hereditary Countess of Neipperg, (born 30 May 1953, in W\u00fcrzburg, Bavaria), is the first child and oldest daughter of Otto von Habsburg and his wife Princess Regina of Saxe-Meiningen. Are we justified in saying that \"Andrea was born in 1953.\"? Yes, no, or maybe? Yes\n###\nThe European Association of Science Editors (EASE ) is a non-profit membership organisation for people interested in science communication and editing. Founded in 1982, in France, EASE now has an international membership from diverse backgrounds, professional experiences, and job titles. Are we justified in saying that \"EASE doesn't make a profit.\"? Yes, no, or maybe? Yes\n###\nSimon Corbell (born 21 November 1970) is a former Australian politician and Deputy Chief Minister of the Australian Capital Territory. He was also Attorney-General, Minister for Health, Minister for the Environment and Minister for the Capital Metro. Are we justified in saying that \"Simon Corbell was born more than 5000 days ago.\"? Yes, no, or maybe? Yes\n###\nDame Nicola Mary Brewer DCMG is a British diplomat and university administrator. In May 2014 she was appointed Vice-Provost (International) at University College London. She is a non-executive director of Aggreko. Brewer was British High Commissioner to South Africa from 2009 to 2013. Are we justified in saying that \"Brewer's appointment as British High Commissioner to South Africa ended in May 2013.\"? Yes, no, or maybe?", "doc_id": 744, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32697, 3709, 2670, 14072], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. 
Development of the original model led to a wide variety of guns. Are we justified in saying that \"The german 88mm gun was originally designed as an anti-aircraft only weapon.\"? Yes, no, or maybe? Maybe\n###\nAdam Best is a fictional character from the BBC soap opera \"EastEnders\", played by David Proud, the first adult actor with a visible disability to appear regularly in the soap. Both Proud and his character live with spina bifida. The character made his first appearance in the episode broadcast on 10 September 2009 and his last in the one broadcast on 19 July 2010. Are we justified in saying that \"Adam Best had a visible disability\"? Yes, no, or maybe? Yes\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe. Are we justified in saying that \"David Tench provided the voice of Drew Forsythe\"? Yes, no, or maybe? No\n###\nBalaji K. Kumar is a Film Director who entered Tamil cinema as a director with the 2013 thriller film \"Vidiyum Munn\" which released on 29 November 2013 and received positive reviews from critics. Then started his career as story board artist for advertising firms like Ogilvy & Mather, JWT, Saatchi & Saatchi. Are we justified in saying that \"Balaji K. Kumar sold a billion tickets.\"? Yes, no, or maybe? Maybe\n###\nAlong With The Gods \u2013 Part 1 () is an upcoming South Korean fantasy drama film based on a webcomic of the same name. The film will be released in two parts, and stars Ha Jung-woo, Cha Tae-hyun, Ju Ji-hoon, Lee Jung-jae, Do Kyung-soo and Kim Hyang-gi. The first part of the film will be released on December 20, 2017. Are we justified in saying that \"Along with the Gods is a multi film franchise\"? Yes, no, or maybe?", "doc_id": 828, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16883, 20299, 1322, 15975], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\". Are we justified in saying that \"Aodh Mac Cathmhaoil has Irish ancestry \"? Yes, no, or maybe? Yes\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style. Are we justified in saying that \"Real Fuerza A\u00e9rea won all of their wrestling matches.\"? Yes, no, or maybe? 
Maybe\n###\nLibya TV (also known as Libya Al Ahrar TV) is a Libyan TV channel broadcast by satellite from its headquarters in Doha. The channel was created in 2011 during the Libyan Civil War. Its presents news, opinions, analysis, photo and video reports about Libya in specific and the region in a wider scope. It focuses on Libya\u2019s revolution and future toward building a democratic state. Are we justified in saying that \"Libya TV was created in the 20th century.\"? Yes, no, or maybe? No\n###\nIreland ( ; Irish: \"\u00c9ire\" ] ; Ulster-Scots: \"Airlann\" ] ) is an island in the North Atlantic. It is separated from Great Britain to its east by the North Channel, the Irish Sea, and St George's Channel. Ireland is the second-largest island of the British Isles, the third-largest in Europe, and the twentieth-largest on Earth. Are we justified in saying that \"There are two islands larger than Ireland in Europe\"? Yes, no, or maybe? Yes\n###\nWang Tieguan ()is a Chinese noted geologist,Academician of the Chinese Academy of Sciences,Professor of China University of Petroleum,PhD Tutor born in December,1937,born in Shanghai City,People's Republic of China in December,1937, graduated from Beijing Petroleum Geology School(predecessor of Yangtze University)in 1956 and from Beijing Petroleum Institute in 1965. Are we justified in saying that \"Wang Tieguan never lived in China after 1966.\"? Yes, no, or maybe?", "doc_id": 491, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26692, 21054, 28616, 29103], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson. Are we justified in saying that \"Lathan McKay lives in Minnesota.\"? Yes, no, or maybe? Maybe\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old. Are we justified in saying that \"John Cameron Urschel (born June 24, 1991) is a Canadian historian and retired professional American football guard and center.\"? Yes, no, or maybe? No\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro. Are we justified in saying that \"Salavatore Rosa taught many students how to paint.\"? Yes, no, or maybe? 
Maybe\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Nathan MacKinnon has been in a draft before.\"? Yes, no, or maybe? Yes\n###\nDarrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include: Are we justified in saying that \"Abbott had 39 years of life before passing on\"? Yes, no, or maybe?", "doc_id": 551, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1957, 23939, 5074, 6572], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Game Plan is a 2007 American family comedy film directed by Andy Fickman and written by Nichole Millard, Kathryn Price and Audrey Wells and starring Dwayne \"The Rock\" Johnson (marking the last film in which Johnson uses his ring name \"The Rock\" in billing). It follows an NFL quarterback who finds out he has an 8-year-old daughter from a previous relationship. Are we justified in saying that \"Kids can watch this movie. \"? Yes, no, or maybe? Yes\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005. Are we justified in saying that \"Samuel Eto'o Fils was born less than 1981 days ago.\"? Yes, no, or maybe? No\n###\nMission: Impossible III \u2013 Music from the Original Motion Picture Soundtrack is a soundtrack album for the 2006 film \"\", composed by Michael Giacchino. Unlike the previous two films in the series, there was no album released containing the film's contemporary music. Are we justified in saying that \"Mission: Impossible III was released in the 2000s\"? Yes, no, or maybe? Yes\n###\nAmethyst: Princess of Gemworld is a comic book series published by DC Comics in the 1980s. The series tells the story of a teenage girl named Amy Winston who discovers that she is the orphaned princess of the magical Gemworld. Amy learns that an evil ruler called Dark Opal is out to destroy her and travels to Gemworld to overthrow him. Are we justified in saying that \"Amethyst: Princess of Gemworld was inspired by a real teen girl named Amy who had delusions and was fighting cancer.\"? Yes, no, or maybe? Maybe\n###\nCommunal riots occurred in Bihar from 24 October to 11 November 1946, in which Hindu mobs targeted Muslim families. The riots were triggered by the Great Calcutta Killings, as well as the Noakhali riots earlier that year. Mahatma Gandhi declared that he would fast unto death if the riots did not stop. 
The riots were part of a sequence of communal violence that culminated in the partition of India. Are we justified in saying that \"Communal riots occurred in Bihar were due to food price escallation\"? Yes, no, or maybe?", "doc_id": 308, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3294, 34278, 29604, 8483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Diablo is a 2015 Canadian-American psychological western film co-written and directed by Lawrence Roeck and starring Scott Eastwood, Walton Goggins, Camilla Belle and Danny Glover. It was the first Western starring Eastwood, the son of Western icon Clint Eastwood. Are we justified in saying that \"diablo is a western film very similar to a clint eastwood movie\"? Yes, no, or maybe? Maybe\n###\nLeonard Pilkington (1527\u20131599) was an English academic and clergyman. A Marian exile, he became Regius Professor of Divinity at Cambridge and Master of St John's College, Cambridge at the start of the reign of Elizabeth I. In his subsequent church career, he followed the way opened when his brother James Pilkington became Bishop of Durham. Are we justified in saying that \"Leonard Pilkington was an American clergyman\"? Yes, no, or maybe? No\n###\nClarendon is an urbanized, upper-class neighborhood in Arlington County, Virginia, located between the Rosslyn area and the Ballston area. It was named after Edward Hyde, 1st Earl of Clarendon, a leading statesman and historian of the English Civil War. The main thoroughfares are Wilson Boulevard (one-way westbound) and Clarendon Boulevard (one-way eastbound). Are we justified in saying that \"Clarendon has houses.\"? Yes, no, or maybe? Yes\n###\nDr. Edward Vivian Scobie (1918 \u2013 14 November 1996) was a Dominican-born journalist, magazine publisher and historian. He is best known for his research into the black history of Western Europe and his 1972 seminal book \"Black Britannia: A History of Blacks in Britain\". Are we justified in saying that \"Dr. Scobie published a magazine in 1972.\"? Yes, no, or maybe? Maybe\n###\nFinsbury Park TMD was a railway Traction Maintenance Depot situated in London, England. It was the first purpose built main line diesel locomotive depot opened in England and it was fully commissioned in April 1960. Finsbury Park was a steam shed under British Railways with the depot code 34G; the depot code of the diesel depot under BR was FP. The nearest railway station is Finsbury Park. Are we justified in saying that \"Other first purpose main line diesel locomotive depots were built after Finsbury Park.\"? Yes, no, or maybe?", "doc_id": 947, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43546, 45290, 10102, 28092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "St Kilda is an inner suburb (neighbourhood) of the metropolitan area of Melbourne, Victoria, Australia, 6 km south-east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, St Kilda had a population of 17,795. Are we justified in saying that \"The city of Port Phillip is part of the St Kilda neighbourhood.\"? Yes, no, or maybe? Yes\n###\nThe Puerto Rico Baseball Academy and High School (PRBAHS) is a non-profit organization combining academics and sports programs into one curriculum. Its goal is to prepare its students for higher education, competitive college scholarship opportunities, and the Major League Baseball Draft. The PRBAHS is the only high school in Puerto Rico or the United States with this type of learning environment. Are we justified in saying that \"The Puerto Rico Baseball Academy and High School starts with a B.\"? Yes, no, or maybe? No\n###\nStraumfj\u00f6r\u00f0 Icelandic), or Straumfj\u01ebr\u00f0 (Old Norse) sometimes anglicised to Straumsfjordr, Straumfjordr, Straumsfjord or Straumfjord, is according to the Sagas of Icelanders a fjord in Vinland where Thorfinn Karlsefni set up a temporary settlement. It is described in the \"Saga of Erik the Red\", but not in the \"Greenland saga\". Its name translates to \"Current-fjord\", \"Stream-fjord\" or \"Tide-fjord\". Are we justified in saying that \"Straumfj\u00f6r\u00f0 Icelandic is the greenland saga from erik the red, created by Thorfinn Karlsefni\"? Yes, no, or maybe? No\n###\nThe Asian Institute is a research centre at the Munk School of Global Affairs at the University of Toronto, and is located in the historical Devonshire House, a former residential hall of the university's Trinity College. Ritu Birla is the Richard Charles Lee Director of the Asian Institute. Are we justified in saying that \"The Asian Institute at the University of Toronto is located in a newly built Devonshire House, but has never been affiliated with the university.\"? Yes, no, or maybe? No\n###\n\"Cover on My Heart\" is a pop ballad performed by Guy Sebastian and is the third single from his third album \"Closer to the Sun\". Sebastian announced that this song was the album's third single in April 2007. The single was released on 28 July 2007 in Australia, set by his record label Sony BMG Australia. Sebastian performed the song on various programmes such as \"Sunrise\" and \"Rove Live\". Are we justified in saying that \"Guy Sebastian set the release date for his single Cover on My Heart.\"? Yes, no, or maybe?", "doc_id": 842, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7064, 32436, 1677, 8125], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Altamonte Springs is a suburban city in Seminole County, Florida, United States, which had a population of 41,496 at the 2010 census. The city is in the northern suburbs of the Orlando\u2013Kissimmee\u2013Sanford Metropolitan Statistical Area, which the United States Census Bureau estimated had a population of 2,054,574 in 2008. Are we justified in saying that \"Altamonte Springs, a suburban city in Florida, will show a population increase in the 2020 census. \"? Yes, no, or maybe? Maybe\n###\nPrincess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer. Are we justified in saying that \"Maria's father often took her hunting\"? Yes, no, or maybe? Maybe\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee Are we justified in saying that \"Mystery was published in 1990.\"? Yes, no, or maybe? Yes\n###\nThe Washington Nationals are a professional baseball team that has been based in Washington, D.C. since . The Nationals are a member of both the Major League Baseball's (MLB) National League Eastern Division and the National League (NL) itself. Since the 2008 season, the Nationals have played in Nationals Park; from 2005 through , the team played in Robert F. Kennedy Memorial Stadium. Are we justified in saying that \"The Washington Nationals are amateurs.\"? Yes, no, or maybe? No\n###\nSemonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781. Are we justified in saying that \"The population in 2006 was 7,780 plus one.\"? Yes, no, or maybe?", "doc_id": 141, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20868, 18068, 14997, 32651], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. 
The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel. Are we justified in saying that \"bete noire was the last album ferry made\"? Yes, no, or maybe? Maybe\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state. Are we justified in saying that \"Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, US. The plant is on the Endangered species list and is the rarest known plant in Washington state.\"? Yes, no, or maybe? Yes\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress. Are we justified in saying that \"The President of the Tibetan Youth Congress is Tsewang Rigzin.\"? Yes, no, or maybe? Yes\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation. Are we justified in saying that \"The 89th Medium Tank Battalion was a stand alone unit\"? Yes, no, or maybe? No\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant. Are we justified in saying that \"Neelix Is a character played by \"\".\"? Yes, no, or maybe?", "doc_id": 142, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19749, 19661, 30919, 30134], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Franco Mari (Born 23 January 1947) is an Italian actor and comedian. Better known as Rupert Sciamenna, his best known character, he is famous for his participation in television programs such as Mai dire... on Italia 1 in many sketches with Marcello Macchia. Are we justified in saying that \"Franco Mari is a thin man\"? Yes, no, or maybe? Maybe\n###\nLive in Concert is the second live music video title by singer and actress Cher. Released by HBO in 1999, it contained footage from Cher's Do You Believe? 
Tour specials filmed at the MGM Grand Garden Arena in Paradise, Nevada in 1999. It featured tracks from the Gypsys, Tramps & Thieves album to the Believe album, alongside various covers. She has 7 costume changes by stylist Bob Mackie. Are we justified in saying that \"The DVD came out after Y2K.\"? Yes, no, or maybe? No\n###\nKJEF-CA, channel 13, was a class A television station in Jennings, Louisiana. Owned by Townsquare Media, the station was an independent station. It was the only television station owned by Townsquare, a company that otherwise specializes exclusively in radio. Are we justified in saying that \"KJEF-CA had 500 employees\"? Yes, no, or maybe? Maybe\n###\nKathleen Delaney is an American actress, voice actress, singer, and dancer who works on Broadway and on the properties of 4Kids Entertainment. She is best known as the voice of Hina in the 4Kids dub of One Piece, Mai Valentine in uncut versions of \"Yu-Gi-Oh!\" and Rouge in \"Sonic X\" and the succeeding games until 2010, when she was replaced by Karen Strassman. Are we justified in saying that \"Delaney voiced Rouge in Sonic video games until the middle of Obama's second term as president of the U.S.\"? Yes, no, or maybe? No\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks. Are we justified in saying that \"Cocaine is a globally used drug\"? Yes, no, or maybe?", "doc_id": 392, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40327, 34873, 2689, 34947], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mick Napier (born December 12, 1962) is an American director, actor, teacher and author living in Chicago. He is the founder and artistic director of the Annoyance Theatre and an award-winning director at The Second City. He has directed Stephen Colbert, Tina Fey, Rachel Dratch, Horatio Sanz, Nia Vardalos, Andy Richter, Jeff Garlin, and David Sedaris, amongst others. Are we justified in saying that \"Mick Napier directed Stephen Colbert, Tina Fey, and Donald Trump\"? Yes, no, or maybe? Maybe\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. The country is gently undulating with large areas of spinifex. Are we justified in saying that \"Muccan Station is a rap musician\"? Yes, no, or maybe? No\n###\nLove's Labour's Won is a lost play attributed by contemporaries to William Shakespeare, written before 1598 and published by 1603, though no copies are known to have survived. Scholars dispute whether it is a true lost work, possibly a sequel to \"Love's Labour's Lost\", or an alternative title to a known Shakespeare play. 
Are we justified in saying that \"\"Love's Labour's Won\" was misplaced by Shakespeare.\"? Yes, no, or maybe? Maybe\n###\nCentral Mountain Air Ltd. is a Canadian regional airline based in Smithers, British Columbia. It operates scheduled and charter services and transborder services. Its main base is Smithers Airport, with other bases at Calgary International Airport, Vancouver International Airport and Prince George Airport. Are we justified in saying that \"British Columbia contains 3 regional airlines.\"? Yes, no, or maybe? Maybe\n###\nThe 1968 Senior League World Series took place from August 13\u201318 in Gary, Indiana, United States. New Hyde Park, New York defeated West Tampa, Florida in the championship game. It was the third straight title for New York. This was the first SLWS held in Gary. Are we justified in saying that \"1968 was the first time that New York and Tampa competed.\"? Yes, no, or maybe?", "doc_id": 905, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27897, 33512, 28977, 24105], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Guns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin). Are we justified in saying that \"Guns of Diablo came out in the 21st century\"? Yes, no, or maybe? No\n###\n\"Eternally\" is a song with music by Charles Chaplin, and words by the English lyricists Geoff Parsons and John Turner. The music was initially composed for Charles Chaplin's film \"Limelight\" (1952) titled \"Terry's Theme\"; the film won an Oscar for \"Best Original Dramatic Score\" at the Are we justified in saying that \"Eternally will be featured in other films.\"? Yes, no, or maybe? Maybe\n###\nAldo Fabrizi (] ; 1 November 1905 \u2013 2 April 1990 in Rome, Italy) was an Italian actor, director, screenwriter and comedian, probably best known for the role of the heroic priest in Roberto Rossellini's \"Rome, Open City\" and as partner of Tot\u00f2 in a number of successful comedies. Are we justified in saying that \"Aldo Fabrizi was a director before he was a comedian\"? Yes, no, or maybe? Maybe\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"\"I'm So Sorry\" was recorded in 2017.\"? Yes, no, or maybe? Maybe\n###\nThe Gospel According to the Other Mary is an opera/oratorio by contemporary American composer John Adams. 
The world premiere took place on May 31, 2012, at the Walt Disney Concert Hall in Los Angeles with Gustavo Dudamel conducting the Los Angeles Philharmonic who also premiered the staged version on March 7, 2013, at the same venue. Are we justified in saying that \"John Adams did not compose more operas.\"? Yes, no, or maybe?", "doc_id": 78, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27106, 34811, 43830, 30849], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The X-Files Game is an interactive movie point-and-click adventure video game developed by HyperBole Studios and first published by Fox Interactive. The game was released for Microsoft Windows, Mac OS and PlayStation in 1998, and is based on the television series \"The X-Files\". A second, but unrelated game, \"\", was released for PlayStation 2 in 2004. Are we justified in saying that \"The second game is technically superior to the first game.\"? Yes, no, or maybe? Maybe\n###\nThe Sisters of Mercy are an English gothic rock band, formed in 1980 in Leeds, United Kingdom (UK). After achieving early underground fame there, the band had their commercial breakthrough in mid-1980s and sustained it until the early 1990s, when they stopped releasing new recorded output in protest against their record company Time Warner. Currently, the band is a touring outfit only. Are we justified in saying that \"Sisters of Mercy formed before 1990.\"? Yes, no, or maybe? Yes\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"JC Aragone plays tennis for the USA.\"? Yes, no, or maybe? Yes\n###\nExergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid. Are we justified in saying that \"Kokam America, Inc. will open a store in Texas the next fiscal year.\"? Yes, no, or maybe? Maybe\n###\nJay Kahn is a Democratic member of the New Hampshire Senate representing the 10th district. The 10 district is located in the southwestern corner of the state and includes Alstead, Chesterfield, Gilsum, Harrisville, Hinsdale, Keene, Marlborough, Roxbury, Sullivan, Surry, Swanzey, Walpole, Westmoreland and Winchester, New Hampshire. Are we justified in saying that \"Jay Kahn has lived in Florida.\"? 
Yes, no, or maybe?", "doc_id": 119, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [988, 26897, 42335, 38263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Homicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run. Are we justified in saying that \"Jean de Segonzac worked with Eric Overmyer on Homicide: The Movie.\"? Yes, no, or maybe? Yes\n###\nMiss Peregrine's Home for Peculiar Children is a contemporary fantasy debut novel by American author Ransom Riggs. The story is told through a combination of narrative and vernacular photographs from the personal archives of collectors listed by the author. Are we justified in saying that \"Riggs has written a fantasy novel. \"? Yes, no, or maybe? Yes\n###\nOn July 16, 2009, Harvard University professor Henry Louis Gates Jr. was arrested at his Cambridge, Massachusetts home by local police officer Sgt. James Crowley, who was responding to a 9-1-1 caller's report of men breaking and entering the residence. The arrest initiated a series of events that unfolded under the spotlight of the international news media. Are we justified in saying that \"This happened in 2009\"? Yes, no, or maybe? Yes\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries. Are we justified in saying that \"The Ballon d'Or is judged by the President of the United States\"? Yes, no, or maybe? No\n###\nJohn Michael Stipe (born January 4, 1960) is an American singer, songwriter, musician, film producer, music video director, visual artist, and philanthropist. He is best known as the lead singer of the alternative rock band R.E.M. from their formation in 1980 until their dissolution in 2011. Are we justified in saying that \"John Michael Stipe is known for giving away money.\"? Yes, no, or maybe?", "doc_id": 496, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44684, 1868, 19400, 17113], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alias is a fictional character in the \"Forgotten Realms\" campaign setting for the \"Dungeons & Dragons\" fantasy role-playing game. Alias is the main character of \"Azure Bonds\". She also appeared in the computer game, Curse of the Azure Bonds which was based on the book. Alias later appears in the sequel \"Song of the Saurials\", and the standalone book \"Masquerades\". Are we justified in saying that \"Alias is an actress that has featured in \"Forgotten Realms\" campaign setting for the \"Dungeons & Dragons\"? Yes, no, or maybe? No\n###\nThe Little League World Series took place between August 22 and August 27 in Williamsport, Pennsylvania. Westbury American Little League of Houston, Texas defeated American Little League of West New York, New Jersey in the championship game of the 20th Little League World Series. Are we justified in saying that \"Westbury American Little League of Houston, Texas lost the championship game.\"? Yes, no, or maybe? No\n###\nAlex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\". Are we justified in saying that \"Alex Rider is a title character and the protagonist of the unpopular \"Alex Rider\" novel series by British author Anthony Horowitz.\"? Yes, no, or maybe? Yes\n###\nBruno Mingeon (born September 7, 1967 in Bourg-Saint-Maurice, Savoie) is a French bobsledder who competed from 1988 to 2006. Competing in five Winter Olympics, he won a bronze medal in the four-man event (tied with Great Britain) at Nagano in 1998. He was born in Bourg-Saint-Maurice. Are we justified in saying that \"Bruno Mingeon was a popular bobsledder\"? Yes, no, or maybe? Maybe\n###\nThe Sierra Leone Civil War (1991\u20132002) began on 23 March 1991 when the Revolutionary United Front (RUF), with support from the special forces of Charles Taylor\u2019s National Patriotic Front of Liberia (NPFL), intervened in Sierra Leone in an attempt to overthrow the Joseph Momoh government. The resulting civil war lasted 11 years, enveloped the country, and left over 50,000 dead. Are we justified in saying that \"The Sierra Leone Civil War ended in March 2002.\"? Yes, no, or maybe?", "doc_id": 75, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32912, 30207, 15877, 29875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Vorontsov Lighthouse (Ukrainian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u044c\u043a\u0438\u0439 \u043c\u0430\u044f\u043a , Russian: \u0412\u043e\u0440\u043e\u043d\u0446\u043e\u0432\u0441\u043a\u0438\u0439 \u043c\u0430\u044f\u043a ) is a famous red-and-white, 27.2 metre landmark in the Black Sea port of Odessa, Ukraine. It is named after Prince Mikhail Semyonovich Vorontsov, one of the governors-general of the Odessa region. Are we justified in saying that \"Prince Mikhail Semyonovich Vorontsov was around 2 meters tall.\"? Yes, no, or maybe? Maybe\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life. Are we justified in saying that \"Health For All has been used by zack.\"? Yes, no, or maybe? Maybe\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"Zuikaku was the largest aircraft carrier in the Japanese fleet.\"? Yes, no, or maybe? Maybe\n###\nWilliam V. Bidwill Sr. (born July 31, 1931) is the principal owner and chairman of the board of the Arizona Cardinals of the National Football League (NFL). He was co-owner from 1962 for ten seasons with his brother Charles Jr. and has been sole owner since 1972. Are we justified in saying that \"The Arizona Cardinals existed in 1962.\"? Yes, no, or maybe? Yes\n###\nThe 2015 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 30th edition of the ASB Classic, and part of the WTA International tournaments category of the 2015 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 5 to 10 January 2015. Are we justified in saying that \"There were 30 edition of the ASB Classic played before the 2015 ASB Classic \"? Yes, no, or maybe?", "doc_id": 967, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27550, 1165, 27262, 34774], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The J.J. Deal and Son Carriage Factory was the largest factory built in Jonesville, Michigan. It is the only 19th century factory remaining in the City. It is located at 117 West Street. On August 1, 2012, the building was added to the National Register of Historic Places. 
Are we justified in saying that \"Michigan only has 3 remaining 19th century factories in total.\"? Yes, no, or maybe? Maybe\n###\nPacific Novelty was a developer of coin-operated arcade video games. \"Deep Death\" was their first title, which was later licensed by Game Plan and re-released as \"Shark attack\" (1981). \"Thief\", a \"Pac-Man\" styled maze chase, was their greatest success. Are we justified in saying that \"deep death is an arcade video game\"? Yes, no, or maybe? Yes\n###\nMarion Anna Fischer (born July 18, 1986 in East Berlin) is a German actress and singer. Since 2003, she appeared in over 30 film and television roles in appearance. She is most recognised to international audiences as the innocent vampire \"Nora\" in Dennis Gansel's drama film \"We Are The Night\" Are we justified in saying that \"\"Nora\" used to drink rat's blood instead.\"? Yes, no, or maybe? Maybe\n###\nWood River is a provincial electoral district for the Legislative Assembly of Saskatchewan, Canada. Located in southern Saskatchewan, the district was created by the \"Representation Act, 1994\" (Saskatchewan) out of the former constituency of Assiniboia-Gravelbourg and half of the Shaunavon district. Are we justified in saying that \"Wood River is in southern Saskatchewan.\"? Yes, no, or maybe? Yes\n###\nKaalamellam Kaathiruppen (Tamil: \u0b95\u0bbe\u0bb2\u0bae\u0bc6\u0bb2\u0bcd\u0bb2\u0bbe\u0bae\u0bcd \u0b95\u0bbe\u0ba4\u0bcd\u0ba4\u0bbf\u0bb0\u0bc1\u0baa\u0bcd\u0baa\u0bc7\u0ba9\u0bcd ; English: I Will Wait Forever ) is 1997 Tamil romance film directed by R. Sundarrajan. The film stars Vijay and Dimple in the lead roles, while R. Sundarrajan, Jaishankar, Srividya, Karan, Manivannan play other pivotal roles. The music for the film was composed by Deva and the film released on 14 January 1997. Are we justified in saying that \"Kaalamellam Kaathiruppen was finished in 1997\"? Yes, no, or maybe?", "doc_id": 425, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33635, 38931, 20724, 5823], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator. Are we justified in saying that \"Fraser Wishart is a daily radio and television commentator.\"? Yes, no, or maybe? No\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The Arkansas Razorbacks wore red and yellow as their team uniform.\"? Yes, no, or maybe? 
Maybe\n###\nJerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern. Are we justified in saying that \"Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He played college football at Georgia Southern where hes also met his future wife.\"? Yes, no, or maybe? Maybe\n###\nJusticia umbrosa (Brazilian plume, yellow jacobinia; syn. \"Adhatoda umbrosa\" Ness, and \"Justicia aurea\" Schltdl.) is an ornamental shrub native of Cerrado vegetation of Brazil. This plant may be propagated by herbaceous stem cutting, and it can usually get to 1,50 - 2,50 m tall. They flourish in the shade, and will not do well if overwatered. Are we justified in saying that \"Justicia umbrosa cannot grow in shady landscapes\"? Yes, no, or maybe? No\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex. Are we justified in saying that \"Croton lechleri is a dull looking red latex type plant.\"? Yes, no, or maybe?", "doc_id": 34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31312, 40459, 16998, 8446], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Linkou Power Plant () is a coal-fired power plant in Linkou District, New Taipei, Taiwan. With the previous total installed capacity of 600 MW, the power plant used to be the smallest coal-fired power plant in Taiwan. The power plant is currently undergoing retrofitting to increase its installed generation capacity to 2.4 GW. Are we justified in saying that \"The government wanted to close the plant.\"? Yes, no, or maybe? Maybe\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910. Are we justified in saying that \"Yi Bangja and Crown Prince Euimin were born in 1901.\"? Yes, no, or maybe? Yes\n###\nThe Valley of Fire Road (also called the Valley of Fire Highway) is a road in northeastern Clark County, Nevada serving the Valley of Fire State Park. The roadway was previously designated State Route 40 (SR 40), and the segment within the state park is currently designated a Nevada Scenic Byway. Are we justified in saying that \"The Valley of Fire Road is located in idaho\"? Yes, no, or maybe? 
No\n###\nNew Hampshire Route 120 is a 26.928 mi secondary north\u2013south state highway in Sullivan and Grafton counties in the upper Connecticut River Valley region of New Hampshire. Its southern terminus is at New Hampshire Route 11 and New Hampshire Route 103 in Claremont. Its northern terminus is at New Hampshire Route 10 in Hanover. Are we justified in saying that \"New Hampshire Route 120 is a 26.918 mi secondary north\u2013south state highway\"? Yes, no, or maybe? No\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. Gene Mayer won the singles title. Are we justified in saying that \"Gene Mayer doesn't compete in doubles tennis.\"? Yes, no, or maybe?", "doc_id": 350, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31765, 7356, 35414, 13825], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Aster\" (M915) is a Tripartite-class minehunter of the Belgian Naval Component, launched on 16 December 1985 at the Mercantile-Belyard shipyard in Rupelmonde and christened by Queen Paola of Belgium. The patronage of \"Aster\" was accepted by the city of Blankenberge. \"Aster\" was the first of the Belgian Tripartite-class minehunters. Are we justified in saying that \"People did not want it to launch\"? Yes, no, or maybe? Maybe\n###\nOgallala is a city in Keith County, Nebraska, United States. The population was 4,737 at the 2010 census. It is the county seat of Keith County. In the days of the Nebraska Territory, the city was a stop on the Pony Express and later along the transcontinental railroad. The Ogallala Aquifer was named after the city. Are we justified in saying that \"The county seat of Keith County does not refer to a chair.\"? Yes, no, or maybe? Yes\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"Lloyd Cole is a second album \"? Yes, no, or maybe? No\n###\nTony Rena Snell Jr. (born November 10, 1991) is an American professional basketball player for the Milwaukee Bucks of the National Basketball Association (NBA). Snell played college basketball for the New Mexico Lobos before declaring for the NBA draft after his junior year. He was drafted with the 20th overall pick in 2013 NBA draft by the Chicago Bulls. Are we justified in saying that \"Tony Rena Snell Jr. has a R.\"? Yes, no, or maybe? Yes\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. 
As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc. Are we justified in saying that \"Ranila has an A.\"? Yes, no, or maybe?", "doc_id": 446, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39231, 12511, 4687, 28286], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione. Are we justified in saying that \"The Flirts split up in 1990.\"? Yes, no, or maybe? Maybe\n###\n\"Look at Me (When I Rock Wichoo)\" is a song by American indie rock band Black Kids, taken from their debut album \"Partie Traumatic\". It was released in the UK by Almost Gold Recordings on September 8, 2008 and debuted on the Top 200 UK Singles Chart at number 175. Are we justified in saying that \"Look at Me is a song by the Beatles\"? Yes, no, or maybe? No\n###\nBrandon Hughes (born September 25, 1980), better known by his stage name 6 Tre G is an American hip hop recording artist, record producer, and CEO from Fayette, Alabama. He is also the founder and CEO of Mazerati Records. 6 Tre G has released many studio albums Don Mazerati, Boss Muzik, El Trapo and many more. Are we justified in saying that \"Brandon hughes is an american actor\"? Yes, no, or maybe? No\n###\nMarry Him If You Dare (; lit. Mirae's Choice or Future's Choice) is a 2013 South Korean television series starring Yoon Eun-hye, Lee Dong-gun, Jung Yong-hwa, Han Chae-ah, and Choi Myung-gil. It aired on KBS2 from October 14 to December 3, 2013 on Mondays and Tuesdays at 22:00 for 16 episodes. Are we justified in saying that \"Marry Him If You Dare (; lit. Mirae's Choice or Future's Choice) is a 2013 South Korean television series starring Yoon Eun-hye, Lee Dong-gun, Jung Yong-hwa, Han Chae-ah, and Choi Myung-gil had more than 14 episodes.\"? Yes, no, or maybe? Yes\n###\nJaron Long (born August 28, 1991) is an American professional baseball pitcher who is with the Washington Nationals organization. Prior to playing professionally, Long played college baseball for Chandler-Gilbert Community College and Ohio State University. His father, Kevin Long, is the current hitting coach of the New York Mets and former hitting coach of the New York Yankees. Are we justified in saying that \"Jaron Long's father has only worked for NY teams\"? Yes, no, or maybe?", "doc_id": 224, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44258, 25691, 8714, 1415], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Derailed is a 2005 American crime thriller film based on the novel of the same name by James Siegel. The film is directed by Mikael H\u00e5fstr\u00f6m and stars Clive Owen, Jennifer Aniston, Vincent Cassel, Giancarlo Esposito, David Morrissey, RZA and Xzibit. This was also the first film to be released by The Weinstein Company in the United States. The film is set in Chicago. Are we justified in saying that \"The novel is set in Chicago.\"? Yes, no, or maybe? Maybe\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups. Are we justified in saying that \"Corn smut is very expensive to make\"? Yes, no, or maybe? Maybe\n###\nA Lady's Morals is a 1930 American Pre-Code film offering a highly fictionalized account of singer Jenny Lind. The movie features opera diva Grace Moore as Lind, Reginald Denny as a lover, and Wallace Beery as P. T. Barnum; Beery would play Barnum again four years later in \"The Mighty Barnum\". The film contains some fine opera arias by Moore and was directed by Sidney Franklin. Are we justified in saying that \"The story of Lind was fabricated to a large extent in this production.\"? Yes, no, or maybe? Yes\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006. Are we justified in saying that \"\"Live Free or Die\" was the best rated episode of the HBO original series \"The Sopranos\".\"? Yes, no, or maybe? Maybe\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"10 September 2016 has a z.\"? Yes, no, or maybe?", "doc_id": 515, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37710, 32664, 8035, 31652], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament. 
Are we justified in saying that \"The 2011 Atlantic Sun Conference Baseball Tournament is watched mainly by seniors\"? Yes, no, or maybe? Maybe\n###\nAndrea von Habsburg (\"Andrea Maria von Habsburg-Lothringen\") Archduchess of Austria, Hereditary Countess of Neipperg, (born 30 May 1953, in W\u00fcrzburg, Bavaria), is the first child and oldest daughter of Otto von Habsburg and his wife Princess Regina of Saxe-Meiningen. Are we justified in saying that \"Andrea was born in Bavaria.\"? Yes, no, or maybe? Yes\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Stillwater Cove Regional Park is in CA\"? Yes, no, or maybe? Yes\n###\nVivekananda Degree College is the only Degree college in Ichoda Mandal which is established in 2006 and is affiliated to Kakatiya University of Telangana, India. The college has its campus at Ichoda, Adilabad. The college runs degree courses in Computer Science, Arts, Science, Commerce and Management. Are we justified in saying that \"Vivekananda Degree College offers both bachelors and masters degrees.\"? Yes, no, or maybe? Maybe\n###\n\"Cover on My Heart\" is a pop ballad performed by Guy Sebastian and is the third single from his third album \"Closer to the Sun\". Sebastian announced that this song was the album's third single in April 2007. The single was released on 28 July 2007 in Australia, set by his record label Sony BMG Australia. Sebastian performed the song on various programmes such as \"Sunrise\" and \"Rove Live\". Are we justified in saying that \"Guy performed the song many times on tv\"? Yes, no, or maybe?", "doc_id": 854, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12473, 30527, 39547, 19359], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Battle of Rio de Janeiro was a battle in 1558 on the French town at Rio de Janeiro, called Henriville. The Portuguese, though in far smaller numbers, defeated the French and made them flee to the jungle. The French town was then burnt by Mem de S\u00e1, the Portuguese governor. Are we justified in saying that \" the Portuguese governor went on to become a very important federal government official\"? Yes, no, or maybe? Maybe\n###\nArtur Edler von Mecenseffy (23 June 1865, Vienna \u2014 6 October 1917, Asiago) was an Austro-Hungarian Army officer who held the rank of \"Feldmarschall-leutnant\" (\"lieutenant field marshal\") and served during World War I, becoming the highest ranking officer of Austria-Hungary to be killed on the battlefield. Are we justified in saying that \"Artur died in Asiago.\"? Yes, no, or maybe? 
Yes\n###\nThe Boulton Paul Balliol and Sea Balliol were monoplane military advanced trainer aircraft built for the Royal Air Force (RAF) and the Royal Navy Fleet Air Arm (FAA) by Boulton Paul Aircraft. Developed in the late 1940s the Balliol was designed to replace the North American Harvard trainer and used the Rolls-Royce Merlin engine, with the Sea Balliol a naval version for deck landing training. Are we justified in saying that \"The Boulton Paul Balliol and Sea Balliol took a lot of money to maintain\"? Yes, no, or maybe? Maybe\n###\nThe Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships. Are we justified in saying that \"The United States Bowling Congress Open Championships were held in Nevada a little bit more than a decade ago\"? Yes, no, or maybe? Yes\n###\nKenneth \"Ken\" Gibson was a Northern Irish politician who was the Chairman of the Volunteer Political Party (VPP), which he had helped to form in 1974. He also served as a spokesman and Chief of Staff of the loyalist paramilitary organisation, the Ulster Volunteer Force (UVF). Are we justified in saying that \"ken gibson was the founder of vpp in 1974 and was also the president of the loyalist paramilitary organisation uvf\"? Yes, no, or maybe?", "doc_id": 692, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4244, 36231, 13494, 13330], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added. Are we justified in saying that \"Stereolab has released at least three albums.\"? Yes, no, or maybe? Yes\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Tinker Field no longer exists\"? Yes, no, or maybe? Yes\n###\nThe Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title. Are we justified in saying that \"Grand Prix tennis circuit is also now defunct\"? Yes, no, or maybe? 
Maybe\n###\nBe Mine Tonight was the debut single from New Zealand band Th' Dudes. It was released in May 1979 as a Double A-side with Walking In Light and reached No. 36 on the New Zealand music charts. Be Mine Tonight won Single of the Year at the 1979 New Zealand Music Awards. It was voted 27th best New Zealand Song of the 20th Century by APRA members and featured on the Nature's Best CD. Are we justified in saying that \"The B side of the debut single by Th'Dudes was a hit.\"? Yes, no, or maybe? Maybe\n###\nDiaspora studies is an academic field established in the late 20th century to study dispersed ethnic populations, which are often termed diaspora peoples. The usage of the term diaspora carries the connotation of forced resettlement, due to expulsion, coercion, slavery, racism, or war, especially nationalist conflicts. Are we justified in saying that \"Diaspora studies cannot be taught in college.\"? Yes, no, or maybe?", "doc_id": 406, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7548, 15536, 4700, 15518], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\". Are we justified in saying that \"Abbas-Mustan received the Filmfare Best Villain Award\"? Yes, no, or maybe? No\n###\nThis is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists. Are we justified in saying that \"Wikipedia articles exist for most of the list.\"? Yes, no, or maybe? Maybe\n###\nUni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats. Are we justified in saying that \"Jos\u00e9 Mangri\u00f1\u00e1n has a capacity of 3000 seats.\"? Yes, no, or maybe? No\n###\nJango is a crime-comedy series produced in 1961 by Associated Rediffusion for British television. It starred Robert Urquhart in the lead role of Jango Smith, with Moira Redmond as Dee Smith, his wife. The show also featured performances by Peter Sallis and Brian Wilde. Are we justified in saying that \"Jango had a black cast\"? Yes, no, or maybe? Maybe\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. 
The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician. Are we justified in saying that \"The song \"Amor a la Mexicana\" was written by Mexican singer Thalia.\"? Yes, no, or maybe?", "doc_id": 408, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21072, 10503, 11958, 8228], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\". Are we justified in saying that \"James Wyatt is not currently a Methodist minister.\"? Yes, no, or maybe? Yes\n###\nGrantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station. Are we justified in saying that \"Moto is the operator of service areas in England.\"? Yes, no, or maybe? Yes\n###\nDavid Krakauer (born September 22, 1956) is an American clarinetist raised and based in New York, NY. He is known for his work in klezmer music as well as classical music and avant-garde improvisation. He is also considered an accomplished jazz player. Are we justified in saying that \"David Krakauer has an E.\"? Yes, no, or maybe? Yes\n###\nCross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart. Are we justified in saying that \"Korean boy band member Sangmin contributed to the album Timeless: Begins.\"? Yes, no, or maybe? Yes\n###\nSantos \"Sandy\" Alomar Vel\u00e1zquez Jr. (] , ; born June 18, 1966) is a professional baseball catcher, coach, and manager. He played in Major League Baseball catcher for the San Diego Padres, Cleveland Indians, Chicago White Sox, Colorado Rockies, Texas Rangers, Los Angeles Dodgers, and New York Mets between 1988 and 2007. Are we justified in saying that \"Santos \"Sandy\" Alomar Vel\u00e1zquez Jr. never sweated.\"? Yes, no, or maybe?", "doc_id": 696, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34703, 16743, 17458, 17233], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "SeaWorld Ohio was a park in the SeaWorld chain of marine animal theme parks. The park opened in 1970 directly across the lake and less than one mile from Geauga Lake Park in Aurora, Ohio, United States. The small lake separated the two parks. Wildwater Kingdom, a small waterpark built by Cedar Fair in 2005, occupied the property until it closed in 2016. Are we justified in saying that \"Blizzard Beach, a small waterpark built by Cedar Fair in 2005, occupied the property until it closed in 2016.\"? Yes, no, or maybe? No\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer. Are we justified in saying that \"Joseph Maurice Ravel was smart.\"? Yes, no, or maybe? Maybe\n###\nBoneyard Beach is a 1995 album by Raleigh, North Carolina band Dish, led by singer and pianist Dana Kletter, on Interscope Records. The album was produced by John Agnello at Ardent Studios in Memphis, Tennessee. Interscope's VP, Tom Whalley, told \"Billboard\" magazine that \"the high quality of songwriting in Dish and the sound of Dana's voice are two things that set this band apart.\" Are we justified in saying that \"Boneyard Beach was Dish's debut album.\"? Yes, no, or maybe? Maybe\n###\nThe 1985 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1985 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The season opener against Florida State was the last season opening loss until 2015. Are we justified in saying that \"The team wasn't able to participate in the 1985 season.\"? Yes, no, or maybe? No\n###\nSeven Little Monsters is a children's picture book by American author and illustrator Maurice Sendak. \"Seven Little Monsters\" was published by Harper & Row in 1977 and served as the basis for the Canadian-Chinese television production of the same name (2000-2007). Are we justified in saying that \"The television production of Seven Little Monsters started ten years after the book was published.\"? Yes, no, or maybe?", "doc_id": 186, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31827, 30534, 33012, 14122], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Corn crab soup is a dish found in Chinese cuisine, American Chinese cuisine, and Canadian Chinese cuisine. The soup is actually cream of corn soup with egg white and crab meat or imitation crab meat added. It is most likely of southern Chinese origin. 
Are we justified in saying that \"Corn crab soup is definitely of northern Chinese origin.\"? Yes, no, or maybe? No\n###\nSqueezing Out Sparks is the fourth studio album by English musician Graham Parker and his band the Rumour. It was voted album of the year in the 1979 \"Village Voice\" Pazz & Jop Critics Poll and later ranked number 334 on \"Rolling Stone\" magazine's list of the 500 greatest albums of all time. Although the Rumour were not credited on the cover, their name was included on the album label. Are we justified in saying that \"Graham Parker has sold at least one album\"? Yes, no, or maybe? Yes\n###\nWinning America is a documentary television film about the Canadian band Said the Whale. It follows the band on their first US tour down through California, and then to South by Southwest. It premiered on CBC Television on July 23, 2011. The film was directed by Brent Hodge and Thomas Buchan, and was produced by Brent Hodge, Jon Siddall and Sheila Peacock. It was nominated for a Leo Award in 2012. Are we justified in saying that \"The band is called Said the Bird.\"? Yes, no, or maybe? No\n###\n\"Big Jet Plane\" is a song written by Australian singer/songwriter Angus Stone, and originally recorded by Stone, under the name \"Lady of the Sunshine\", on his 2009 album \"Smoking Gun\". It was then re-recorded by Stone and his sister Julia, as the duo Angus & Julia Stone, and released on their 2010 album \"Down the Way\", as well as their 2010 EP \"Big Jet Plane\". Are we justified in saying that \"Down the Way was Stone's first album.\"? Yes, no, or maybe? No\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart. Are we justified in saying that \"Look at My Dab was released as a single after October 31st, 2015.\"? Yes, no, or maybe?", "doc_id": 266, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31442, 26531, 1283, 5169], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Big Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism. Are we justified in saying that \"Big Sky is considered a town and has a town government.\"? Yes, no, or maybe? No\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Nick Harper has an album known as Smithereens.\"? Yes, no, or maybe? 
Yes\n###\nAmericana Deluxe is the second studio album by Big Bad Voodoo Daddy. This album is also sometimes called \"Big Bad Voodoo Daddy\", as the album cover prominently displays a stylized \"Big Bad Voodoo Daddy\" logo and does not feature the phrase \"Americana Deluxe\" on it. However, the liner notes and the band's website clearly show that the true title is indeed \"Americana Deluxe\". Are we justified in saying that \"Big Bad Voodoo Daddy has 3 members.\"? Yes, no, or maybe? Maybe\n###\nWayne Coles-Janess is an Australian producer, writer and director of drama and documentary film and TV programs. Based in Melbourne, Australia, he has produced documentaries about frontier places in the country. He has also made some documentaries in several international locations, including during times of war. Are we justified in saying that \"He will retire at the age of 50.\"? Yes, no, or maybe? Maybe\n###\nMurder Rock (Italian: Murderock - uccide a passo di danza; also known as Murder-Rock: Dancing Death, Slashdance and The Demon Is Loose!) is a 1984 Italian giallo film starring Olga Karlatos and Ray Lovelock, and written and directed by Lucio Fulci. Fulci recalled the producer forced him to turn the film into a musical with the music of Keith Emerson due to the success of \"Flashdance\". Are we justified in saying that \"Flashdance was a dance studio.\"? Yes, no, or maybe?", "doc_id": 711, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32479, 13965, 19984, 15880], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Real Madrid Club de F\u00fatbol C, commonly known as Real Madrid C, was a Spanish association football team that played in the Tercera Divisi\u00f3n \u2013 Group 7. It was Real Madrid's second reserve team. They played their home games at La Ciudad del Real Madrid in Valdebebas outside the city of Madrid. At the end of the 2014\u201315 Tercera Division, Real Madrid C was disbanded. Are we justified in saying that \"Most players in Real Madrid C were Spanish.\"? Yes, no, or maybe? Maybe\n###\nBe Mine Tonight was the debut single from New Zealand band Th' Dudes. It was released in May 1979 as a Double A-side with Walking In Light and reached No. 36 on the New Zealand music charts. Be Mine Tonight won Single of the Year at the 1979 New Zealand Music Awards. It was voted 27th best New Zealand Song of the 20th Century by APRA members and featured on the Nature's Best CD. Are we justified in saying that \"Be Mine Tonight was successful in Australia\"? Yes, no, or maybe? Maybe\n###\nJames Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\". Are we justified in saying that \"James Wyatt won a literary award for \"Dungeon Master's Guide\".\"? Yes, no, or maybe? 
Maybe\n###\nElmhurst is a residential neighborhood in the southernmost part of Oakland, California. Originally a separate town, it was annexed by Oakland in 1909, and today is considered part of East Oakland. It lies at an elevation of 39 feet (12 m). It contains the Eastmont Town Center. Are we justified in saying that \" Eastmont Town Center is the best one in Oakland.\"? Yes, no, or maybe? Maybe\n###\nThe Big Cube is a 1969 American thriller film directed by Tito Davison and starring Lana Turner, Karin Mossberg, George Chakiris, Daniel O'Herlihy and Richard Egan; it was one of Lana Turner's last movies. It is notable for its aggressive portrayal of LSD use and the 1960s youth counterculture as vicious evils. Are we justified in saying that \"The Big Cube had a lot of drugs\"? Yes, no, or maybe?", "doc_id": 477, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6051, 39625, 45458, 6652], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Spring Fine Art Exhibition of Leningrad artists (Leningrad, 1954) (Russian: \"\"\u0412\u0435\u0441\u0435\u043d\u043d\u044f\u044f \u0432\u044b\u0441\u0442\u0430\u0432\u043a\u0430 \u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0435\u043d\u0438\u0439 \u043b\u0435\u043d\u0438\u043d\u0433\u0440\u0430\u0434\u0441\u043a\u0438\u0445 \u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a\u043e\u0432 1954 \u0433\u043e\u0434\u0430\"\" ) become one of the largest Soviet Art Exhibition of 1954 and one of the first Art Exhibitions after Stalin death. The Exhibition took place in Leningrad Union of Soviet Artists Exhibition Halls on Bolshaya Morskaya st. 38. Are we justified in saying that \"Leningrad artists displayed art inside one of the biggest exhibitions in Russia after the death of Stalin.\"? Yes, no, or maybe? Yes\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"Foals have been covered by rush.\"? Yes, no, or maybe? Maybe\n###\nHolding Back the River is the third studio album by Wet Wet Wet. It was released on 30 October 1989. The album is actually a follow-up to their debut, \"Popped In Souled Out\". Its offspring singles were \"Sweet Surrender\", \"Broke Away\", \"Hold Back the River\" and \"Stay With Me Heartache (Can't Stand the Night)\". The album reached #2 in the charts. Are we justified in saying that \"There were three albums by Wet Wet Wet before Holding Back the River.\"? Yes, no, or maybe? No\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. 
In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances. Are we justified in saying that \"Spaceballs is both funny and dramatic at the same time.\"? Yes, no, or maybe? Maybe\n###\nThe Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title. Are we justified in saying that \"The Palm Harbor Open was played over 10 hours ago.\"? Yes, no, or maybe?", "doc_id": 163, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33671, 29184, 3843, 14809], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cambarus cryptodytes, the Dougherty Plain cave crayfish or Apalachicola cave crayfish, is a small, freshwater crayfish endemic to Florida and Georgia in the United States. It is an underground species known only from waters associated with the Floridan aquifer. Are we justified in saying that \"Apalachicola cave crayfish lives in saltwater\"? Yes, no, or maybe? No\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant. Are we justified in saying that \"Neelix is a good person.\"? Yes, no, or maybe? Maybe\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird. Are we justified in saying that \"The North African ostrich is known for burying it's head in the sand. \"? Yes, no, or maybe? Maybe\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer. Are we justified in saying that \"Albrect von Houwald was married to Helene Grafin von Carmer.\"? Yes, no, or maybe? Maybe\n###\nHideki Kamiya (\u795e\u8c37 \u82f1\u6a39 , Kamiya Hideki , born December 19, 1970) is a video game designer and director working for PlatinumGames. Kamiya was formerly employed by Capcom and Clover Studio, and founded PlatinumGames in 2006, along with other former Capcom staff. Are we justified in saying that \"Hideki Kamiya worked as a video game designer and director for Capcom and Clover Studio.\"? 
Yes, no, or maybe?", "doc_id": 942, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21846, 9751, 2638, 11832], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Margarita la tornera (Margarita the Gatekeeper) is an opera in three acts composed by Ruperto Chap\u00ed to a libretto by Carlos Fern\u00e1ndez Shaw, based on a dramatic poem by Jos\u00e9 Zorrilla. It premiered on February 24, 1909 at the Teatro Real in Madrid in a performance conducted by the composer. An acclaimed recording of the opera came out in 1999 with Pl\u00e1cido Domingo and Elisabete Matos. Are we justified in saying that \"Margarita la tornera is translated to Margarita the Gatekeeper in English.\"? Yes, no, or maybe? Yes\n###\nHyde, Jekyll, Me () is a 2015 South Korean television series starring Hyun Bin and Han Ji-min. It is based on Lee Choong-ho's webtoon \"Dr. Jekyll Is Mr. Hyde\" (), which gave a romantic comedy spin on the literary character. The series aired on SBS from January 21 to March 26, 2015 on Wednesdays and Thursdays at 21:55 for 20 episodes. Are we justified in saying that \"Hyde, Jekyll, Me is based on the Dr. Jekyll and Mr. Hyde character\"? Yes, no, or maybe? Yes\n###\nThe Nigeria U-20 men's national soccer team, also known as the Nigeria Under-20s or nicknamed the \"Flying Eagles\", is the youth team for national soccer in Nigeria. It plays a large role in the development of Nigerian soccer, and is considered to be the feeder team for the Nigeria men's national soccer team and is controlled by the Nigeria Football Federation. Are we justified in saying that \"There are no star players on the U-20 team currently. \"? Yes, no, or maybe? Maybe\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley. Are we justified in saying that \"Colorz of Rage hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley had blossoming careers independent of the urban drama film.\"? Yes, no, or maybe? Maybe\n###\nHeck's Department Store, a chain of West Virginia based discount department stores, was founded by Boone County natives and businessmen Fred Haddad, Tom Ellis, and Lester Ellis and wholesale distributor Douglas Cook. The Heck's name was a combination of the names Haddad, Ellis and Cook. Haddad served as President, Lester Ellis was Vice-President, and Tom Ellis was Secretary-Treasurer. Are we justified in saying that \"Heck's Department Store first name ends with s.\"? Yes, no, or maybe?", "doc_id": 775, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13108, 26038, 14095, 18751], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Am\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics. Are we justified in saying that \"Am\u00e9lie Simone Mauresmo thought her opponents were easy during the 2004 Summer Olympics \"? Yes, no, or maybe? Maybe\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene. Are we justified in saying that \"The Foo Fighters were popular worldwide\"? Yes, no, or maybe? Maybe\n###\nUna Lillian Paisley (born 18 November 1922 in Kew in Melbourne in Victoria - died 1977 in Kew, Victoria) was an Australian cricket player. She played twelve Test matches for the Australia national women's cricket team. She captained the Australia national women's cricket team in four Test matches against New Zealand and England. Are we justified in saying that \"Una Lillian Paisley won a gold medal.\"? Yes, no, or maybe? Maybe\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"Nadeem F. Paracha is a talented author in Pakistan.\"? Yes, no, or maybe? Maybe\n###\nGiovanni Visconti \u2014 according to Lorenzo Cardella nephew of Pope Gregory X. He was ostensibly created cardinal-bishop of Sabina by his uncle in 1275 and in 1276 was named judge in the case concerning the translation of bishop Giovanni of Potenza to the archbishopric of Monreale, postulated by the cathedral chapter of Monreale. He died in 1277 or 1278. Are we justified in saying that \"Giovanni Visconti died in both 1277 and 1278.\"? Yes, no, or maybe?", "doc_id": 91, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11460, 27200, 8810, 24136], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012. Are we justified in saying that \"Kaley Cuoco is an actress\"? Yes, no, or maybe? 
Maybe\n###\nThe Grove is a business district located between Vandeventer and Kingshighway in the Forest Park East neighborhood of St. Louis, Missouri. It is near Barnes-Jewish Hospital, Washington University School of Medicine, Saint Louis University Hospital, Saint Louis University School of Medicine, Forest Park, and Tower Grove Park. Are we justified in saying that \"The Grove is a business district in St. Louis, Missouri that has many hospitals and medical schools, to mention a few: Barnes-Jewish Hospital, Washington University School of Medicine, Saint Louis University Hospital, Saint Louis University School of Medicine, Forest Park, and Tower Grove Park and many others.\"? Yes, no, or maybe? Maybe\n###\nTexas Monthly v. Bullock 489 U.S. 1 (1989) was a case brought before the US Supreme Court in November 1988. The case (initiated by the publishers of \"Texas Monthly\", a well-known general-interest magazine in Texas) was to test the legality of a Texas statute that exempted religious publications from paying state sales tax. Are we justified in saying that \"Texas Monthly was successful in its legal action.\"? Yes, no, or maybe? Maybe\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia. Are we justified in saying that \"Wallace Michael Ross founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia while teaching at Sturgess School in Derby.\"? Yes, no, or maybe? Maybe\n###\nMelinda Jacobs (born August 2, 1967) is an American born entertainment reporter, currently providing content to Secrets of the City and [1]. Over the span of her 20-year career in television/radio Jacobs has interviewed celebrities such as Quincy Jones, Kathie Lee Gifford, Joan Rivers, Demi Lovato and Adrian Peterson. Are we justified in saying that \"They weren't successful.\"? Yes, no, or maybe?", "doc_id": 887, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8269, 29922, 38244, 613], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A Lady's Morals is a 1930 American Pre-Code film offering a highly fictionalized account of singer Jenny Lind. The movie features opera diva Grace Moore as Lind, Reginald Denny as a lover, and Wallace Beery as P. T. Barnum; Beery would play Barnum again four years later in \"The Mighty Barnum\". The film contains some fine opera arias by Moore and was directed by Sidney Franklin. Are we justified in saying that \"A Lady's Morals was made in the 1930s\"? Yes, no, or maybe? Yes\n###\nThe Local Government (Northern Ireland) Act 1972 (1972 c. 
9) was an Act of the Parliament of Northern Ireland that constituted district councils to administer the twenty-six local government districts created by the Local Government (Boundaries) Act (Northern Ireland) 1971, and abolished the existing local authorities in Northern Ireland. Are we justified in saying that \"Northern Ireland has several localized government districs\"? Yes, no, or maybe? Yes\n###\nCraig Lahiff (April 23, 1947 \u2013 2 February 2014) was an Australian film director. He grew up in the Adelaide suburb of Somerton Park and studied science at Adelaide University, then trained as a systems consultant before studying arts in film at Flinders University. He began working in the film industry on crews for movies such as \"Sunday Too Far Away\" and \"The Fourth Wish\". Are we justified in saying that \"He had a different career before becoming a director.\"? Yes, no, or maybe? Yes\n###\nRonald Mayorga S\u00e1nchez (born June 26, 1984, Yumbo, Valle del Cauca, Colombia) is an award-winning Colombian journalist and TV anchor of \"Red\" in Caracol Television in Colombia. As a radio journalist who works with \"Blue Radio\" one of the radio station's imported from Latin America as a host in \"Vox Populi\". Are we justified in saying that \"Ronald Mayorga S\u00e1nchez has never been a tv anchor\"? Yes, no, or maybe? No\n###\nHarry Brand (October 20, 1895 \u2013 February 22, 1989) was an American press agent. Described as \"the mastermind who made Shirley Temple the most famous child star in history, Betty Grable a GI Joe pinup girl and Marilyn Monroe a sex goddess,\" Brand was the head of publicity at 20th Century Fox from 1935 until 1962. Are we justified in saying that \"Brand was the head of publicity at Fox for half a decade\"? Yes, no, or maybe?", "doc_id": 478, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17761, 36576, 34755, 14056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Asana ( ) is a web and mobile application designed to help teams track their work. It was founded in 2008 by Facebook co-founder Dustin Moskovitz and ex-engineer Justin Rosenstein, who both worked on improving the productivity of employees at Facebook. Are we justified in saying that \"Asana is a mobile only application.\"? Yes, no, or maybe? No\n###\nDangerously Excited (; lit. \"I'm a Civil Servant\") is a 2012 South Korean comedy-drama film starring Yoon Je-moon as a stuffy municipal bureaucrat who learns to embrace life when a budding rock band moves into his basement. The film premiered at the 2011 Busan International Film Festival and also screened at the 2012 Udine Far East Film Festival. Are we justified in saying that \"The film was shown before 2012.\"? Yes, no, or maybe? Yes\n###\nMakri (Greek: \u039c\u03ac\u03ba\u03c1\u03b7) is a village and a municipal district of the city of Alexandroupoli, Evros regional unit, Greece. In 2011 its population was 924 for the village, and 1,919 for the municipal district. It is situated on the Aegean Sea coast, 12\u00a0km west of downtown Alexandroupoli. 
Makri has an exit on the Egnatia Odos motorway, that passes north of the village. Are we justified in saying that \"The population of Makri was 1919 in the viilliage\"? Yes, no, or maybe? No\n###\nChicken Shack are a British blues band, founded in the mid-1960s by Stan Webb (guitar and vocals), Andy Silvester (bass guitar), and Alan Morley (drums), who were later joined by Christine Perfect (McVie) (vocals and keyboards) in 1968. Chicken Shack has performed with various line-ups, Stan Webb being the only constant member. Are we justified in saying that \"Chicken Shack is only a game.\"? Yes, no, or maybe? No\n###\nGalli Galli Sim Sim (Devanagari: \u0917\u0932\u0940 \u0917\u0932\u0940 \u0938\u093f\u092e \u0938\u093f\u092e) is the Hindi language adaptation of the American children's television series \"Sesame Street\" (famous for its Muppets), for India. It is co-produced by Sesame Workshop and Turner Entertainment, through Miditech. The show's Indian production company is known as Sesame Workshop India. Are we justified in saying that \"Turner Entertainment did not want to work on this project.\"? Yes, no, or maybe?", "doc_id": 877, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38435, 1170, 1809, 45130], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lucas Franchoys the Younger or Lucas Franchoys II (alternative spellings of name: Lucas Franchois, Lucas Fran\u00e7ois, Louis Franchoys) (28 June 1616 in Mechelen \u2013 3 April 1681 in Mechelen) was a Flemish Baroque painter from Mechelen, who painted numerous altarpieces and portraits in a style reminiscent of Anthony van Dyck. Are we justified in saying that \"Lucas Franchoys brothers often helped him .\"? Yes, no, or maybe? Maybe\n###\n\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"Bosch is a film.\"? Yes, no, or maybe? No\n###\nFast-moving consumer goods (FMCG) or consumer packaged goods (CPG) are products that are sold quickly and at relatively low cost. Examples include non-durable goods such as packaged foods, beverages, toiletries, over-the-counter drugs and many other consumables. In contrast, durable goods or major appliances such as kitchen appliances are generally replaced over a period of several years. Are we justified in saying that \"FMCG are sold slowly\"? Yes, no, or maybe? No\n###\nCognizant is an American multinational corporation that provides IT services, including digital, technology, consulting, and operations services. It is headquartered in Teaneck, New Jersey, United States. Cognizant is listed in the NASDAQ-100 and the S&P 500 indices. It was founded as an in-house technology unit of Dun & Bradstreet in 1994, and started serving external clients in 1996. 
Are we justified in saying that \"Dun & Bradstreet is the IT service division of the Cognizant corporation. \"? Yes, no, or maybe? No\n###\nBurton & Taylor is a BBC Four TV film directed by Richard Laxton, and based on the legendary acting duo and former husband and wife, Richard Burton and Elizabeth Taylor, during their preparation for a 1983 theatrical production of the play, \"Private Lives\". The film stars Helena Bonham Carter and Dominic West in the title roles. Are we justified in saying that \"Private Lies came out the year after 1981.\"? Yes, no, or maybe?", "doc_id": 413, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21975, 26881, 21817, 41358], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It is a concept album with a storyline telling the tale of a young couple who go to watch a puppet show in Budapest in the 1700s, and end up being turned into undead puppets by the Puppet Master and his wife. Are we justified in saying that \"The Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It was a great piece of work.\"? Yes, no, or maybe? Maybe\n###\nBeilin District () is one of nine districts of Xi'an, the capital of Shanxi province, China. The well-known Small Wild Goose Pagoda is also located in the district. The smallest, but most densely populated, of Xi'an's county-level divisions, it borders the districts of Xincheng to the northeast, Yanta to the south, and Lianhu to the northwest. Are we justified in saying that \"Beilin District is well traveled\"? Yes, no, or maybe? Maybe\n###\nJuan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974. Are we justified in saying that \"Per\u00f3n would have been elected a third time if not for the coup.\"? Yes, no, or maybe? Maybe\n###\nTakeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and EMI Music Publishing. It was founded by English rapper Kwasi Danquah III (commonly known as Tinchy Stryder) and EMI Music Publishing\u2019s UK president and EMI European creative president, Guy Moot, as a publishing arm solely for Danquah's music in December 2008. Are we justified in saying that \"Takeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and another company.\"? Yes, no, or maybe? Yes\n###\nThe 2009 British Speedway Championship was the 2009 edition of the British Speedway Championship. 
The Final took place on 20 May at Wimborne Road in Poole, England. The Championship was won by Chris Harris, who beat Edward Kennett, Tai Woffinden and Lee Richardson in the final heat. It was the second time Harris had won the title. Are we justified in saying that \"The 2009 British Speedway Championship took place more than 1001 days ago.\"? Yes, no, or maybe?", "doc_id": 834, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34031, 40301, 24741, 36421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "David Gregory \"Dave\" Smith (born 24 July 1955) is a retired male race walker from Australia, who represented his native country at two consecutive Summer Olympics, starting in 1980 (Moscow). His best Olympic result was finishing in tenth place in the men's 20\u00a0km race at the 1984 Summer Olympics. Are we justified in saying that \"Dave Smith is from Russia\"? Yes, no, or maybe? No\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944. Are we justified in saying that \"Tippet was a prolific recording artist before meeting Holst.\"? Yes, no, or maybe? Maybe\n###\nCougar Town is an American television sitcom that ran for 102 episodes over six seasons, from September 23, 2009 until March 31, 2015. The first three seasons aired on ABC, with the series moving to TBS for the final three seasons. The pilot episode was broadcast after \"Modern Family\". ABC officially gave the series a full season pickup on October 8, 2009. Are we justified in saying that \"Cougar Town was forced to move to TBS\"? Yes, no, or maybe? Maybe\n###\nSomething Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date. Are we justified in saying that \"Hemorrhage (In My Hands)\" which reached #322 on the \"Billboard\" Hot 100 charts.\"? Yes, no, or maybe? No\n###\nDatong () is a prefecture-level city in northern Shanxi province, People's Republic of China, located in a basin at an elevation of 1040 m and bordering Inner Mongolia to the north and west and Hebei to the east. It had a population of 3,318,057 at the 2010 census of whom 1,629,035 lived in the built up area made of 3 out of 4 urban districts, namely Chengqu, Kuangqu and Nanjiao District. Are we justified in saying that \"At the 2010 census more than 2 million people lived in the built up area made of 3 out of 4 urban districts.\"? 
Yes, no, or maybe?", "doc_id": 450, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24604, 32715, 43858, 6884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "No. 27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville. Are we justified in saying that \"No. 59 Squadron RAAF is also located at RAAF Base Townsville\"? Yes, no, or maybe? Maybe\n###\nThe R-33 (Russian: \u0412\u044b\u043c\u043f\u0435\u043b \u0420-33 , NATO reporting name: AA-9 Amos) is a long-range air-to-air missile developed by the Vympel. It is the primary armament of the MiG-31 interceptor, intended to attack large high-speed targets such as the SR-71 Blackbird, the B-1 Lancer bomber, and the B-52 Stratofortress. Are we justified in saying that \"It does not have a name that is called a reporting name\"? Yes, no, or maybe? No\n###\nLive at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. It has only been made available at live Van Morrison concerts and at the Van Morrison Official website. Are we justified in saying that \"Van Morrison sells the album at his concerts.\"? Yes, no, or maybe? Yes\n###\nPhacelia pedicellata is a species of flowering plant in the borage family, Boraginaceae. Its common names include specter phacelia and pedicellate phacelia. It is native to the southwestern United States and Baja California, where it can be found in several types of habitat, including creosote bush scrub and Joshua tree woodland. Are we justified in saying that \"Phacelia pedicellata is not native to the United States\"? Yes, no, or maybe? No\n###\nGrantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station. Are we justified in saying that \"Grantham North Services has 3 parks\"? Yes, no, or maybe?", "doc_id": 348, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34367, 25594, 34344, 22080], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gloria Marie Steinem (born March 25, 1934) is an American feminist, journalist, and social political activist, who became nationally recognized as a leader and a spokeswoman for the American feminist movement in the late 1960s and early 1970s. She is listed in Who's Who in America. Are we justified in saying that \"Gloria Marie Steinem was a lesbian\"? Yes, no, or maybe? Maybe\n###\nMichael Cunningham (born November 6, 1952) is a U.S. novelist and screenwriter. He is best known for his 1998 novel \"The Hours\", which won the Pulitzer Prize for Fiction and the PEN/Faulkner Award in 1999. Cunningham is a senior lecturer of creative writing at Yale University. Are we justified in saying that \"Michael Cunningham is over 50 years old today\"? Yes, no, or maybe? Yes\n###\nThe 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The 315th Group controls all operational McDonnell Douglas C-17 Globemaster III flying squadrons of the 315th Airlift Wing. It was activated in 1992, when Air Force Reserve Command implemented the Objective Wing organization. Are we justified in saying that \"315th Airlift Wing consists of Air force reserves and active-duty military personal\"? Yes, no, or maybe? Maybe\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008. Are we justified in saying that \"Laura Elena Z\u00fa\u00f1iga Huizar is smart.\"? Yes, no, or maybe? Maybe\n###\nThe 1972 Grantland Rice Bowl was an NCAA College Division game following the 1972 season, between the Louisiana Tech Bulldogs and the Tennessee Tech Golden Eagles. Louisiana Tech quarterback Denny Duron was named outstanding offensive player, while his teammate linebacker Joe McNeely was named outstanding defensive player. Are we justified in saying that \"Louisiana Tech had both outstanding offensive player and outstanding defensive player, Joe McNeeley and Denny Duron, repectively.\"? Yes, no, or maybe?", "doc_id": 136, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8585, 34382, 13933, 9216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\". 
Are we justified in saying that \"Daraar was critically well-received.\"? Yes, no, or maybe? Maybe\n###\nGloria Marie Steinem (born March 25, 1934) is an American feminist, journalist, and social political activist, who became nationally recognized as a leader and a spokeswoman for the American feminist movement in the late 1960s and early 1970s. She is listed in Who's Who in America. Are we justified in saying that \"Gloria Marie Steinem has 3 children\"? Yes, no, or maybe? Maybe\n###\nThe Newtown Pippin, also known as Albemarle Pippin, is an American apple originated in the late 17th or early 18th century and still cultivated on a small scale. At one time there were two very similar apple cultivars known as the 'Yellow Newtown' ('Albermarle Pippin') and 'Green Newtown' ('Brooke Pippin'), one of which perhaps originated as a sport of the other. Are we justified in saying that \"Yellow and Green Pippins are still produced on a small scale though they go by different names.\"? Yes, no, or maybe? Yes\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals. Are we justified in saying that \"Di Natale had 70 goals.\"? Yes, no, or maybe? No\n###\nBouck's Island is a farm near Breakabeen, New York within the town of Fulton, Schoharie County, New York near Fultonham, New York. Bouck's Island was the home of former New York governor William C. Bouck. Congressman Joseph Bouck was born on Bouck's Island and Wisconsin Congressman Gabriel Bouck once lived there. Are we justified in saying that \"Congressman Joseph Bouck was raised on Bouck's Island.\"? Yes, no, or maybe?", "doc_id": 128, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39233, 26645, 35527, 25207], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season. Are we justified in saying that \"The MVP get sponsorships\"? Yes, no, or maybe? Maybe\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo. Are we justified in saying that \"Celly Cel is a very good\"? Yes, no, or maybe? 
Maybe\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer. Are we justified in saying that \"He had more success when he was back in the group\"? Yes, no, or maybe? Maybe\n###\nThe R-33 (Russian: \u0412\u044b\u043c\u043f\u0435\u043b \u0420-33 , NATO reporting name: AA-9 Amos) is a long-range air-to-air missile developed by the Vympel. It is the primary armament of the MiG-31 interceptor, intended to attack large high-speed targets such as the SR-71 Blackbird, the B-1 Lancer bomber, and the B-52 Stratofortress. Are we justified in saying that \"The MiG-31 interceptor can attack large high-speed targets such as a Boeing 747.\"? Yes, no, or maybe? Maybe\n###\nGeorge Montgomery (born April 26, 1962) is a retired American basketball player. He played basketball at Illinois, and was a second-round draft selection of the Portland Trail Blazers in the 1985 NBA Draft, though he never played in the NBA. He is the biological father of Warriors center JaVale McGee, but did not raise his son. Are we justified in saying that \"George Montgomery can dribble a basketball\"? Yes, no, or maybe?", "doc_id": 626, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16315, 39231, 44248, 34568], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films. Are we justified in saying that \"Jonathan Michael Lovitz met Trump.\"? Yes, no, or maybe? Maybe\n###\n\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione. Are we justified in saying that \"The Flirts split up in 1990.\"? Yes, no, or maybe? Maybe\n###\nRobert Cary Blanchard (November 5, 1968 \u2013 September 6, 2016) was an American football placekicker in the National Football League. He played eight years for five teams: the New York Jets for his first two years, the Indianapolis Colts after taking 1994 off, the Washington Redskins in 1998, the New York Giants in 1999, and the Arizona Cardinals in his final season. Are we justified in saying that \"Robert Blanchard's NFL career began in 1991\"? Yes, no, or maybe? Maybe\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. 
Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"In 2015, the City of Orlando tore down the grandstands at Tinker Field to build a new stadium.\"? Yes, no, or maybe? Maybe\n###\nNabokov's Congeries was a collection of work by Vladimir Nabokov published in 1968 and reprinted in 1971 as \"The Portable Nabokov\". Because Nabokov supervised its production less than a decade before he died, it is useful in attempting to identify which works Nabokov considered to be his best, especially among his short stories. Are we justified in saying that \"\"The Portable Nabokov\" was supervised by Nabokov 7 years before he died.\"? Yes, no, or maybe?", "doc_id": 714, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27179, 18359, 1114, 30236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Liberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD). Are we justified in saying that \"Areilza had left Democratic Center Union.\"? Yes, no, or maybe? Yes\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia. Are we justified in saying that \"Wallace Michael Ross died peacefully.\"? Yes, no, or maybe? Maybe\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points. Are we justified in saying that \"The Cornhuskers played 8 games in 1994\"? Yes, no, or maybe? Maybe\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"The San Nicolao Tunnel took more than 10 years to construct\"? Yes, no, or maybe? 
Maybe\n###\n\"Legion\" is an American cable television series created for FX by Noah Hawley, based on the Marvel Comics character David Haller / Legion. It is connected to the \"X-Men\" film series, the first television series to do so. The first season, consisting of eight episodes, began airing on February 8, 2017. A second season was ordered in March 2017. Are we justified in saying that \"legion is an american series connected to x men and created for dx by noah hawley with 8 episode for season 1\"? Yes, no, or maybe?", "doc_id": 745, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3366, 9124, 5159, 25817], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pisnia zavzhdy z namy (Ukrainian: \u041f\u0456\u0441\u043d\u044f \u0437\u0430\u0432\u0436\u0434\u0438 \u0437 \u043d\u0430\u043c\u0438 ) is a 1975 Soviet Ukrainian musical film, produced by Viktor Storozhenko starring Sofia Rotaru in the main role, as well as Soviet Ukrainian Smerichka vocal-instrumental band. The movie features songs in Ukrainian, Moldovan and Russian of Sofia Rotaru filmed in the background of Ukrainian Carpathian mountains. Are we justified in saying that \"Pisnia zavzhdy z namy (Ukrainian: \u041f\u0456\u0441\u043d\u044f \u0437\u0430\u0432\u0436\u0434\u0438 \u0437 \u043d\u0430\u043c\u0438 ) was filmed in 1975\"? Yes, no, or maybe? Yes\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"Home Depot also sells plants.\"? Yes, no, or maybe? Maybe\n###\nThe 18th Street Expressway (often shortened to 18th Street in everyday speech) is a freeway in Kansas City, Kansas that runs from Interstate 35 north to Interstate 70/U.S. Route 24/U.S. Route 40. It carries the U.S. Route 69 designation its entire length. Are we justified in saying that \"The 18th Street Expressway is one of the longest freeways in Kansas City.\"? Yes, no, or maybe? Maybe\n###\nThomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330. Are we justified in saying that \"Thomas Cooper was the best England international footballer. \"? Yes, no, or maybe? Maybe\n###\nEdward Gibbon FRS ( ; 8 May 173716 January 1794) was an English historian, writer and Member of Parliament. His most important work, \"The History of the Decline and Fall of the Roman Empire\", was published in six volumes between 1776 and 1788 and is known for the quality and irony of its prose, its use of primary sources, and its open criticism of organised religion. 
Are we justified in saying that \"\"The History of the Decline and Fall of the Roman Empire\" had six volumes published in four years.\"? Yes, no, or maybe?", "doc_id": 375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28631, 29826, 28521, 29347], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro. Are we justified in saying that \"Massaro's pupil Martoriello became a good painter.\"? Yes, no, or maybe? Maybe\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class. Are we justified in saying that \"Youth in Guatemala are redheads.\"? Yes, no, or maybe? Maybe\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\". Are we justified in saying that \"Idichapuli worked in multiple roles on films.\"? Yes, no, or maybe? Yes\n###\nWilliam Irving Turner (1890\u20131950), commonly known as Tim Turner or W.I. Turner, was an American architect. He served as a U.S. Forest Service architect and is credited with much of the design of Timberline Lodge on Mount Hood in Oregon, an important and influential work. Are we justified in saying that \"William Irving Turner talked to Amy.\"? Yes, no, or maybe? Maybe\n###\nLouis Marx (August 11, 1896 \u2013 February 5, 1982) was an American toy maker and businessman whose company, Louis Marx and Company, was the largest toy company in the world in the 1950s. Described by many as an experienced businessman with the mind of child; Louis Marx\u2019s ability to see into the minds of children around the world guided his toy creations and advertising efforts. Are we justified in saying that \"Louis Marx lived through the Great Depression.\"? Yes, no, or maybe?", "doc_id": 637, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26214, 39597, 44507, 31268], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster. Are we justified in saying that \"Mithun Chakraborty really likes being the Grandmaster\"? Yes, no, or maybe? Maybe\n###\nThe ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season. Are we justified in saying that \"The Adriatic ABA League started in 1998.\"? Yes, no, or maybe? Maybe\n###\nPrincess Amalie \"Auguste\" of Anhalt-Dessau (German: \"Prinzessin Amalie Auguste von Anhalt-Dessau\" ; 18 August 1793 \u2013 12 June 1854) was a German princess of Anhalt-Dessau who was Princess consort of Schwarzburg-Rudolstadt from 1816 to 1854 as the wife of Friedrich G\u00fcnther, Prince of Schwarzburg-Rudolstadt. Are we justified in saying that \"Gunther was born in 1793.\"? Yes, no, or maybe? Maybe\n###\nJames Bongani Kamte (born 20 July 1982), nicknamed \"Cobra\", is a South African professional golfer. He has played on the Sunshine Tour, Challenge Tour, European Tour, and Asian Tour. He earned his tour card for the 2008 European Tour season by finishing in the top 30 of the qualifying school. Are we justified in saying that \"He started golfing at age 15\"? Yes, no, or maybe? Maybe\n###\nAmargosa is an unincorporated community and census-designated place in Jim Wells County, Texas, United States. Its population was 291 as of the 2010 census. Prior to 2010, the community was grouped with nearby Owl Ranch as part of the Owl Ranch-Amargosa census-designated place. The community is named for the Amargosa Creek that runs nearby. The word \"amargosa\" means \"bitter\" in Spanish. Are we justified in saying that \"Amargosa is a place in Jum Wells County, Texas that has a large population. \"? Yes, no, or maybe?", "doc_id": 739, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10008, 23932, 43160, 16598], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Identification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\". 
Are we justified in saying that \"The first feature film directed by Skolimowski was called Identification Marks: None (Polish: Rysopsis) in 1964.\"? Yes, no, or maybe? Yes\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005. Are we justified in saying that \"Samuel Eto'o Fils was born in the second month of the year.\"? Yes, no, or maybe? No\n###\nThe 2016 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the second edition of the tournament which was part of the 2016 ATP Challenger Tour. It took place in Happy Valley, Australia between 2\u201310 January 2016. Are we justified in saying that \"Happy Valley is the host city to the ATP Tour.\"? Yes, no, or maybe? Yes\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978. Are we justified in saying that \"Gary Sutherland will be inducted into the MLB Hall of Fame\"? Yes, no, or maybe? Maybe\n###\nKirill Olegovich Starkov (Russian:\u041a\u0438\u0440\u0438\u043b\u043b \u041e\u043b\u0435\u0433\u043e\u0432\u0438\u0447 \u0421\u0442\u0430\u0440\u043a\u043e\u0432, born March 31, 1987), is a professional Danish ice hockey player. He is playing for HC Red Ice in the Swiss National League B. He has previously played for CSKA Moscow, Syracuse Crunch, Youngstown Steelhounds, Red Deer Rebels, Fr\u00f6lunda HC, Timr\u00e5 IK, Esbjerg IK and IK Oskarshamn. Are we justified in saying that \"kirill olegovich is from russia \"? Yes, no, or maybe?", "doc_id": 536, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34818, 38786, 18858, 34329], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Walking on Sunshine is a 2014 British romantic musical comedy-drama film directed by Max Giwa and Diana Pasquini. The film features covers of songs from the 1980s and was released on 27 June 2014. It is also a debut role for singer-songwriter Leona Lewis. Are we justified in saying that \"Walking on Sunshine was Leona Lewis first and last appearance\"? Yes, no, or maybe? Maybe\n###\nPrivate First Class Jose F. Valdez (January 3, 1925 - February 17, 1945) was a United States Army soldier who posthumously received the Medal of Honor \u2014 the United States' highest military decoration \u2014 for his actions near Rosenkranz, France, in the Battle of the Colmar Pocket during World War II. Are we justified in saying that \"Jose Valdez was in the army. \"? Yes, no, or maybe? 
Yes\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists. Are we justified in saying that \"Contra Conspiracy is a 1999 action film\"? Yes, no, or maybe? No\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staunton Mall is in New Mexico\"? Yes, no, or maybe? No\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\". Are we justified in saying that \"Never Be Rude to an Arab is a song by Monty Python from two of their albums\"? Yes, no, or maybe?", "doc_id": 693, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22838, 4966, 19006, 19313], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wilson Dam is a dam spanning the Tennessee River between Lauderdale County and Colbert County in the U.S. state of Alabama. It impounds Wilson Lake. It is one of nine Tennessee Valley Authority (TVA) dams on the Tennessee River. The dam was declared a National Historic Landmark on November 13, 1966. Are we justified in saying that \"Wilson Dam is far from Atlanta.\"? Yes, no, or maybe? Maybe\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910. Are we justified in saying that \"Korea was annexed to the Empire of Japan.\"? Yes, no, or maybe? Yes\n###\nSpecies III is a 2004 science fiction thriller television film. The film, directed by Brad Turner, is the third installment of the \"Species\" series, and stars Robin Dunne, Robert Knepper, Sunny Mabrey, Amelia Cooke and John Paul Pitoc. Natasha Henstridge, who was contracted to a trilogy commencing with the first \"Species\" film, briefly reprises the role of Eve in the opening scene. Are we justified in saying that \"Species III is not a crime drama.\"? Yes, no, or maybe? Yes\n###\nRylstone was a former electoral district of the Legislative Assembly in the Australian state of New South Wales, created in 1894 from part of Mudgee and named after and including Rylstone. 
It was abolished in 1904, with the downsizing of the Legislative Assembly after Federation. Are we justified in saying that \"During its existence, Rylstone was the smallest electoral district in New South Wales\"? Yes, no, or maybe? Maybe\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944. Are we justified in saying that \"Spem in Alium was not one of Tippett's works\"? Yes, no, or maybe?", "doc_id": 21, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [430, 33303, 16644, 33567], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dennis Princewell Stehr (born 15 May 1984), better known by his stage name Mr Probz, is a Dutch singer, musician and actor. In 2013, he released the song \"Waves\", which was remixed in 2014 by Robin Schulz, becoming an international hit. He has released one album and featured in the film Bolletjes Blues. Are we justified in saying that \"Robin internationalized Stehr's song\"? Yes, no, or maybe? Yes\n###\nSanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\"). Are we justified in saying that \"Sanation had support from all polish citizens.\"? Yes, no, or maybe? Maybe\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Malone only played 1 year of professional football\"? Yes, no, or maybe? No\n###\nRobert Mehrabian (born July 31, 1941, in Tehran, Iran) is an Iranian-American materials scientist and the Chair, President, and Chief Executive Officer of Teledyne Technologies Incorporated. During the 1990s he served as the seventh President of Carnegie Mellon University in Pittsburgh, Pennsylvania, United States. Are we justified in saying that \"Robert Mehrabian was born in Amerrica\"? Yes, no, or maybe? No\n###\nJohn von Neumann's Universal Constructor is a self-replicating machine in a cellular automata (CA) environment. It was designed in the 1940s, without the use of a computer. The fundamental details of the machine were published in von Neumann's book \"Theory of Self-Reproducing Automata\", completed in 1966 by Arthur W. Burks after von Neumann's death. Are we justified in saying that \"John von Neumann's Universal Constructor was created in the early 20th century.\"? 
Yes, no, or maybe?", "doc_id": 126, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4410, 22152, 23930, 26851], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hawthorne is a census-designated place (CDP) in Mineral County, Nevada, United States. At the 2010 census, the population was 3,269, a decrease since the 2000 census, when it was 3,311. It is the county seat of Mineral County. The nearby Hawthorne Army Depot is the primary economic base of the town. Are we justified in saying that \"Hawthorne has a population of 3,926 as of 2019.\"? Yes, no, or maybe? Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"Russia was almost involved in the conflict. \"? Yes, no, or maybe? Maybe\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Ralph D. Malone later became a sports announcer. \"? Yes, no, or maybe? Maybe\n###\nDavid Halberstam (April 10, 1934 \u2013 April 23, 2007) was an American journalist and historian, known for his work on the Vietnam War, politics, history, the Civil Rights Movement, business, media, American culture, and later, sports journalism. He won a Pulitzer Prize for International Reporting in 1964. In 2007, while doing research for a book, Halberstam was killed in a car crash. Are we justified in saying that \"David Halberstam's books on the Civil Rights movement have sold in excess of 50,000 copies.\"? Yes, no, or maybe? Maybe\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event. Are we justified in saying that \"The 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts, that continued for more than a week.\"? 
Yes, no, or maybe?", "doc_id": 966, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13154, 4570, 37056, 21845], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sir Hugh Montgomery, 1st Viscount Montgomery of the Great Ards (c. 1560 \u2013 15 May 1636) was an aristocrat and a soldier, known as one of the \"founding fathers\" of the Ulster-Scots along with Sir James Hamilton, 1st Viscount Claneboye. Montgomery was born in Ayrshire at Broadstone Castle, near Beith. He was the son of Adam Montgomery, the 5th Laird of Braidstane, by his wife and cousin. Are we justified in saying that \"Sir Hugh Montgomery died in 1560\"? Yes, no, or maybe? No\n###\nSpring Fine Art Exhibition of Leningrad artists (Leningrad, 1954) (Russian: \"\"\u0412\u0435\u0441\u0435\u043d\u043d\u044f\u044f \u0432\u044b\u0441\u0442\u0430\u0432\u043a\u0430 \u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0435\u043d\u0438\u0439 \u043b\u0435\u043d\u0438\u043d\u0433\u0440\u0430\u0434\u0441\u043a\u0438\u0445 \u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a\u043e\u0432 1954 \u0433\u043e\u0434\u0430\"\" ) become one of the largest Soviet Art Exhibition of 1954 and one of the first Art Exhibitions after Stalin death. The Exhibition took place in Leningrad Union of Soviet Artists Exhibition Halls on Bolshaya Morskaya st. 38. Are we justified in saying that \"Spring Fine Art Exhibition of Leningrad artists was held in Leningrad in 1954. \"? Yes, no, or maybe? Yes\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"Sheree Victoria Murphy is older than 32\"? Yes, no, or maybe? Yes\n###\nThe Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site. Are we justified in saying that \"Cincinnati is 5 miles from Cleves.\"? Yes, no, or maybe? Maybe\n###\nVia Dante is an important and elegant pedestrian street in central Milan, Italy, connecting Piazzale Cordusio (Cordusio (Milan Metro)) and Largo Cairoli (Cairoli (Milan Metro)). It is very near to the city's Castello Sforzesco and is named after the Florentine poet Dante Alighieri. It is known for containing several theatres, shops, restaurants, caf\u00e9s, palaces and bars. Are we justified in saying that \"Via Dante starts with an A.\"? Yes, no, or maybe?", "doc_id": 957, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2554, 10322, 38920, 26732], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Game Plan was a pinball manufacturer that produced pinball tables from 1978 to 1985. Game Plan was a subsidiary of AES Technology Systems and was located in Elk Grove Village, Illinois. Game Plan's president was former Chicago Coin table designer Wendell McAdams. Are we justified in saying that \"First Game Plan's president was Donald Trump and then Wendell McAdams.\"? Yes, no, or maybe? Maybe\n###\nMarco Masini (born September 18, 1964 in Florence), is an Italian singer-songwriter and musician. . One of his greatest virtues is his voice due to his vocal range, which reaches difficult musical notes, according to experts . . Accompanied by guitarist Riccardo Cherubini, . Are we justified in saying that \"Marco Masini has a voice that can reach difficult musical notes. \"? Yes, no, or maybe? No\n###\nBusby is a census-designated place (CDP) in Big Horn County, Montana, United States. It is on the Northern Cheyenne reservation. The population was 745 at the 2010 census. The town is near the site of the Battle of the Rosebud and the associated Rosebud Battlefield State Park, where General George Custer forces encountered Sioux and Cheyenne forces led by Crazy Horse. Are we justified in saying that \"George Custer liked being a general.\"? Yes, no, or maybe? Maybe\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017. Are we justified in saying that \"The Overwatch World Cup 2017 took place in 2015\"? Yes, no, or maybe? No\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners. Are we justified in saying that \"Karan Johar's was in mumbai on february 15 1999\"? Yes, no, or maybe?", "doc_id": 457, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15655, 5291, 37484, 10690], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Edward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis. Are we justified in saying that \"Georgia Hart and B.J. Annis have at least one son\"? Yes, no, or maybe? 
Yes\n###\nThe Pari Aike Formation is a Late Cretaceous geologic formation of Cenomanian (formerly believed to be Maastrichtian) age in southern Patagonia, Argentina. The giant titanosaur \"Puertasaurus\", the megaraptoran \"Orkoraptor\", and the ornithopod \"Talenkauen\" have been recovered from the formation alongside turtles and crocodiles. Are we justified in saying that \"The Pari Aike Formation is located in South America.\"? Yes, no, or maybe? Yes\n###\nAdenanthos terminalis, commonly known as gland flower, yellow gland flower or adenanthos, is a one metre tall shrub in the family Proteaceae. It is found in south eastern regions of Australia, in the states of South Australia and Victoria, and is the most widespread of the two \"Adenanthos\" species occurring outside of Western Australia. Are we justified in saying that \"Adenanthos terminalis is not found in Australia.\"? Yes, no, or maybe? No\n###\n\"Birds of a Feather\" is a 1998 song by the American band Phish. It is the second track from their 1998 album \"The Story of the Ghost\" and was released as their twelfth promotional single by Elektra Records. The song is a funk rock song written by the entire band and lyricist Tom Marshall. Are we justified in saying that \"Many people didn't like that song. \"? Yes, no, or maybe? Maybe\n###\nThis is a list of Japanese idols; a type of celebrity in Japan. The word \"idol\" is almost always used to refer to a young woman, although there a significant number of male idols. The following list includes both female and male idols as well as both solo idols and idol groups. Are we justified in saying that \"Japanese celebrities almost always prefer young women.\"? Yes, no, or maybe?", "doc_id": 253, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29569, 10157, 6171, 4623], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals. Are we justified in saying that \"We're an American Band has a Z.\"? Yes, no, or maybe? No\n###\nDelano Andre Howell (born November 17, 1989) is a former strong safety. He was originally signed by the Buffalo Bills as an undrafted free agent in 2012. Howell played his four years of college football at Stanford University first as a running back, before switching to safety. He is the younger brother of Dan Howell, who played football for the University of Washington. Are we justified in saying that \"Delano Andre Howell is an only child. \"? Yes, no, or maybe? No\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners. 
Are we justified in saying that \"Karan Johar's reached the zenith of his career in 1999\"? Yes, no, or maybe? Maybe\n###\n\"The Encounter\" is episode 151 of the American television series \"The Twilight Zone\". First broadcast on May 1, 1964, its racial overtones caused it to be withheld from syndication in the U.S. On January 3, 2016, the episode was finally reaired as part of Syfy's annual \"Twilight Zone\" New's Year Eve marathon. Are we justified in saying that \"\"The Encounter\" was the 151 episode of the American television series \"Black Mirror\"? Yes, no, or maybe? No\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"True as a Turtle had Jim Smith in it. \"? Yes, no, or maybe?", "doc_id": 54, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21936, 15417, 5393, 43529], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hell's Kitchen Australia is an Australian cooking reality competition television series which premiered on the Seven Network on 6 August 2017. The series is hosted by British chef Marco Pierre White, who previously hosted two seasons of the British version of the format and appeared in rival program \"MasterChef Australia\". Are we justified in saying that \"Marco White doesn't cook food.\"? Yes, no, or maybe? No\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley. Are we justified in saying that \"Riley and Redman were good friends since they worked together.\"? Yes, no, or maybe? Maybe\n###\nMaris Soule (born June 19, 1939) is an American author of romance and romantic suspense novels, mysteries, and short stories. Her latest book, \"Eat Crow and Die\", is a mystery novel. Her books feature a variety of settings and situations, including the Iditarod Trail Sled Dog Race, Search and Rescue dogs, barrel racing, dressage, and a Rhodesian Ridgeback puppy. Are we justified in saying that \"Maris Soule writes commercial scripts.\"? Yes, no, or maybe? No\n###\nLouis S. Peterson (June 17, 1922 \u2013 April 27, 1998) was a playwright, actor, screenwriter, and professor. He was an American playwright and the first African-American playwright to have a dramatic play produced on Broadway. He was also one of the first African-American writers to be nominated for an Emmy Award. Are we justified in saying that \"Louis S. Peterson was born in the US\"? Yes, no, or maybe? Yes\n###\nLouis Armstrong (1901\u20131971), nicknamed Satchmo or Pops, was an American trumpeter, composer, singer and occasional actor who was one of the most influential figures in jazz. 
His career spanned five decades, from the 1920s to the 1960s, and different eras in jazz. Are we justified in saying that \"Louis Armstrong was born more than 9999 days ago.\"? Yes, no, or maybe?", "doc_id": 549, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16020, 43622, 17789, 23741], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern. Are we justified in saying that \"McKinnon scored the most points in the league in 2014.\"? Yes, no, or maybe? Maybe\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry. Are we justified in saying that \"Chuck Berry's name is synonymous with \"throw fruit.\"\"? Yes, no, or maybe? Yes\n###\nA conjectural portrait is a portrait made of a historical figure for whom no authentic contemporary portrait is available. The depiction, then, may be variously informed by written accounts of physical appearance, conjecture based on the subject's culture and background, and/or the artist's conception of the subject's inner essence. Are we justified in saying that \"A conjectural portrait is hard to identify.\"? Yes, no, or maybe? Maybe\n###\nBlack Wind, White Land is a 1993 documentary film, researched and produced by the founders of the Chernobyl Children's Project International and explores the Chernobyl nuclear disaster of 1986 and its consequences for the handicapped development of the people in Belarus, Russia and Ukraine. The film was directed by Gene Kerrigan and produced by Ali Hewson, the wife of U2's singer Bono. Are we justified in saying that \"Gene Kerrigan has directed at least one documentary film.\"? Yes, no, or maybe? Yes\n###\nFast-moving consumer goods (FMCG) or consumer packaged goods (CPG) are products that are sold quickly and at relatively low cost. Examples include non-durable goods such as packaged foods, beverages, toiletries, over-the-counter drugs and many other consumables. In contrast, durable goods or major appliances such as kitchen appliances are generally replaced over a period of several years. Are we justified in saying that \"Durable items get changed every so often\"? Yes, no, or maybe?", "doc_id": 426, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6900, 30668, 289, 38478], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Revisited is a 1960 album by Tom Lehrer, consisting of live recordings of all the songs from 1953's \"Songs by Tom Lehrer\". The CD reissue of the album contains two additional tracks that Lehrer wrote and performed for the PBS television show \"The Electric Company\" (and produced and conducted by Joe Raposo). Are we justified in saying that \" Revisited is a 1960 album contains songs sung by Tom Lehre\"? Yes, no, or maybe? Yes\n###\nPearse Island is an island in western British Columbia, Canada, in the Portland Inlet, an inlet of the Pacific Ocean. The island was first charted in 1793 by George Vancouver during his 1791-95 expedition. It was named by George Henry Richards, captain of \"HMS Plumper\", circa 1860, in honour of William Alfred Rombulow Pearse of the Royal Navy, who had been commander of \"HMS Alert\". Are we justified in saying that \"The island is prone to bad weather\"? Yes, no, or maybe? Maybe\n###\nPietro Ferrero (2 September 1898 \u2013 2 March 1949) was the founder of Ferrero SpA, an Italian confectionery and chocolatier company. His company invented Nutella, a hazelnut-cream spread, which is now sold in over 160 countries. The famous Ferrero Rochers are also made by his company, Ferrero, as were Tic-Tacs and various Kinder chocolates. Are we justified in saying that \"company survives thanks to nutella\"? Yes, no, or maybe? Maybe\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street. Are we justified in saying that \"A village is better than a civil parish.\"? Yes, no, or maybe? Maybe\n###\nMark Baltz is a former official in the National Football League (NFL) from 1989 through 2013. He has worked as a head linesman throughout his entire career in the NFL and has been assigned to 21 post-season games, including five conference championship games (1998, 1999, 2000, 2001, 2004). He wore uniform number 26. Are we justified in saying that \"number 26 represents Mark Baltz \"? Yes, no, or maybe?", "doc_id": 352, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25353, 18075, 14696, 11948], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Demoniac were a heavy metal band from New Zealand formed in Wellington in 1993 by singer and bass player Lindsay Dawson, guitarist Sam Totman and Drummer Steve Francis. They later moved to London, UK. Three of the members went on to form DragonForce. Their rather unusual musical style has often been labeled as \"blackened power metal\". 
Are we justified in saying that \"Demoniac were formed over 5 years ago\"? Yes, no, or maybe? Yes\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state. Are we justified in saying that \"Sidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, US. The plant is on the Endangered species list and is the rarest known plant in the state.\"? Yes, no, or maybe? Yes\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium. Are we justified in saying that \"Moulton is 68 years old.\"? Yes, no, or maybe? Yes\n###\nDeanne Olivia Bell is an American television personality currently hosting CNBC's reality docu-series \"Make Me a Millionaire Inventor.\" She has previously worked on PBS's \"Design Squad\", Discovery Channel's \"Smash Lab\", and National Geographic's \"The Egyptian Job\". She has also co-hosted DIY Network's \"Money Hunters\" and ESPN's \"Rise Up.\" Are we justified in saying that \"Deanne Olivia Bell ends with l.\"? Yes, no, or maybe? Yes\n###\n\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough. Are we justified in saying that \"At least three dozen episodes of South Park aired in the United States before \"Merry Christmas, Charlie Manson!\" originally aired.\"? Yes, no, or maybe?", "doc_id": 370, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14250, 31602, 24994, 7773], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Henry II (18 April 1503 \u2013 25 May 1555), nicknamed \"Sang\u00fcesino\" because he was born at Sang\u00fcesa, was the King of Navarre from 1517, although his kingdom had been reduced to a small territory north of the Pyrenees by the Spanish conquest of 1512. Henry succeeded his mother, Queen Catherine, upon her death. His father was her husband and co-ruler, King John III, who died in 1516. Are we justified in saying that \"Henry II breathed air.\"? Yes, no, or maybe? Yes\n###\nJefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. 
It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time. Are we justified in saying that \"The population of Fayette will increase.\"? Yes, no, or maybe? Maybe\n###\nThe 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament. Are we justified in saying that \"It was a warm day when Belmont won its first tournament championship in 2011.\"? Yes, no, or maybe? Maybe\n###\nThree Preludes is a ballet made for Mikhail Baryshnikov by Mark Morris to eponymous music by George Gershwin for his own company and presented as a piece d'occasion by the New York City Ballet. The performance took place June 16, 1992, at the New York State Theater, Lincoln Center. Are we justified in saying that \"The performance was held on the sixteenth day of the month of June\"? Yes, no, or maybe? Yes\n###\nThe Coy C. Carpenter Library and Dorothy Carpenter Medical Archives, located at Wake Forest School of Medicine, is a library named after the first dean of the university's medical school, Coy Cornelius Carpenter, M.D., and his wife, Dorothy (Mitten) Carpenter. Are we justified in saying that \"The Coy C. Carpenter Library and Dorothy Carpenter Medical Archives is named after two men.\"? Yes, no, or maybe?", "doc_id": 214, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4971, 38674, 6137, 1474], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Howe (October 14, 1754 \u2013 December 27, 1835) was a loyalist printer during the American Revolution, a printer and Postmaster in Halifax, the father of the famous Joseph Howe, a spy prior to the War of 1812, and eventually a Magistrate of the Colony of Nova Scotia. He was born in Boston, Massachusetts Bay colony, the son of Joseph Howe, a tin plate worker of Puritan ancestry, and Rebeccah Hart. Are we justified in saying that \"John Howe had no daughters\"? Yes, no, or maybe? Maybe\n###\nChristian Darcy Bisson (born August 14, 1989) is a Canadian professional baseball second baseman in minor league baseball organization of the San Diego Padres of Major League Baseball. Prior to beginning his professional career, he played college baseball at the University of Kentucky. Bisson has also competed for the Canadian national baseball team. Are we justified in saying that \"His family always knew he'd be big in baseball. \"? Yes, no, or maybe? Maybe\n###\nThe Argentine Grand Prix (Spanish: \"Gran Premio de Argentina\") was a round of the Formula One championship, held intermittently from to , all at the same autodrome in the Argentine national capital of Buenos Aires. 
Argentine president Juan Per\u00f3n was the driving force behind the creation of the circuit, after seeing the success of the country's own Juan Manuel Fangio. Are we justified in saying that \"The Argentine Grand Prix made up the whole Formula One Championship.\"? Yes, no, or maybe? No\n###\nThe Secret Garden is the 1987 Hallmark Hall of Fame TV film adaptation of the novel \"The Secret Garden\", aired on CBS November 30, 1987 and produced by Rosemont Productions Limited, who also produced \"Back to the Secret Garden\". The film stars Barret Oliver, Jadrien Steele, Billie Whitelaw and Sir Derek Jacobi. Are we justified in saying that \"the film stars michael jordan\"? Yes, no, or maybe? No\n###\n\"Makes No Difference\" is the first single by Canadian rock band Sum 41. It was released in June 2000 as the lead single from the band's extended play \"Half Hour of Power\". The song is featured on the soundtracks for \"Bring it On\", \"Out Cold\" and \"Van Wilder\". A new version of the song was featured on Sum 41's greatest hits compilation, \"All the Good Shit\". Are we justified in saying that \" Sum 41 is a band that comes from a country which has parts that speak both English and French.\"? Yes, no, or maybe?", "doc_id": 37, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44725, 1486, 36218, 2079], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rudyard Kipling's The Jungle Book is a 1994 live-action American adventure film co-written and directed by Stephen Sommers, produced by Edward S. Feldman and Raju Patel, from a story by Ronald Yanover and Mark Geldman. It is the second film adaptation by The Walt Disney Company of the Mowgli stories from \"The Jungle Book\" and \"The Second Jungle Book\" by Rudyard Kipling. Are we justified in saying that \"The jungle book was written by Stephen Sommers.\"? Yes, no, or maybe? No\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg. Are we justified in saying that \"Marie Hedwig Auguste of Sulzbach was only ever a Countess.\"? Yes, no, or maybe? No\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games. Are we justified in saying that \"The Seattle SuperSonics didn't succeed against the Phoenix Suns in six games.\"? Yes, no, or maybe? Yes\n###\nThe Circuit Gilles Villeneuve (also spelled Circuit Gilles-Villeneuve in French) is a motor racing circuit in Montreal, Quebec, Canada. It is the venue for the FIA Formula One Canadian Grand Prix. 
It has previously hosted the FIA World Sportscar Championship, the Champ Car World Series, the NASCAR Canadian Tire Series, the NASCAR Xfinity Series and the Grand-Am Rolex Sports Car Series. Are we justified in saying that \"Circuit Gilles Villeneuve is the only racing circuit in Montreal.\"? Yes, no, or maybe? Maybe\n###\nJos\u00e9 C. Vales (born 1965, Zamora) is a Spanish writer and translator of English literature. He studied in Salamanca and Madrid. He has translated numerous English and American authors into Spanish, including Dickens, Trollope, Austen, Wilkie Collins, Defoe, Mary Shelley, Arnold Bennett, Eudora Welty, Stella Gibbons, E.F. Benson, and Edmund Crispin. Are we justified in saying that \"Vales never met Dickens.\"? Yes, no, or maybe?", "doc_id": 655, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38027, 25686, 4221, 21088], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Matthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season. Are we justified in saying that \"Mansfield played multiple positions.\"? Yes, no, or maybe? Maybe\n###\nThe United States Ambassador to Sweden (Swedish: \"USA:s ambassad\u00f6r i Sverige\" ) serves as the official diplomatic representative of the President and the Government of the United States of America to the King and the Government of the Kingdom of Sweden. Diplomatic relations between Sweden and the United States began with the signing of the Treaty of Amity and Commerce in 1783. Are we justified in saying that \"The United States Ambassador to Sweden is the diplomatic representative to the Prince of Sweden.\"? Yes, no, or maybe? No\n###\nPanadol night is a GlaxoSmithKline painkiller intended for use at night. It consists of 500 milligrams of paracetamol, 25 milligrams of diphenhydramine hydrochloride (a sedating antihistamine) and other \"non-hazardous ingredients\" It is sold in Australia, Cyprus United Kingdom, Ireland, New Zealand and the Middle East. It became available as an over the counter medication in the UK in 1996. Are we justified in saying that \"Most painkillers are intended to be used in the evening.\"? Yes, no, or maybe? Maybe\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"Vaux said it was the best house he ever designed\"? Yes, no, or maybe? Maybe\n###\nMichelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. 
At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California. Are we justified in saying that \"Michelle Do was born in California.\"? Yes, no, or maybe?", "doc_id": 646, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42876, 40068, 44961, 13908], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bremen ( ) is a small town in Lincoln County, Maine, United States. The population was 806 at the 2010 census. Located on Muscongus Bay and the Gulf of Maine, it includes the villages of Broad Cove, Turners Corner, Bremen, Medomak and Muscongus. Hog Island is a center and camp for the Maine chapter of the National Audubon Society. Are we justified in saying that \"Bremen ( ) is a small town in Lincoln County, Maine, United States. It has many villages in it.\"? Yes, no, or maybe? Yes\n###\nThe Agassiz Brewing Company was a Canadian brewing company, founded by former Fort Garry Brewing Company brewmaster Gary De Pape. The company was established in 1998 in Winnipeg, Manitoba and based there until 2010. It was named for the prehistoric glacial Lake Agassiz which once covered much of Manitoba. Agassiz beer was available in Manitoba, Saskatchewan, Ontario, and British Columbia. Are we justified in saying that \"the company was established in 2010\"? Yes, no, or maybe? No\n###\n3 Musketeers is a direct-to-video action film by The Asylum loosely based on \"The Three Musketeers\" by Alexandre Dumas. The film is directed by Cole McKay and is a mockbuster that was released shortly after the Paul W. S. Anderson film \"The Three Musketeers\". The film was released on DVD and Blu-ray disc on October 25, 2011. Are we justified in saying that \"3 Musketeers was very popular in 2010\"? Yes, no, or maybe? No\n###\nBenny Bell (born Benjamin Samberg or Benjamin Zamberg, March 21, 1906 \u2013 July 6, 1999) was an American singer-songwriter who reached popularity in the 1940s, with a comeback in the 1970s. He is particularly remembered for his risqu\u00e9 but cheerfully optimistic songs. Are we justified in saying that \"Benny Bell's father was born on March 21, 1885.\"? Yes, no, or maybe? Maybe\n###\nHow Not to Die: Surprising Lessons on Living forever, Safer, and Healthier from America\u2019s Favorite Medical Examiner is a book about safe and healthy living written by Jan Garavaglia, aka \"Dr. G\", who is Chief Medical Examiner for the District Nine (Orange-Osceola) Medical Examiner's Office in Florida. Are we justified in saying that \"How Not to Die: Surprising Lessons on Living forever, Safer, and Healthier from America\u2019s Favorite Medical Examiner was written by Dr J\"? Yes, no, or maybe?", "doc_id": 522, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37537, 15299, 20430, 26120], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season. Are we justified in saying that \"The ABA League Finals MVP award can be shiny\"? Yes, no, or maybe? Maybe\n###\nThe large intestine (Chinese: \u5927\u80a0/\u5927\u8178: pinyin: \"d\u00e0 ch\u00e1ng\") is one of the \"fu\" organs stipulated by traditional Chinese medicine (TCM). As distinct from the Western medical concept of large intestine, this concept from TCM is more a way of describing a set of interrelated parts than an anatomical organ. It is a functionally defined entity and not equivalent to the anatomical organ of the same name. Are we justified in saying that \"There are 3 \"fu\" organs stipulated by TCM.\"? Yes, no, or maybe? Maybe\n###\nThe 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau. Are we justified in saying that \"The 3rd Macau International Movie Festival is everyone's favorite version so far\"? Yes, no, or maybe? Maybe\n###\nSuper Show 6 - Super Junior World Tour Concert Album is Super Junior's sixth live recorded album, released on 6 November 2015. This album contains two CDs with 36 live recordings from the Super Show 6 concerts held on September 19\u201321, 2014 at the Olympic Gymnastics Arena located in Seoul, South Korea. Are we justified in saying that \"Super Show 6 - Super Junior World Tour Concert Album is an exciting album\"? Yes, no, or maybe? Maybe\n###\nHipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011. Are we justified in saying that \"Reddit founded hipmunk in 2010\"? Yes, no, or maybe?", "doc_id": 97, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41919, 34080, 35437, 32881], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Airline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. 
She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya. Are we justified in saying that \"Kenya Airways is headquartered in Kenya\"? Yes, no, or maybe? Maybe\n###\nThe United States Ambassador to Sweden (Swedish: \"USA:s ambassad\u00f6r i Sverige\" ) serves as the official diplomatic representative of the President and the Government of the United States of America to the King and the Government of the Kingdom of Sweden. Diplomatic relations between Sweden and the United States began with the signing of the Treaty of Amity and Commerce in 1783. Are we justified in saying that \"Prior to 1783, there were no official diplomatic relations between the United States and Sweden.\"? Yes, no, or maybe? Yes\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"Lloyd Cole was released by a band.\"? Yes, no, or maybe? No\n###\nRAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family. Are we justified in saying that \"The station was renamed in 1928\"? Yes, no, or maybe? Yes\n###\nCity Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle. Are we justified in saying that \"City Mall has recently closed.\"? Yes, no, or maybe?", "doc_id": 802, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31935, 11654, 13916, 32904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "For Those Who Think Young is a 1964 beach party film shot in Techniscope, directed by Leslie H. Martinson and featuring James Darren, Pamela Tiffin, Paul Lynde, Tina Louise, Bob Denver, Nancy Sinatra, Robert Middleton, Ellen Burstyn (billed as Ellen McRae), Claudia Martin and Woody Woodbury. Are we justified in saying that \"For Those Who Think Young was a film directed by James Darren.\"? Yes, no, or maybe? 
No\n###\nThe Mercantil Tower (also known as the Mercantil Building) is a skyscraper located in the Venezuelan city of Caracas, is known for being the fourth tallest tower in the city and the country with 179 m in height and 40 floors, is located at Avenida Andres Bello, Candelaria Parish of Libertador municipality northwest of the capital. Are we justified in saying that \"The Mercantil Tower is 180m in height.\"? Yes, no, or maybe? No\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o breathes air.\"? Yes, no, or maybe? Yes\n###\nCarol Goodman, also known under the pseudonym Juliet Dark, is an American professor and author of gothic fiction. She has also written under the pseudonym Lee Carroll with her husband Lee Slominsky. Goodman currently serves as a creative writing professor at the State University of New York at New Paltz. Are we justified in saying that \"Carol Goodman has written under 2 pseudonyms.\"? Yes, no, or maybe? Yes\n###\nMorgan\u2019s Wonderland is a purpose-built 25-acre theme park in San Antonio, Texas for individuals with special needs. The park, which opened in spring 2010 on the site of the former Longhorn Quarry, was developed by Gordon Hartman, a former homebuilder from San Antonio. He said his daughter, Morgan, who deals with cognitive and physical challenges, inspired creation of the park. Are we justified in saying that \"Morgan's Wonderland was closed down recently.\"? Yes, no, or maybe?", "doc_id": 22, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26031, 13018, 17592, 42964], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\"). Are we justified in saying that \"Sanacja was created at some point after the final guns blasted for the end of the Great War.\"? Yes, no, or maybe? Yes\n###\nThe Sound and the Fury is an American drama film directed by James Franco. It is the second film version of the novel of the same name by William Faulkner. The previous adaptation, directed by Martin Ritt, was released in 1959. The film was released in a limited release and through video on demand on October 23, 2015, by New Films International. Are we justified in saying that \"The Sound and the Fury was James Franco's favorite film\"? Yes, no, or maybe? Maybe\n###\nThe Ravenswood City School District is a public school district headquartered in East Palo Alto, California, US. 
The district, in the San Francisco Bay Area, serves the communities of East Palo Alto and eastern Menlo Park. Students from this school district who continue on with public schooling matriculate to the Sequoia Union High School District. In 2008-09 it served over 4,500 students. Are we justified in saying that \"The Ravenswood City School District serves the communities of Orange Park and Hollywood\"? Yes, no, or maybe? No\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films. Are we justified in saying that \"She is a well known porn actress. \"? Yes, no, or maybe? Yes\n###\nJulian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality. Are we justified in saying that \"Julian was born before Lucy Pounder.\"? Yes, no, or maybe?", "doc_id": 837, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38180, 38355, 78, 42292], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. Are we justified in saying that \"Amy Timberlake feels sleepy when she gets up \"? Yes, no, or maybe? Maybe\n###\nAlexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia. Are we justified in saying that \"The Prosecutor General's Office is located within Moscow's city limits.\"? Yes, no, or maybe? Maybe\n###\n\"Trap Queen\" is the debut single by American rapper Fetty Wap from his self-titled debut album (2015). Following its online premiere in March 2014, it was released independently on April 22, 2014 before being re-released in conjunction with 300 Entertainment on December 15, 2014. The song was well received by critics who praised the vocals and production for being cheery and infectious. 
Are we justified in saying that \"The song was enjoyed by critics.\"? Yes, no, or maybe? Yes\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi. Are we justified in saying that \"NASA John H. Glenn Research Center at Lewis Field is a NASA center located between the cities of Brook Park and Cleveland.\"? Yes, no, or maybe? No\n###\nMajid (born 1975) is a Danish rapper of Moroccan-Berber origin. Residing in Aved\u00f8re near Copenhagen, Denmark he was a contributor to Danish act Outlandish, which also hails from Br\u00f8ndby Strand. Majid contributed to their tours and performed as a special guest in the warm-up for their acts. Are we justified in saying that \"Majid was not born in Denmark.\"? Yes, no, or maybe?", "doc_id": 277, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6780, 31254, 19827, 27905], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners. Are we justified in saying that \"Satya and Dil Se won at the 44th filmfare awards.\"? Yes, no, or maybe? Yes\n###\nTOTO is a legalized form of lottery sold in Singapore, known by different names elsewhere. Singapore Pools is the only legal lottery operator in Singapore. It was established on 23 May 1968 to control widespread illegal gambling in Singapore during the 1960s. Are we justified in saying that \"Singapore had a illegal gambling problem in the 1960s\"? Yes, no, or maybe? Yes\n###\nRoderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins. Are we justified in saying that \"Roderick Dwayne \"Rod\" Higgins was born in 1966\"? Yes, no, or maybe? No\n###\nGuns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin). Are we justified in saying that \"Guns of Diablo is a black and white film\"? Yes, no, or maybe? No\n###\nDatong () is a prefecture-level city in northern Shanxi province, People's Republic of China, located in a basin at an elevation of 1040 m and bordering Inner Mongolia to the north and west and Hebei to the east. 
It had a population of 3,318,057 at the 2010 census of whom 1,629,035 lived in the built up area made of 3 out of 4 urban districts, namely Chengqu, Kuangqu and Nanjiao District. Are we justified in saying that \"More than 40 percent of the residents of Datong live in Urban districts\"? Yes, no, or maybe?", "doc_id": 262, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32463, 26455, 7456, 42499], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Douglas Eric \"Doug\" Liman (born July 24, 1965) is an American film director and producer best known for \"Swingers\" (1996), \"Go\" (1999), \"The Bourne Identity\" (2002), \"Mr. & Mrs. Smith\" (2005), \"Jumper\" (2008), \"Fair Game\" (2010), and \"Edge of Tomorrow\" (2014). Are we justified in saying that \"Douglas Eric Liman has wanted to be a director and producer since childhood.\"? Yes, no, or maybe? Maybe\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"Ahmad Kemal Idris is from France\"? Yes, no, or maybe? No\n###\nClub Deportivo Utiel is a football team based in Utiel in the autonomous community of Valencian Community. Founded in 1945, the team plays in Tercera Divisi\u00f3n \u2013 Group 6. The club's home ground is \"La Celadilla\", which has a capacity of 1,500 spectators. Are we justified in saying that \"Club Deportivo Utiel sells out every game\"? Yes, no, or maybe? Maybe\n###\nHungry for You is a 1996 American thriller and science fiction film directed by Dimitri Logothetis and produced by Gary Hudson. This film has been music composed by Joel Hirschhorn.The film starring Michael Phenicie, Rochelle Swanson, Gary Wood, Nancy Hochman and Ritchie Montgomery in the lead roles. Are we justified in saying that \"Movies are sometimes directed by and produced by different people.\"? Yes, no, or maybe? Yes\n###\nJohn Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s. Are we justified in saying that \"There are currently 1890 Catholic cardinals.\"? Yes, no, or maybe?", "doc_id": 793, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23711, 9614, 12729, 40200], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "David Thomas Bush (born November 9, 1979) is an American former professional baseball pitcher. He played in Major League Baseball (MLB) for the Toronto Blue Jays, Milwaukee Brewers, Texas Rangers, and Philadelphia Phillies. Bush also played for the SK Wyverns of the KBO League. Are we justified in saying that \"David Thomas Bush is still alive.\"? Yes, no, or maybe? Maybe\n###\nKali Michele Rocha (born December 5, 1971) is an American actress. She is known for portraying Karen Rooney, the mother of four Rooney children and school's vice principal, in the Disney Channel sitcom, \"Liv and Maddie\". She has also co-written four episodes of the show. Are we justified in saying that \"Kali Michele Rocha is an adult film star.\"? Yes, no, or maybe? No\n###\nBallads of Sacco & Vanzetti is a set of ballad songs, written and performed by Woody Guthrie, related to the trial, conviction and execution of Sacco and Vanzetti. The series was commissioned by Moe Asch in 1945 and recorded in 1946 and 1947. Guthrie never completed the project and was unsatisfied by the result. The project was released later in its abandoned form by Asch. Are we justified in saying that \"Ballads of Sacco & Vanzetti is a set of rap songs\"? Yes, no, or maybe? No\n###\n\"Sun Goes Down\" is a song by German DJ and record producer Robin Schulz. It features the vocals from British singer Jasmine Thompson. The song was released in Germany as a digital download on 24 October 2014. The song peaked at number two on the German Singles Chart. Are we justified in saying that \" Robin Schulz was born in Germany.\"? Yes, no, or maybe? Maybe\n###\nJames Hagan (21 January 1918 \u2013 26 February 1998), known as Jimmy Hagan, was an English football player and manager born in Washington, County Durham, England. He played between 1938 and 1958 for Sheffield United and once for England. As manager he had his greatest successes with S.L. Benfica in the early 1970s. Are we justified in saying that \"Jimmy Hagan played for Sheffield United for 30 years.\"? Yes, no, or maybe?", "doc_id": 162, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11781, 38227, 29666, 15672], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cecilia Makiwane Hospital (CMH) is a large, provincial, government funded hospital situated in the Mdantsane township of East London, Eastern Cape in South Africa. It is a tertiary teaching hospital and forms part of the East London Hospital Complex with Frere Hospital. It is named after Cecilia Makiwane, the first African woman to become a professional nurse in South Africa. Are we justified in saying that \"There were no African women who became professional nurses before Cecilia.\"? Yes, no, or maybe? 
Yes\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits. Are we justified in saying that \"Deep Purple released 4 songs in the year nineteen hundred eighty six.\"? Yes, no, or maybe? Maybe\n###\nAnalyze This is a 1999 gangster comedy film directed by Harold Ramis, who co-wrote the screenplay with playwright Kenneth Lonergan and Peter Tolan. The film stars Robert De Niro as a mafioso and Billy Crystal as his psychiatrist. A sequel, \"Analyze That\", was released in 2002. Are we justified in saying that \"Analyze This was seen by Homer.\"? Yes, no, or maybe? Maybe\n###\nSabrina Le Beauf (born March 21, 1958) is an American actress best known for her portrayal of Sondra Huxtable on the NBC situation comedy \"The Cosby Show\". She has voiced the character Norma Bindlebeep on the Nick at Nite animated series \"Fatherhood\", a show based on Bill Cosby's book of the same name. Are we justified in saying that \"Norma Bindlebeep had one episode where she pretended to be Sondra Huxtable.\"? Yes, no, or maybe? Maybe\n###\nJohn Howe (October 14, 1754 \u2013 December 27, 1835) was a loyalist printer during the American Revolution, a printer and Postmaster in Halifax, the father of the famous Joseph Howe, a spy prior to the War of 1812, and eventually a Magistrate of the Colony of Nova Scotia. He was born in Boston, Massachusetts Bay colony, the son of Joseph Howe, a tin plate worker of Puritan ancestry, and Rebeccah Hart. Are we justified in saying that \"Joseph Howe was born to a farming family in Boston.\"? Yes, no, or maybe?", "doc_id": 764, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26973, 29029, 24144, 7750], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Beastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar). Are we justified in saying that \"Beastie Boys were a great American hip hop group who got paid a lot\"? Yes, no, or maybe? Maybe\n###\nThe R-33 (Russian: \u0412\u044b\u043c\u043f\u0435\u043b \u0420-33 , NATO reporting name: AA-9 Amos) is a long-range air-to-air missile developed by the Vympel. It is the primary armament of the MiG-31 interceptor, intended to attack large high-speed targets such as the SR-71 Blackbird, the B-1 Lancer bomber, and the B-52 Stratofortress. Are we justified in saying that \"\u0412\u044b\u043c\u043f\u0435\u043b and Vympel are the same word.\"? Yes, no, or maybe? Yes\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy. 
Are we justified in saying that \"The team hopes to make it into the playoffs the following season.\"? Yes, no, or maybe? Maybe\n###\nJuan Rivera is an American singer and actor of Mexican heritage. He is part of one of the most prominent families, leading in regional Mexican music in the United States. His family includes singers, Jenni Rivera, Lupillo Rivera, Chiquis Rivera, and businesswoman Rosie Rivera.. His songs \"El Ser Equivocado\" and \" La Lampara\" ranked on the Billboard Latin charts. Are we justified in saying that \"Juan Rivera was in a band with his family\"? Yes, no, or maybe? Maybe\n###\nTodd Strauss-Schulson (born June 24, 1980) is an American film director, screenwriter, producer, editor, and cinematographer, best known for directing the 2011 comedy film \"A Very Harold & Kumar 3D Christmas\", and the 2015 horror comedy film \"The Final Girls\". He has also directed episodes of the television series \"The Inbetweeners\" (2012) and \"Zach Stone Is Gonna Be Famous\" (2013). Are we justified in saying that \"Todd Strauss-Schulson is retired\"? Yes, no, or maybe?", "doc_id": 885, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12272, 13516, 13322, 32089], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Joel Rueben Madden (born Joel Rueben Combs; March 11, 1979) is the lead vocalist for the American pop punk band Good Charlotte, as well as a record producer, actor, DJ, and UNICEF Goodwill Ambassador. He is also part of the pop rock collaboration The Madden Brothers with his twin brother Benji Madden. Are we justified in saying that \"Benji Madden's parents have one son\"? Yes, no, or maybe? No\n###\nThe Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title. Are we justified in saying that \" Paul McNamee and Paul Kronk were good friends\"? Yes, no, or maybe? Maybe\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX. Are we justified in saying that \"High Noon Toons is an animated series\"? Yes, no, or maybe? Yes\n###\nElmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County. Are we justified in saying that \"The county the city is in starts with a C\"? Yes, no, or maybe? Yes\n###\nThe Middlewich Folk and Boat Festival takes place in June in Middlewich, Cheshire, England. 
The festival builds on the town's industrial heritage in which canal boats were used to move coal and other raw materials in the town for the production of salt, and then move the salt out of town, either for use directly, or as a raw material in the manufacture of chemicals such as chlorine and soda ash. Are we justified in saying that \"The Middlewich Folk and Boat Festival happens in the sixth month of the year.\"? Yes, no, or maybe?", "doc_id": 908, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1215, 5270, 7949, 44063], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harold Buttleman, Daredevil Stuntman (also known as Buttleman) is a 2003 film written and directed by Francis Stokes; the only movie he has directed. It won the Jury Prize at the Deep Ellum Film Festival in 2003. It was awarded the audience award in the Had to Be Made Film Festival in 2005. Are we justified in saying that \"Daredevil Stuntman was awarded the audience award in the Had to Be Made Film Festival in 2005.\"? Yes, no, or maybe? Yes\n###\nEugene Gearty is an American sound engineer. He was nominated for an Academy Award in the category Best Sound for the film \"Gangs of New York\". He has worked on over 80 films since 1983. At the 84th Academy Awards, Gearty won an Oscar for Best Sound Editing for his work on Martin Scorsese's \"Hugo\". He also won Emmy Award for Boardwalk Empire. Are we justified in saying that \"Eugene Gearty work with scorsesse\"? Yes, no, or maybe? Yes\n###\nJack Tate is a Republican legislator in the U.S. State of Colorado. He represents Senate District 27 in the Denver Metro Area, which encompasses parts of unincorporated Arapahoe County, the City of Centennial, and the town of Foxfield. He serves on the Senate Local Government, the Senate Business, Labor & Technology, and Joint Technology committees. Are we justified in saying that \"Jack Tate supports conservative viewpoints.\"? Yes, no, or maybe? Maybe\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Tinker Field is being rebuilt\"? Yes, no, or maybe? Maybe\n###\nBoleslav William Felix Robert Sienkiewicz ( ; born May 3, 1958), better known as Bill Sienkiewicz, is an Eisner Award-winning American artist who produces comic book art, primarily for Marvel Comics' \"The New Mutants\" and \"\". Sienkiewicz often utilizes oil painting, collage, mimeograph, and other forms generally uncommon in comic books. Are we justified in saying that \"Boleslav William Felix Robert Sienkiewicz was born the month before June\"? 
Yes, no, or maybe?", "doc_id": 641, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6261, 1938, 8123, 25808], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Serial Killers Ink is a website dedicated to selling \"murderabilia\" (collectibles related to murders, murderers or other violent crimes) and serial killer art, interviewing convicted serial killers and also serves as a meeting place for those interested or involved in the murderabilia industry. Are we justified in saying that \"Murderabilia is not different from collectibles related to murders, murderers or other violent crimes.\"? Yes, no, or maybe? Yes\n###\nHakea gibbosa, commonly known as hairy hakea or rock hakea, is a shrub of the family Proteaceae native to southeastern Australia. It has become an environmental weed in South Africa and New Zealand, where it had been introduced for use as a hedge plant. Are we justified in saying that \"The threat that Hakea gibbosa poses to South Africa and New Zealand will keep increasing.\"? Yes, no, or maybe? Maybe\n###\nThe Strangers is an American country band best known as the back-up band for singer-songwriter Merle Haggard. Formed in 1965 in Bakersfield, California, United States, the band continued to tour with original co-founding member Norman Hamlet, as well as Haggard's children Dana and Ben. Are we justified in saying that \"The Strangers band toured for a while after 1965\"? Yes, no, or maybe? Yes\n###\nFrank Vincent Ferrante (born April 26, 1963) is an American stage actor, comedian and director known for his stage portrayals of legendary American comedian Groucho Marx in the Arthur Marx/Robert Fisher play \"\" and in \"An Evening With Groucho\", which tours internationally. Are we justified in saying that \"Since childhood, the male subject of this context longed to make people laugh.\"? Yes, no, or maybe? Maybe\n###\nBarry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions. Are we justified in saying that \"Barry and Stuart are British funnymen who have performed their work around the world.\"? Yes, no, or maybe?", "doc_id": 715, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10615, 10180, 30601, 23439], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Battle of Rio de Janeiro was a battle in 1558 on the French town at Rio de Janeiro, called Henriville. The Portuguese, though in far smaller numbers, defeated the French and made them flee to the jungle. The French town was then burnt by Mem de S\u00e1, the Portuguese governor. Are we justified in saying that \"Hennville is where a fight took place between the French and Portuguese\"? Yes, no, or maybe? Yes\n###\nBellevue ( ) is a city in the Eastside region of King County, Washington, United States, across Lake Washington from Seattle. As the third-largest city in the Seattle metropolitan area, Bellevue has variously been characterized as an edge city, a boomburb, or satellite city. The population was 141,400 in a 2016 census estimate. Are we justified in saying that \"bellevue is home to the popular music festival brochella\"? Yes, no, or maybe? Maybe\n###\nHannah Kate Whelan (born 1 July 1992) is a retired British artistic gymnast who competed at the 2008 Summer Olympics and the 2012 Summer Olympics. Whelan won three European Championships medals and four British senior national titles, and was the bronze medallist in the all-around at the 2014 Commonwealth Games. Are we justified in saying that \"Hannah Kate Whelan was born on 1 July 1992.\"? Yes, no, or maybe? Yes\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system. Are we justified in saying that \"ECSU is quite diverse in its population\"? Yes, no, or maybe? Maybe\n###\nThe Wolfsonian\u2013Florida International University or The Wolfsonian-FIU, located in the heart of the Art Deco District of Miami Beach, Florida, is a museum, library and research center that uses its collection to illustrate the persuasive power of art and design. For fifteen years, The Wolfsonian has been a division within Florida International University. Are we justified in saying that \"The Wolfsonian\u2013Florida International University is a school\"? Yes, no, or maybe?", "doc_id": 228, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [769, 44102, 42238, 20959], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Loudest Engine is the third studio album of London-based Australian indie rock band Howling Bells. The album was released through Cooking Vinyl on 9\u00a0September 2011 to mostly positive reviews. It was produced by Mark Stoermer and recorded at Battle Born Studios, in Las Vegas from September to October 2010. 
Are we justified in saying that \"The Loudest Engine had exactly 17 negative reviews.\"? Yes, no, or maybe? Maybe\n###\nThe William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places. Are we justified in saying that \"The William Martin Armistead House wasn't considered Historic until 2009.\"? Yes, no, or maybe? Maybe\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden. Are we justified in saying that \"The Kyrkog\u00e5rden Runestone is a runestone located in Sweden.\"? Yes, no, or maybe? Yes\n###\nPaul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas. Are we justified in saying that \"Paul Albert Raymond Barlatier de Mas was born in october\"? Yes, no, or maybe? Yes\n###\nThe 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The 315th Group controls all operational McDonnell Douglas C-17 Globemaster III flying squadrons of the 315th Airlift Wing. It was activated in 1992, when Air Force Reserve Command implemented the Objective Wing organization. Are we justified in saying that \"The 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The capital of South Carolina is Detroit.\"? Yes, no, or maybe?", "doc_id": 718, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2979, 27552, 13420, 26914], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1967 Senior League World Series took place from August 14\u201317 in Des Moines, Iowa, United States. Westbury, New York defeated West Des Moines, Iowa in the championship game. It was New York's second straight championship. This was the final SLWS held in Des Moines. Are we justified in saying that \"It was the final SLWS held in Des Moines.\"? Yes, no, or maybe? Yes\n###\nThe 2017 Macanese general election took place on 17 September 2017 according to the provisions of the Basic Law of Macau. Out of a total of 33 seats, 14 were directly elected by universal suffrage under the highest averages method, while 12 were voted on from the Functional constituency, and 7 from nomination by the Chief Executive. Are we justified in saying that \"The 2017 Macanese general election had more voters than in 2016\"? Yes, no, or maybe? 
Maybe\n###\nMariner Books, a division of Houghton Mifflin Harcourt, was established in 1997 as a publisher of fiction, non-fiction, and poetry in paperback. Mariner is also the publisher of the Harvest imprint backlist, formerly published by Harcourt Brace/Harcourt Brace Jovanovich. Are we justified in saying that \"Tens of thousands of books have been published by Mariner Books.\"? Yes, no, or maybe? Maybe\n###\nBeilin District () is one of nine districts of Xi'an, the capital of Shanxi province, China. The well-known Small Wild Goose Pagoda is also located in the district. The smallest, but most densely populated, of Xi'an's county-level divisions, it borders the districts of Xincheng to the northeast, Yanta to the south, and Lianhu to the northwest. Are we justified in saying that \"Beilin District has some dogs living in it\"? Yes, no, or maybe? Maybe\n###\nThe Pulitzer Prize for Photography was one of the American Pulitzer Prizes annually awarded for journalism. It was inaugurated in 1942 and replaced by two photojournalism prizes in 1968: the Pulitzer Prize for Feature Photography and \"Pulitzer Prize for Spot News Photography\". The latter was renamed for Pulitzer Prize for Breaking News Photography in 2000. Are we justified in saying that \"The Pulitzer Prize for Photography became 2 separate awards in the year 2000.\"? Yes, no, or maybe?", "doc_id": 261, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28829, 26799, 39349, 16244], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015. Are we justified in saying that \"the winner of the 211 red bull manny mania amateur u.s. championship is sebo walker\"? Yes, no, or maybe? Yes\n###\nAustin Bush McHenry (September 22, 1894 \u2013 November 27, 1922) was a professional baseball player who played outfielder in the Major Leagues from 1918 to 1922 for the St. Louis Cardinals. Before his major-league promotion, he spent three seasons with the Milwaukee Brewers of the American Association. His best season in the major leagues came in 1921, when he hit .350. Are we justified in saying that \"Austin Bush McHenry is a quick person.\"? Yes, no, or maybe? Maybe\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Broadway Rose has a short opening.\"? Yes, no, or maybe? Maybe\n###\nRichard Church Thompson (October 8, 1957 \u2013 July 27, 2016) was an American illustrator and cartoonist best known for his syndicated comic strip \"Cul de Sac\" and the illustrated poem \"Make the Pie Higher\". 
He was given the Reuben Award for Outstanding Cartoonist of the Year for 2010. Are we justified in saying that \"Richard Church Thompson never drew anything.\"? Yes, no, or maybe? No\n###\nThe Cit\u00e9 du Cin\u00e9ma is a film studio complex supported by the film director and producer Luc Besson, located in Saint-Denis, north of Paris. The studio complex is intended to be a competitor of Cinecitt\u00e0 in Rome, Pinewood in London and Babelsberg in Berlin. It was inaugurated on 21 September 2012. Are we justified in saying that \"The studio complex was inaugurated less than 1000 days ago.\"? Yes, no, or maybe?", "doc_id": 835, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19803, 12825, 41812, 28003], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Malloreon is a five-part fantasy book series written by David Eddings, which follows \"The Belgariad\". The Malloreon is set in the same world as The Belgariad, but expands on several aspects of the setting, especially the eastern continent of Mallorea. Are we justified in saying that \"The Malloreon is David Eddings' best selling book series.\"? Yes, no, or maybe? Maybe\n###\nMy Little Pony: The Movie is a 1986 American animated musical fantasy film based on the Hasbro toy line, My Little Pony. Theatrically released on June 20, 1986 by De Laurentiis Entertainment Group, the film features the voices of Danny DeVito, Madeline Kahn, Cloris Leachman, Rhea Perlman and Tony Randall. Are we justified in saying that \"Hasbro's My Little Pony toys had a movie made based on them.\"? Yes, no, or maybe? Yes\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle. Are we justified in saying that \"Boon Brewery has sold millions.\"? Yes, no, or maybe? Maybe\n###\nThe Underground Man (1997) is a novel by Mick Jackson. Critically acclaimed, it was shortlisted for the Booker Prize for that year. It shows the life of an eccentric and reclusive Victorian Duke, loosely modelled on William Cavendish-Scott-Bentinck, 5th Duke of Portland. His latest scheme involves building a set of tunnels beneath his estate. Are we justified in saying that \"Mick Jackson started writing at the age of 10.\"? Yes, no, or maybe? Maybe\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949. Are we justified in saying that \"It took CUHK over a decade to get their charter from the Legislative Council of Hong Kong.\"? 
Yes, no, or maybe?", "doc_id": 820, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14212, 11318, 39607, 39620], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station. Are we justified in saying that \"Grantham North Services has been seen by Brady.\"? Yes, no, or maybe? Maybe\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy. Are we justified in saying that \"Daniel Murphy was born February 24, 1996.\"? Yes, no, or maybe? Maybe\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"JCPenny is the most popular anchor in staughton mall.\"? Yes, no, or maybe? Maybe\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro. Are we justified in saying that \"Massaro died in 1703.\"? Yes, no, or maybe? No\n###\nNomindsland is the debut album by the industrial metal band Red Harvest, released through Black Mark Production in 1992. It is notable for being their only release that could be described as thrash metal, before the band moved towards industrial metal. Are we justified in saying that \"Nomindsland is not industrial metal.\"? Yes, no, or maybe?", "doc_id": 273, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13518, 44740, 20768, 2888], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Palm Harbor Open is a defunct tennis tournament that was played on the Grand Prix tennis circuit in 1980. The event was held in Palm Harbor, Florida and was played on outdoor hard courts. 
Paul McNamee won the singles title while partnering with Paul Kronk to win the doubles title. Are we justified in saying that \" Paul McNamee and Paul Kronk reputation as players improved after winning the double \"? Yes, no, or maybe? Maybe\n###\nSqueezing Out Sparks is the fourth studio album by English musician Graham Parker and his band the Rumour. It was voted album of the year in the 1979 \"Village Voice\" Pazz & Jop Critics Poll and later ranked number 334 on \"Rolling Stone\" magazine's list of the 500 greatest albums of all time. Although the Rumour were not credited on the cover, their name was included on the album label. Are we justified in saying that \"Squeezing Out Sparks by Rumour was ranked number 444 on the Billboard list of 500 greatest albums of all time.\"? Yes, no, or maybe? No\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels. Are we justified in saying that \"Louis Glenn Marson played professional baseball for two MLB clubs. \"? Yes, no, or maybe? Yes\n###\nThe International University of Rabat or IUR is a semi-public university founded in 2010 in Morocco. It delivers double-degrees, in collaboration with foreign universities, in law, engineering, aeronautics, energy engineering, architecture, business management and political sciences. Are we justified in saying that \"Poli Sci is offered as a Major at a university somewhere in Africa. \"? Yes, no, or maybe? Yes\n###\nGun Bow (1960 \u2013 December 1979) was an American Thoroughbred racehorse. He was one of America's leading older male racehorses in 1964 and 1965 and was later inducted into the Hall of Fame. Gun Bow was noted for his rivalry with five-time American Horse of the Year Kelso. Are we justified in saying that \"Gun Bow lived to be 19 years old in human years.\"? Yes, no, or maybe?", "doc_id": 975, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19046, 32616, 45118, 24383], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Species III is a 2004 science fiction thriller television film. The film, directed by Brad Turner, is the third installment of the \"Species\" series, and stars Robin Dunne, Robert Knepper, Sunny Mabrey, Amelia Cooke and John Paul Pitoc. Natasha Henstridge, who was contracted to a trilogy commencing with the first \"Species\" film, briefly reprises the role of Eve in the opening scene. Are we justified in saying that \"The film was not delayed to 2006.\"? Yes, no, or maybe? Yes\n###\nDr. Jeckyll & Mr. Hyde was an American 1980s hip-hop group consisting of Andre \"Dr. Jeckyll\" Harrell and Alonzo \"Mr. Hyde\" Brown. The group was known for its corporate business image, wearing designer suits and ties while they rapped. The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records in 1980. 
Are we justified in saying that \"Harlem World Crew wasn't a good fit for the band\"? Yes, no, or maybe? Yes\n###\n\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode. Are we justified in saying that \"Kristen Bell was in the pilot for Heroes\"? Yes, no, or maybe? No\n###\nThe Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue. Are we justified in saying that \"The Sky Bar roof top patio and the Drake Underground basement nightclub are located at opposite ends of the building when considering the vertical axis.\"? Yes, no, or maybe? Yes\n###\nThe Albany Great Danes men's lacrosse team represents the University at Albany in NCAA Division I men's college lacrosse. Albany currently competes in the America East Conference and plays its home games on John Fallon Field. The team has reached the NCAA Men's Lacrosse Championship tournament nine times. The Great Danes are currently coached by Scott Marr. Are we justified in saying that \"The Great Danes have beaten other lacrosse teams many times.\"? Yes, no, or maybe?", "doc_id": 109, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22611, 36740, 8072, 31783], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Allen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"Allen S. Weiner used to work at Stanford.\"? Yes, no, or maybe? Yes\n###\nThe McLaren MP4/1 (initially known as the MP4) was a Formula One racing car produced by the McLaren team. It was used during the 1981, 1982 and 1983 seasons. It was the first Formula One car to use a carbon fibre composite monocoque, a concept which is now ubiquitous. Are we justified in saying that \"The McLaren team were not proud of the McLaren MP4/1 as it lost races\"? Yes, no, or maybe? Maybe\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"The Home Depot first opened in a small town\"? Yes, no, or maybe? Maybe\n###\nBianca Gascoigne (born 28 October 1986) is a British glamour model and television personality. 
She is the daughter of Sheryl Gascoigne, and adopted daughter of Paul Gascoigne, a former footballer. She has a brother Mason and a half-brother Regan Gascoigne. She came sixth in the nineteenth series of Channel 5 reality show \"Celebrity Big Brother\". Are we justified in saying that \"Bianca Gascoigne came 1st in Celebrity Big Brother\"? Yes, no, or maybe? No\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi. Are we justified in saying that \"NASA John H. Glenn Research Center is located near a large river.\"? Yes, no, or maybe?", "doc_id": 472, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [946, 24728, 37846, 12285], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marvin Ivan \"Buck\" Barrow (March 14, 1903 \u2013 July 29, 1933) was a member of the Barrow Gang. He was the older brother of the gang's leader, Clyde Barrow. He and his wife Blanche were wounded in a gun battle with police four months after they joined up with Bonnie and Clyde. Marvin died of his wounds. Are we justified in saying that \"marvin buck was born in a wealthy family\"? Yes, no, or maybe? Maybe\n###\nTwo Men And A Truck is a franchised moving company, headquartered in Lansing, Michigan, with franchises in 41 U.S. states, as well as the United Kingdom, Canada, and Ireland. The company is the largest franchised moving company in the United States with more than 410 locations. Are we justified in saying that \"Two Men And A Truck owns at least one truck.\"? Yes, no, or maybe? Yes\n###\nWilliston Municipal Airport (FAA LID: X60) is a city-owned, public-use airport located two\u00a0nautical miles (4\u00a0km) southwest of the central business district of Williston, a city in Levy County, Florida, United States. Commonly referred to as Williston Airport, it is located 23 mi southwest of Gainesville Regional Airport (GNV). Opened in 1974 for public use, it does not have a control tower. Are we justified in saying that \"Control towers are pleasant to look at.\"? Yes, no, or maybe? Maybe\n###\nThe Tesla Science Center at Wardenclyffe is a nonprofit organization established to develop a regional science and technology center at the site of Nikola Tesla's former Wardenclyffe laboratory on Long Island, New York. The center raised money through crowdfunding to purchase the property. Are we justified in saying that \"Nikola Tesla's former Wardenclyffe laboratory was the first laboratory in New York State\"? Yes, no, or maybe? Maybe\n###\nAlbert Woolley (26 September 1902 \u2013 5 January 1978) was an English cricketer active in 1926 who played for Lancashire. He was born in Salford and died in Doncaster. He appeared in seven first-class matches as a righthanded batsman who bowled right arm fast-medium pace. He scored 61 runs with a highest score of 24 and held nine catches. 
He took eleven wickets with a best analysis of four for 56. Are we justified in saying that \"Lancashire scored 26 runs in 1978.\"? Yes, no, or maybe?", "doc_id": 317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34564, 44139, 34246, 3629], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sh\u0101h Mahm\u016bd Hotak, (Pashto, Dari: ), also known as Sh\u0101h Mahm\u016bd Ghilj\u012b (Pashto: \u0634\u0627\u0647 \u0645\u062d\u0645\u0648\u062f \u063a\u0644\u062c\u064a\u200e ) (lived 1697 \u2013 April 22, 1725), was an Afghan ruler of the Hotak dynasty who overthrew the heavily declined Safavid dynasty to briefly become the king of Persia from 1722 until his death in 1725. Are we justified in saying that \"Sh\u0101h Mahm\u016bd Hotak was born before 1700.\"? Yes, no, or maybe? Yes\n###\nFoaly is a fictional character in the Artemis Fowl series written by Eoin Colfer. He is the technical consultant to the Lower Elements Police (LEP). He is the most intelligent centaur on and under the Earth, considers himself to be an unappreciated genius, and is the inventor of most of the advanced technology possessed by the fairy world, rivaled only by Opal Koboi. Are we justified in saying that \"LEP is based on a real task force\"? Yes, no, or maybe? Maybe\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships. Are we justified in saying that \"Angus Scott was born in Scotland\"? Yes, no, or maybe? Maybe\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898. Are we justified in saying that \"Grimsby Town Football Club changed its name due to a copyright issue\"? Yes, no, or maybe? Maybe\n###\nDaraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\". Are we justified in saying that \"Arbaaz Khan beat another actor for the Filmfare Best Villain Award for his debut film.\"? Yes, no, or maybe?", "doc_id": 581, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22542, 44756, 15132, 5437], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "On 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The accused would never harm a dog-owner.\"? Yes, no, or maybe? No\n###\nFast-moving consumer goods (FMCG) or consumer packaged goods (CPG) are products that are sold quickly and at relatively low cost. Examples include non-durable goods such as packaged foods, beverages, toiletries, over-the-counter drugs and many other consumables. In contrast, durable goods or major appliances such as kitchen appliances are generally replaced over a period of several years. Are we justified in saying that \"one example is a skateboard.\"? Yes, no, or maybe? No\n###\nKlagenfurt am W\u00f6rthersee (] ; Slovene: \"Celovec ob Vrbskem jezeru\" , Italian: \"Clanforte\" , Friulian: \"Clanfurt\" ) is the capital of the federal state of Carinthia in Austria. With a population of 99,100, it is the sixth-largest city in the country. The city is the bishop's seat of the Roman Catholic Diocese of Gurk-Klagenfurt and home to the Alpen-Adria-Universit\u00e4t Klagenfurt. Are we justified in saying that \"Klagenfurt am Worthersee will eventually become the 8th largest city\"? Yes, no, or maybe? Maybe\n###\nStanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh. Are we justified in saying that \"Stanley Anthony Woods played football at a Pittsburgh college\"? Yes, no, or maybe? Yes\n###\nRipponlea is an inner suburb of Melbourne, Victoria, Australia, named after the adjoining Rippon Lea Estate. It is 7\u00a0km south east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, Ripponlea had a population of 1,478. Are we justified in saying that \"8 years ago, Ripponlea had a population of about seventeen hundred\"? Yes, no, or maybe?", "doc_id": 360, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4342, 14804, 43060, 4919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Probert-Price Collection is a collection of items from the Probert-Price estate, primarily hundreds of vintage dresses which belonged to Renee Probert-Price, original It girl and well-known London socialite of her time (1917-2013). 
Renee died in 2013 aged 96, and left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great niece and goddaughter. Are we justified in saying that \"Renee's heirs got a lot of items after her passing.\"? Yes, no, or maybe? Yes\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation. Are we justified in saying that \"Paul Hausser became a good person near the end of his life\"? Yes, no, or maybe? Maybe\n###\nJames Brandon (born 20 September 1980) is a British journalist, who was kidnapped in Iraq during 2004 while on assignment from the \"Sunday Telegraph\" and \"The Scotsman\", covering the occupation and insurgency. He was kidnapped by the Mahdi Army militia and was released after one day. Are we justified in saying that \"The Mahdi Army militia was part of the insurgency.\"? Yes, no, or maybe? Yes\n###\nMike Hoffman (born September 20, 1980) is an American former professional ice hockey player. After leaving the University of Connecticut in 2003, he began his first pro season playing with the Worcester IceCats in the AHL and the Peoria Rivermen of the ECHL. He signed a professional contract with the Toronto Maple Leafs in 2005, but he has never played in the National Hockey League. Are we justified in saying that \"Mike Hoffman is a Spanish ice hockey player.\"? Yes, no, or maybe? No\n###\nThe Samsung Galaxy Tab 7.7 is a tablet computer of a series of Android-based tablet computer produced by Samsung, introduced on 1 September 2011 at IFA in Berlin. Related models are the Galaxy Tab 7.0 Plus, Samsung Galaxy Tab 2 7.0, and Samsung Galaxy Tab 3 7.0. Are we justified in saying that \"The Samsung Galaxy 7.7 came out on the market following the release of related models Galaxy Tab 7.0 Plus, Samsung Galaxy Tab 2 7.0, and Samsung Galaxy Tab 3 7.0, all Android-based tablets. Samsung will introduce a Galaxy Tab in Berlin on 1 September, 2111. \"? Yes, no, or maybe?", "doc_id": 480, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38715, 25282, 5974, 9280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christian Darcy Bisson (born August 14, 1989) is a Canadian professional baseball second baseman in minor league baseball organization of the San Diego Padres of Major League Baseball. Prior to beginning his professional career, he played college baseball at the University of Kentucky. Bisson has also competed for the Canadian national baseball team. Are we justified in saying that \"Bisson has fans all over the globe. \"? Yes, no, or maybe? Maybe\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons. 
Are we justified in saying that \"The Grand Prix des Fronti\u00e8res was held at a street circuit \"? Yes, no, or maybe? Yes\n###\nThe Ghost & Mrs. Muir is an American sitcom based on the 1947 film of the same name, which was based on the 1945 novel by R. A. Dick. It premiered in September 1968 on NBC. After NBC canceled the series, it aired on ABC for one season before being canceled a final time. The program is currently seen weekday mornings on the digital subchannel \"GetTV.\" Are we justified in saying that \"The Ghost & Mrs. Muir movie was a failure\"? Yes, no, or maybe? Maybe\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts. Are we justified in saying that \"Phacelia mutabilis plants are everywhere in the western states\"? Yes, no, or maybe? Maybe\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song. Are we justified in saying that \"The cars had three albums.\"? Yes, no, or maybe?", "doc_id": 295, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42740, 44190, 37797, 40796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Oxford Companion to Beer, abbreviated OCB, is a book in the series of Oxford Companions published by Oxford University Press. The book provides an alphabetically arranged reference to beer, compiled and edited by Garrett Oliver with a foreword by U.S. chef Tom Colicchio. Published in 2011, the work draws on 166 contributors from 24 countries to amass over 1,100 entries on beer. Are we justified in saying that \"Oxford University Press is a beer publisher\"? Yes, no, or maybe? No\n###\nVitamin C, also known as ascorbic acid and -ascorbic acid, is a vitamin found in food and used as a dietary supplement. As a supplement it is used to treat and prevent scurvy. Evidence does not support use in the general population for the prevention of the common cold. It may be taken by mouth or by injection. Are we justified in saying that \"Vitamin C occurs naturally in the body\"? Yes, no, or maybe? Maybe\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery. Are we justified in saying that \"The Anchor Bankside tavern features fresh fish caught daily and flown in.\"? Yes, no, or maybe? 
Maybe\n###\nCarolyn Keene is the pseudonym of the authors of the Nancy Drew mystery stories and The Dana Girls mystery stories, both produced by the Stratemeyer Syndicate. In addition, the Keene pen name is credited with the Nancy Drew spin-off, \"River Heights and the Nancy Drew Notebooks. Are we justified in saying that \"Carolyn Keene is a writer that appeals to young men.\"? Yes, no, or maybe? Maybe\n###\nThe 1974 Atlanta Braves season was the ninth season in Atlanta along with the 104th season as a franchise overall. The team finished third in the National League West with a record of 88\u201374, 14 games behind the Los Angeles Dodgers. During the season, Braves outfielder Hank Aaron became the all-time career leader in home runs, surpassing Babe Ruth. Are we justified in saying that \"The Atlanta Braves began playing in Atlanta in 1973.\"? Yes, no, or maybe?", "doc_id": 157, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24023, 28243, 4158, 12152], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Clay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan. Are we justified in saying that \"As of the 2011 census, the population was 7,861.\"? Yes, no, or maybe? Maybe\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Circus Palestine is a music video.\"? Yes, no, or maybe? No\n###\nMaya & Marty was an American television variety show that premiered on NBC on May 31, 2016 and lasted one season. The series was co-hosted by comedians Maya Rudolph and Martin Short, and produced by Lorne Michaels. The show features various comedy sketches, musical performances, and celebrity guests. Are we justified in saying that \"Martin Scorsese was a guest on Maya & Marty.\"? Yes, no, or maybe? Maybe\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release. Are we justified in saying that \"The Sloppy Maneater's were a popular band\"? Yes, no, or maybe? Maybe\n###\nJohn Gilbert (born John Cecil Pringle; July 10, 1899 \u2013 January 9, 1936) was an American actor, screenwriter and director. 
He rose to fame during the silent film era and became a popular leading man known as \"The Great Lover\". At the height of his career, Gilbert rivaled Rudolph Valentino, another silent film era leading man, as a box office draw. Are we justified in saying that \"Actor John Gilbert rose to fame during the first world war.\"? Yes, no, or maybe?", "doc_id": 111, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42114, 29871, 28679, 932], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Thomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330. Are we justified in saying that \"Thomas Cooper played football for 25 years.\"? Yes, no, or maybe? No\n###\nThe Outsiders are a professional wrestling tag team consisting of Kevin Nash and Scott Hall, best known for their first appearances in World Championship Wrestling (WCW) in 1996. They later teamed also in the World Wrestling Federation (WWF), Total Nonstop Action Wrestling (TNA), and Pro Wrestling ZERO1-MAX. Are we justified in saying that \"They competed in the WCW last century\"? Yes, no, or maybe? Yes\n###\nSeaWorld Ohio was a park in the SeaWorld chain of marine animal theme parks. The park opened in 1970 directly across the lake and less than one mile from Geauga Lake Park in Aurora, Ohio, United States. The small lake separated the two parks. Wildwater Kingdom, a small waterpark built by Cedar Fair in 2005, occupied the property until it closed in 2016. Are we justified in saying that \"SeaWorld Ohio was more than a mile from Geauga Lake Park in Aurora, Ohio, United States.\"? Yes, no, or maybe? No\n###\nRefried beans (Spanish: \"frijoles refritos\") is a dish of cooked and mashed beans and is a traditional staple of Mexican and Tex-Mex cuisine, although each cuisine has a different approach when making the dish. Refried beans are also popular in many other Latin American countries. Are we justified in saying that \"Refried beans are made with black beans.\"? Yes, no, or maybe? Maybe\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent. Are we justified in saying that \"Doomsday Device was creating by wreslter's\"? Yes, no, or maybe?", "doc_id": 312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16874, 13201, 36550, 40561], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Clay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan. Are we justified in saying that \"Clay County has a football team.\"? Yes, no, or maybe? Maybe\n###\nEastland Mall is an enclosed shopping mall in Columbus, Ohio. Opened in 1968, it no longer has any open anchor stores. Its four vacant anchors were originally occupied by Lazarus, Kaufmann's (later Macy's), Sears, and JC Penney. The mall is managed by Woodmont Management. Are we justified in saying that \"The Westland Mall is managed by Woodmont Management\"? Yes, no, or maybe? No\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse. Are we justified in saying that \"Once Upon a Time premiered over 6 years ago\"? Yes, no, or maybe? Yes\n###\nDarrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include: Are we justified in saying that \"Darrell Abbott played guitar in 1975.\"? Yes, no, or maybe? Maybe\n###\nJ\u00e1nos G\u00e1lv\u00f6lgyi (born 26 May 1948) is a Hungarian actor and comedian. First appearing in 1968's Ki Mit Tud? talent show, he gained national fame for making numerous comedy sketches in the Hungarian National Television, becoming one of the best known comedy actors in the country. Are we justified in saying that \"J\u00e1nos G\u00e1lv\u00f6lgyi first appeared on a talent show twenty years after the year he was born.\"? Yes, no, or maybe?", "doc_id": 780, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19900, 39722, 187, 36309], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Carrier Sekani Tribal Council (familiarly known as CSTC) is a tribal council representing eight First Nations in the Central Interior of British Columbia. It was originally known as the \"Lakes District Tribal Council\". The CSTC was incorporated in 1979 and is a registered non-profit society. Are we justified in saying that \"CSTC issues zoning laws.\"? Yes, no, or maybe? Maybe\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. 
Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage. Are we justified in saying that \"The transmitter that broadcasts KDMD is located Eagle River, about 40 miles from Anchorage.\"? Yes, no, or maybe? Maybe\n###\nRohan Bopanna and Daniel Nestor were the defending champions, but chose not to compete together. Bopanna played alongside Florin Mergea, but lost in the first round to Nestor and Radek \u0160tep\u00e1nek. Nestor and \u0160tep\u00e1nek lost in the quarterfinals to \u0141ukasz Kubot and Marcin Matkowski.
Are we justified in saying that \"Nestor took the loos very poorly\"? Yes, no, or maybe? Maybe\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers. Are we justified in saying that \"Seven Ways from Sundown takes place in Texas.\"? Yes, no, or maybe? Maybe\n###\nWJMF-LP is a low-power television station in Jackson, Mississippi. The station, which currently operates on Channel 6, is owned by Rainey Radio. The station currently acts as a radio station broadcasting a Oldies & Classic Hits format as \"EZ 87.7\", taking advantage of that station's audio signal on 87.75 MHz FM. Are we justified in saying that \"WJMF-LP is in the Northern Hemisphere of the globe\"? Yes, no, or maybe?", "doc_id": 803, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12898, 43754, 765, 7804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Terry Butler is an American bassist who currently performs with the death metal bands Obituary and Massacre. He was also a member of Six Feet Under and Death. He was credited on the Death album \"Spiritual Healing\", and band leader Chuck Schuldiner stated that on the latter Death album \"Terry contributed to the songwriting as well\". Are we justified in saying that \"Terry Butler loves dogs\"? Yes, no, or maybe? Maybe\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Chris McKendry is a member of Gen X.\"? Yes, no, or maybe? Yes\n###\nEarly flying machines include all forms of aircraft studied or constructed before the development of the modern aeroplane by 1910. The story of modern flight begins more than a century before the first successful manned aeroplane, and the earliest aircraft thousands of years before. Are we justified in saying that \"The modern aeroplane is the earliest form of aircraft.\"? Yes, no, or maybe? No\n###\nHudson Valley Community College, a SUNY associated two-year college, is located in Troy in Rensselaer County, New York. Although about eighty percent of the students are from the local area, the remainder are from other parts of New York, other states and from some 30 countries around the world. Are we justified in saying that \"About eighty percent of students at Hudson Valley Community College live within walking distance from the campus\"? Yes, no, or maybe? Maybe\n###\nAmanda Knox is a 2016 American documentary film about Amanda Knox, twice convicted and later acquitted of the 2007 murder of Meredith Kercher, directed by Rod Blackhurst and Brian McGinn. 
It premiered at the Toronto International Film Festival on September 10, 2016 and on Netflix on September 30, 2016. Are we justified in saying that \"Amanda got away with murder.\"? Yes, no, or maybe?", "doc_id": 247, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3052, 32346, 7746, 6200], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Many science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list. Are we justified in saying that \"Comedy fiction works gives us many predictions about what will happen in the 21st century. \"? Yes, no, or maybe? No\n###\nUtamaro and His Five Women or Five Women Around Utamaro (Japanese: \u6b4c\u9ebf\u3092\u3081\u3050\u308b\u4e94\u4eba\u306e\u5973 , Hepburn: Utamaro o meguru gonin no onna ) is a 1946 Japanese film directed by Kenji Mizoguchi. It is based on the novel of the same title by Kanji Kunieda, itself a fictionalized account of the life of printmaker Kitagawa Utamaro. It was Mizoguchi's first film made under the American occupation. Are we justified in saying that \"Kanji Kunieda novel was released in 1940\"? Yes, no, or maybe? Maybe\n###\nHi! Pristin (stylized as HI! PRISTIN) is the debut mini-album by South Korean girl group Pristin. It was released on March 21, 2017, by Pledis Entertainment, and distributed by LOEN Entertainment. The EP consists of six songs, including the singles \"Wee Woo\" and \"Black Widow\". In order to promote the album, the group performed on several Korean music shows. Are we justified in saying that \"Pristin released their first album during the second decade of the 21st century\"? Yes, no, or maybe? Yes\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999. Are we justified in saying that \"\"The Land of Fear\" was well received by critics.\"? Yes, no, or maybe? Yes\n###\nMaros Water Park is one of water park existing in Maros district, South Sulawesi and was built and ready to soft launch on October 2009. It is planned to be opened in January 2013. It contains outbound area, cottages, restaurant, mini water park, semi olympic pool and body slide. It is surrounded with natural hills, fresh water on site, and with a couple of caves. Are we justified in saying that \"the park never opened\"? 
Yes, no, or maybe?", "doc_id": 781, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9639, 19669, 37166, 17413], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bad Family () is a South Korean television series starring Kim Myung-min, Nam Sang-mi, Im Hyun-sik, Yeo Woon-kay, Kang Nam-gil, Geum Bo-ra, Kim Heechul and Lee Young-yoo. It aired on SBS from March 22 to May 11, 2006 on Wednesdays and Thursdays at 21:55 for 16 episodes. Are we justified in saying that \"Kim Myung-min was in 2 of the 16 episodes.\"? Yes, no, or maybe? Maybe\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"Greatest Hits Volume 1 was not released in 1969\"? Yes, no, or maybe? Yes\n###\nThe 2015\u201316 Dartmouth Big Green men's basketball team represented Dartmouth College during the 2015\u201316 NCAA Division I men's basketball season. The Big Green, led by sixth-year head coach Paul Cormier, played their home games at Leede Arena in Hanover, New Hampshire and were members of the Ivy League. The Big Green finished the season 10\u201318, 4\u201310 in Ivy League play to finish in sixth place. Are we justified in saying that \"Ivy League basketball teams are not as good as other basketball teams at the college level.\"? Yes, no, or maybe? Maybe\n###\nThis is a list of United States Air Force test squadrons. It covers units considered to be part of the Air Force and serves as a break out of the comprehensive List of United States Air Force squadrons. Most units in this list are assigned to Air Force Materiel Command, however, a few reside in other Major Commands of the United States Air Force. Are we justified in saying that \"The list is publicly available.\"? Yes, no, or maybe? Maybe\n###\nNigel Edward Povah (born 17 July 1952 in Wandworth, London) is a British chess player. He is an International Master at over-the-board chess and a grandmaster at correspondence chess. Povah is the author of \"Chess Training\". He is reckoned to be the UK's strongest correspondence chess player since Jonathan Penrose. Povah has one son, Jonathan Povah. Are we justified in saying that \"Povah is seen as a better player than Penrose.\"? Yes, no, or maybe?", "doc_id": 378, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9392, 42729, 34305, 7075], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Girdler sulfide (GS) process, also known as the GeibSpevack (GS) process, is an industrial production method for filtering out of natural water the heavy water (deuterium oxide = DO) which is used in particle research, in Deuterium NMR spectroscopy, deuterated solvents for proton NMR spectroscopy, in heavy water nuclear reactors (as a coolant and moderator) and in deuterated drugs. Are we justified in saying that \"The Girdler sulfide process has made the filtering of heavy water profitable.\"? Yes, no, or maybe? Maybe\n###\nThe ECW World Tag Team Championship was a professional wrestling world tag team championship in Extreme Championship Wrestling (ECW). It was introduced in National Wrestling Alliance (NWA) affiliate and ECW precursor, Eastern Championship Wrestling in 1992, but was established under ECW in 1994. Are we justified in saying that \"The ECW World Tag Team Championship was a special innovation from the ECW\"? Yes, no, or maybe? Yes\n###\nPassion Play is a 2010 American drama film written and directed by Mitch Glazer, executive produced by Rebecca Wang and starring Mickey Rourke, Megan Fox, Rhys Ifans and Bill Murray. Filming for the production began in December 2009 and is presented by Rebecca Wang Entertainment. It premiered at the 2010 Toronto International Film Festival. Are we justified in saying that \"Rhys Ifans movie premiered at the Film Festival.\"? Yes, no, or maybe? Yes\n###\nJack Tate is a Republican legislator in the U.S. State of Colorado. He represents Senate District 27 in the Denver Metro Area, which encompasses parts of unincorporated Arapahoe County, the City of Centennial, and the town of Foxfield. He serves on the Senate Local Government, the Senate Business, Labor & Technology, and Joint Technology committees. Are we justified in saying that \"Jack Tate is a Republican legilator is the State of Kentucky.\"? Yes, no, or maybe? No\n###\n\"It's Not Right but It's Okay\" is a song by American singer Whitney Houston, from her fourth studio album, \"My Love Is Your Love\". It was written by LaShawn Daniels, Rodney Jerkins, Fred Jerkins III, Isaac Phillips, Toni Estes, and produced by Darkchild. The song examines a woman confronting her lover about his infidelity. Are we justified in saying that \"Rodney Jerkins confronted his lover about their infidelity.\"? Yes, no, or maybe?", "doc_id": 622, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15471, 44786, 4313, 25556], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ryman Auditorium (formerly Grand Ole Opry House and Union Gospel Tabernacle) is a 2,362-seat live performance venue, located at 116 5th Avenue North, in Nashville, Tennessee and is best known as the home of the \"Grand Ole Opry\" from 1943 to 1974. 
It is owned and operated by Ryman Hospitality Properties, Inc. Are we justified in saying that \"Ryman Auditorium is expected to exist in the year 2362.\"? Yes, no, or maybe? Maybe\n###\nJ\u00fcrgen Melzer (born 22 May 1981 in Vienna) is an Austrian tennis player. He reached a career-high singles ranking of world No. 8 in April 2011, and a doubles ranking of world No. 6 in September 2010. He is a left-handed tennis player, but is right-handed in everyday life. He has a younger brother, Gerald Melzer, with whom he has played doubles in several tournaments. Are we justified in saying that \"J\u00fcrgen Melzer starts with an A.\"? Yes, no, or maybe? No\n###\nNew Hampshire Route 202A (abbreviated NH\u00a0202A) is a 14.639 mi east\u2013west state highway in Strafford and Rockingham counties in southeastern New Hampshire. The western terminus is in Northwood at U.S. Route\u00a0202 and New Hampshire\u00a09, near their intersection with U.S. Route\u00a04. Its eastern terminus is in downtown Rochester at New Hampshire Route\u00a0108 and New Hampshire Route\u00a0125. Are we justified in saying that \"NH 202A runs north south\"? Yes, no, or maybe? No\n###\nThe Kid from Left Field is a 1953 baseball film starring Dan Dailey, Anne Bancroft, Lloyd Bridges, and Billy Chapin. The film marked the reunion of Dailey and director Harmon Jones who had teamed up at 20th Century Fox a year earlier in another baseball film, the biographical \"The Pride of St. Louis\". Are we justified in saying that \"Dan Daily was paid more than his coworkers in the movie\"? Yes, no, or maybe? Maybe\n###\nPhakisa Freeway is a motor racing circuit located in Odendaalsrus, South Africa. From 1999 to 2004, the venue hosted the South African motorcycle Grand Prix of the MotoGP championship. It has a capacity of 60,000 spectators and opened in 1999. The track has a 4.24\u00a0km road course and a 1.5 mi oval course. The oval track is an exact copy of Las Vegas Motor Speedway from 1997. Are we justified in saying that \"Phakisa Freeway opened before 1998.\"? Yes, no, or maybe?", "doc_id": 15, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15250, 3413, 10160, 6276], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier. Are we justified in saying that \"Forest Friends has aired on multiple networks.\"? Yes, no, or maybe? Yes\n###\nHomicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run. 
Are we justified in saying that \"While he worked on both, it is unknown if it was exclusive.\"? Yes, no, or maybe? Maybe\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC. Are we justified in saying that \"SNOBOL5 was the final in a series of computer programming languages developed between 1962 and 1967.\"? Yes, no, or maybe? No\n###\nYou Can Be Anyone This Time Around is an album by Timothy Leary released in 1970. The disc features three \"raps\" by Leary backed with psychedelic music. The purpose of the album was to raise funds for Leary's political candidacy for Governor of California. Are we justified in saying that \"You Can Be Anyone This Time Around was released more than 17 years ago.\"? Yes, no, or maybe? Yes\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks. Are we justified in saying that \"\"Whatever the Case May Be\" first aired in a year that had the number 4 in it. \"? Yes, no, or maybe?", "doc_id": 724, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3219, 35209, 6004, 17320], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Nigeria U-20 men's national soccer team, also known as the Nigeria Under-20s or nicknamed the \"Flying Eagles\", is the youth team for national soccer in Nigeria. It plays a large role in the development of Nigerian soccer, and is considered to be the feeder team for the Nigeria men's national soccer team and is controlled by the Nigeria Football Federation. Are we justified in saying that \"The Nigeria Under-20s have had superstar caliber players.\"? Yes, no, or maybe? Maybe\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter. Are we justified in saying that \"Colin Francis Weeber Isaacs represented the riding of Wentworth for two years\"? Yes, no, or maybe? Yes\n###\nDeliver Us Tour was a concert tour by band Darkest Hour, taking place from late 2007, in support of their fifth studio album \"Deliver Us\" and finishing in December 2008. The tour started shortly after the Undoing Ruin Tour ended, earlier in December 2006. Are we justified in saying that \"The Undoing Ruin Tour was the previous tour to \"Deliver Us Tour\"\"? Yes, no, or maybe? Yes\n###\nCarlyle Eubank is an American writer and screenwriter. 
His 2014 film \"The Signal\", starring Laurence Fishburne, Brenton Thwaites, and Olivia Cooke, premiered at the 2014 Sundance Film Festival and was released in US theaters on June 13 by Focus Features. Are we justified in saying that \"\"The Signal\" was shown at Sundance in advance of its theatrical debut.\"? Yes, no, or maybe? Yes\n###\nOasis was a literary anthology published in Cairo during World War II. It was edited by Denis Saunders, David Burk, and Victor Selwyn. The introduction was written by General Henry Maitland Wilson, who was at this time Commander-in-Chief of the Middle East. Are we justified in saying that \"Oasis had two people whose names started with the letter D who had worked on editing it.\"? Yes, no, or maybe?", "doc_id": 300, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1895, 41436, 33948, 25619], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Young Girl on a Chair is a 1955 bronze sculpture by Giacomo Manz\u00f9, installed at the Hirshhorn Museum and Sculpture Garden in Washington, D.C. The work measures 45 x 23\u00a03/8 x 43\u00a03/4 inches and depicts a nude young girl with her arms rested in her lap. Are we justified in saying that \"Giacomo Manzu was the first to create a scultpute depicting a naked young girl\"? Yes, no, or maybe? Maybe\n###\nJohn M. W. Moorlach (born December 21, 1955 in the Netherlands) is a Republican California State Senator representing 37th Senate district, which includes portions of Orange County, since March 22, 2015. He previously served on the Orange County Board of Supervisors from December 5, 2006 \u2013 January 5, 2015 and as Orange County Treasurer-Tax Collector from March 17, 1995 \u2013 December 5, 2006. Are we justified in saying that \"The senate gained a republican seat when John M. W. Moorlach became senator.\"? Yes, no, or maybe? Maybe\n###\nThe Combat Box was a tactical formation used by heavy (strategic) bombers of the U.S. Army Air Forces during World War II. The combat box was also referred to as a \"staggered formation\". Its defensive purpose was in massing the firepower of the bombers' guns, while offensively it concentrated the release of bombs on a target. Are we justified in saying that \"Its offensive purpose was in massing the firepower of the bombers' guns, while defensively it concentrated the release of bombs on a target. \"? Yes, no, or maybe? No\n###\nAirline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya. Are we justified in saying that \"Irene Koki Mutungi will receive a Nobel Prize in 2020\"? Yes, no, or maybe? Maybe\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. 
It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life. Are we justified in saying that \"The Health For All program goal of the WHO was started in the 1970s.\"? Yes, no, or maybe?", "doc_id": 445, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36010, 21457, 512, 5365], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Regent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day. Are we justified in saying that \"There are plans underway to produce oil by 2028.\"? Yes, no, or maybe? Maybe\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000. Are we justified in saying that \"The 2012 Sun Life Financial Players' Championship was seen by Trump.\"? Yes, no, or maybe? Maybe\n###\nThe Secret Garden is the 1987 Hallmark Hall of Fame TV film adaptation of the novel \"The Secret Garden\", aired on CBS November 30, 1987 and produced by Rosemont Productions Limited, who also produced \"Back to the Secret Garden\". The film stars Barret Oliver, Jadrien Steele, Billie Whitelaw and Sir Derek Jacobi. Are we justified in saying that \"the secret garden is a novel \"? Yes, no, or maybe? Yes\n###\n\"Thank You\" is the third single by heavy metal band Hellyeah from their debut album \"Hellyeah\". The song is a tribute to all of the band's recently departed family members: Vinnie Paul's brother Dimebag Darrell, Tom Maxwell's mother, and Chad Gray's grandmother. The song reached #37 on the \"Billboard\" Hot Mainstream Rock Tracks chart. Are we justified in saying that \"The single Thank You reached into the top 50 on the Billboard Chart\"? Yes, no, or maybe? Yes\n###\nSt. Petersburg is a city in Pinellas County, Florida, United States. As of the 2015 census estimate, the population was 257,083, making it the fifth-most populous city in Florida and the largest in the state that is not a county seat (the city of Clearwater is the seat of Pinellas County). Are we justified in saying that \"St. Petersburg is not a city in the Central United States.\"? Yes, no, or maybe?", "doc_id": 328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43405, 41522, 33098, 34165], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Science in History is a four-volume book by scientist and historian John Desmond Bernal, published in 1954. It was the first comprehensive attempt to analyse the reciprocal relations of science and society throughout history. It was originally published in London by Watts. There were three editions up to 1969 an. It was republished by MIT Press in 1971 and is still in print. Are we justified in saying that \"Science in History has three words.\"? Yes, no, or maybe? Yes\n###\nAlex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\". Are we justified in saying that \"Alex Rider was not featuring short stories written by Horowitz.\"? Yes, no, or maybe? No\n###\nMike Cvik (born July 6, 1962) is a Canadian former National Hockey League linesman, who wore uniform number #88. At 6 foot, 9 Inches, Cvik is as tall as the NHL's tallest player, Zdeno Ch\u00e1ra. He has worked more than 1800 NHL games, including his highlights such as the gold medal game at the 2002 Winter Olympics, the NHL All-Star Game and the Stanley Cup Playoffs. Are we justified in saying that \"While Zdeno Chara is the tallest NHL player, Mike Cvik is not one of the taller players.\"? Yes, no, or maybe? No\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks. Are we justified in saying that \"Jack Bender did not write the 12th episode of lost, that was Damon Lindelof, Bender only directed.\"? Yes, no, or maybe? Yes\n###\nQueen Mother Dorji Wangmo (Dzongkha: \u0f62\u0fa1\u0f7c\u0f0b\u0f62\u0f97\u0f7a\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f58\u0f7c\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f55\u0fb1\u0f74\u0f42\u0f0b; Wylie: \"Rdo-rje Dbang-mo Dbang-phyug\") (December 29, 1955, second daughter of \"Yab\" Ugyen Dorji and \"Yum\" Thuji Zam) is a former queen of Bhutan and first wife of former King Jigme Singye Wangchuck, who is married to four sisters all of whom were entitled to be called queen. Are we justified in saying that \"Queen Mother Dorji Wangmo was born more than 3334 days ago.\"? Yes, no, or maybe?", "doc_id": 24, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22666, 34102, 878, 16069], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. 
He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\". Are we justified in saying that \"Alex Rider showed three short stories written by Horowitz.\"? Yes, no, or maybe? Yes\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Tinker Field honored a player\"? Yes, no, or maybe? Yes\n###\nWinnie the Pooh and a Day for Eeyore is a 1983 Disney Winnie the Pooh animated featurette, based on two chapters from the books \"Winnie-the-Pooh\" and \"The House at Pooh Corner\", originally released theatrically on March 25, 1983, with the 1983 re-issue of \"The Sword in the Stone\". It is the fourth and final of Disney's original theatrical featurettes adapted from the Pooh books by A. A. Milne. Are we justified in saying that \"The Sword and the Stone was more popular.\"? Yes, no, or maybe? Maybe\n###\nInternational Cycling Classic, also known as the Point Premium Root Beer or simply SuperWeek, was a 17-race series over 17 days open to licensed amateur and professional cyclists. The series took place primarily in the area surrounding Milwaukee, Wisconsin. Are we justified in saying that \"There were more than 17 races in the International Cycling Classic\"? Yes, no, or maybe? No\n###\nThe Tragedy of Julius Caesar is a tragedy by William Shakespeare, believed to have been written in 1599. It is one of several plays written by Shakespeare based on true events from Roman history, which also include \"Coriolanus\" and \"Antony and Cleopatra\". Are we justified in saying that \"Julius Cesar was written was written after the birth of Christ\"? Yes, no, or maybe?", "doc_id": 474, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41410, 13625, 11337, 11466], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\". Are we justified in saying that \"James Wyatt designed the roleplaying game \"Dungeons & Dragons\".\"? Yes, no, or maybe? Yes\n###\nJonathan Erlich and Andy Ram were the defending champions, but Erlich chose not to participate due to an elbow injury, and only Ram competed that year.Ram partnered with Max Mirnyi, but lost to Feliciano L\u00f3pez and Fernando Verdasco in the second round. Are we justified in saying that \"Max Mirnyi and Andy Ram have partnered before this.\"? Yes, no, or maybe? 
Maybe\n###\nThe European Association of Science Editors (EASE ) is a non-profit membership organisation for people interested in science communication and editing. Founded in 1982, in France, EASE now has an international membership from diverse backgrounds, professional experiences, and job titles. Are we justified in saying that \"EASE is one of 3 major organizations founded in 1982\"? Yes, no, or maybe? Maybe\n###\nPhacelia coerulea is a species of phacelia known by the common name skyblue phacelia. It is native to the California and the Southwestern United States and northern Mexico, where it grows in desert and plateau habitat types, such as scrub and woodland. Are we justified in saying that \"Phacelia coerulea can only be found in Los Angeles\"? Yes, no, or maybe? Maybe\n###\nDaniel James Shellabarger (known as Daniel Suelo, or simply Suelo, and The Man Who Quit Money, born 1961) is an American simple living adherent who stopped using money in the autumn of 2000. He was born in Arvada, Colorado, a suburb of Denver, and currently lives part-time in a cave near Moab, Utah when he is not wandering the country. Are we justified in saying that \"Suelo was born in a cave.\"? Yes, no, or maybe?", "doc_id": 451, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25453, 4448, 6478, 25359], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Moody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group. Are we justified in saying that \"Moody 4B is not an instrumental album.\"? Yes, no, or maybe? No\n###\nThe Peoria Rivermen was a professional ice hockey team in the American Hockey League. They played in Peoria, Illinois, USA at the Carver Arena. On June 14, 2013, it was announced that the team would relocate to Utica, New York after the 2012\u201313 AHL season, and be known as the Utica Comets. Are we justified in saying that \"The Peoria Rivermen had a total of 23 hockey players on it.\"? Yes, no, or maybe? Maybe\n###\nJulian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality. Are we justified in saying that \"Julian Marley is the son of Bob Marley.\"? Yes, no, or maybe? Yes\n###\nThe San Pablo Reservoir is an open cut terminal water storage reservoir owned and operated by the East Bay Municipal Utility District (EBMUD). It is located in the valley of San Pablo Creek, north of Orinda, California and south of El Sobrante and Richmond, east of the Berkeley Hills between San Pablo Ridge and Sobrante Ridge. 
Are we justified in saying that \"The reservoir is for oil storage.\"? Yes, no, or maybe? No\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Malone is less than 53 years old\"? Yes, no, or maybe?", "doc_id": 279, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [367, 34486, 22097, 10160], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1902\u201303 Ottawa Hockey Club season was the club's 18th season of play. The club would win the CAHL championship in a playoff with the Montreal Victorias to win the Club's first Stanley Cup. For their win, the players would each be given a silver nugget. From that day forward, the club was nicknamed the \"Silver Seven.\" Are we justified in saying that \"the players were given a silver ngget cause that was the award that year\"? Yes, no, or maybe? Maybe\n###\nThe 711 Squadron \"\"Albatrozes\"\" (\"Esquadra 711\") was a flying squadron of the Portuguese Air Force. Its primary mission was Search and Rescue and it has had secondary missions tactical air transport and general air transport in the Azores archipelago. During the time it was active it was the only operational squadron in the Portuguese military to operate both rotary- and fixed-wing aircraft. Are we justified in saying that \"The 711 Squadron ran air based mission for the Portuguese Air Force.\"? Yes, no, or maybe? Yes\n###\nA conjectural portrait is a portrait made of a historical figure for whom no authentic contemporary portrait is available. The depiction, then, may be variously informed by written accounts of physical appearance, conjecture based on the subject's culture and background, and/or the artist's conception of the subject's inner essence. Are we justified in saying that \"A conjectural portrait requires lots of skills to make\"? Yes, no, or maybe? Maybe\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC. Are we justified in saying that \"SNOBOL5 was the final in a series of computer programming languages developed between 1962 and 1967.\"? Yes, no, or maybe? No\n###\nRJ Rockers Brewing Company is a beer brewing company based in Spartanburg, South Carolina, founded in 1997 by current owner/brewer, Mark R. Johnsen. The company is considered a microbrewery meaning it has an annual production of less than 15,000 barrels. Are we justified in saying that \" RJ Rockers Brewing Company produces more than 10,000 barrels of beer\"? 
Yes, no, or maybe?", "doc_id": 216, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41305, 37155, 41531, 8082], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Debra Hayward is a British film producer. As Head of Film at Working Title Films, Hayward frequently served as an executive producer for the company's feature films, working alongside fellow Working Title executive Liza Chasin. After producing \"Les Mis\u00e9rables\", she started her own production company; Monumental Pictures. Are we justified in saying that \"Debra Hayward speaks four languages.\"? Yes, no, or maybe? Maybe\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums). Are we justified in saying that \"Norwegian black metal bands are not popular these days.\"? Yes, no, or maybe? Maybe\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system. Are we justified in saying that \"Each program in ECSU has 2,500 seats.\"? Yes, no, or maybe? No\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"The Home Depot has changed ownership many times in the past\"? Yes, no, or maybe? Maybe\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships. Are we justified in saying that \"He represented Great Britain at the 1900 + 50 + 2 Summer Olympics\"? Yes, no, or maybe?", "doc_id": 322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7138, 18275, 5461, 37584], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "New Hampshire Route 27 (abbreviated NH 27) is a 37.621 mi long east\u2013west highway in southeastern New Hampshire. The western terminus of NH 27 is in Hooksett at U.S. Route 3 and New Hampshire Route 28 north of Manchester. The eastern terminus is in Hampton Beach at New Hampshire Route 1A, which runs along the New Hampshire coastline adjacent to the Atlantic Ocean. Are we justified in saying that \"New Hampshire Route 27 is a busy highway.\"? Yes, no, or maybe? Maybe\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani. Are we justified in saying that \"Dariush examined the bourgeoisie of Iran using a film, but Homayoun lead the film.\"? Yes, no, or maybe? Yes\n###\nWar/Dance is a 2007 American documentary film written and directed by Sean Fine and Andrea Nix Fine and produced by Shine Global's Susan MacLaury, a professor at Kean University, and Albie Hecht. It was nominated for the 2008 Academy Award for Best Documentary Feature and received the Emmy Award for Best Documentary and Best Cinematography in 2010. Are we justified in saying that \"War/Dance is a documentary that won a Tony Award in 2007. \"? Yes, no, or maybe? No\n###\n\"Aster\" (M915) is a Tripartite-class minehunter of the Belgian Naval Component, launched on 16 December 1985 at the Mercantile-Belyard shipyard in Rupelmonde and christened by Queen Paola of Belgium. The patronage of \"Aster\" was accepted by the city of Blankenberge. \"Aster\" was the first of the Belgian Tripartite-class minehunters. Are we justified in saying that \"\"Aster\" (M915) is a helicopter\"? Yes, no, or maybe? No\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th). Are we justified in saying that \"Alice Sue Claeys never finished in the top 3.\"? Yes, no, or maybe?", "doc_id": 362, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35686, 15515, 39943, 35072], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Art of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. 
Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley. Are we justified in saying that \"Art of Dying does not have a front man\"? Yes, no, or maybe? No\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock. Are we justified in saying that \"The Friant-Kern Canal is a 152 mi Central Valley Project aqueduct delivers water south to Bakersfield.\"? Yes, no, or maybe? Maybe\n###\nSaat Din Mohabbat In (English: \"Seven days in love\" ) is an upcoming Pakistani romantic drama film directed by Meenu-Farjad, produced by Dawn Films and IMGC Global Entertainment and written by Fasih Bari Khan. The film features Mahira Khan and Sheheryar Munawar in lead roles and is also their second mutual film after \"Ho Mann Jahaan\". Are we justified in saying that \"Saat Din Mohabbat will screen in Pakistani theaters\"? Yes, no, or maybe? Maybe\n###\nThe Sea Wall (French: Un barrage contre le Pacifique ) is a 2008 film by Cambodian director Rithy Panh in a French/Cambodian/Belgian co-production. The film opened on 7 January 2009 in France. It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. The novel had previously been adapted as \"This Angry Age\" by Ren\u00e9 Cl\u00e9ment in 1958. Are we justified in saying that \"It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras but it wasn't a very good movie.\"? Yes, no, or maybe? Maybe\n###\nAfter Dark is a brand of Indian whisky, manufactured by Radico Khaitan. The whisky was test marketed in 2010, and rolled out nationwide in India by September 2011. It is a 100% grain-based whisky manufactured at Radico's Rampur distillery. It is available in 750ml, 375ml and 180ml bottles. The brand's tagline is \"One Life, Many Passions...Why wait\". Are we justified in saying that \"After Dark will make you tipsy.\"? Yes, no, or maybe?", "doc_id": 151, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22310, 39002, 17473, 34046], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House. Are we justified in saying that \"Pata Nahi Rabb Kehdeyan Rangan Ch Raazi was translated in 5 different languages\"? Yes, no, or maybe? 
Maybe\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"They became a well known band in England.\"? Yes, no, or maybe? Maybe\n###\nSpittal is a hamlet or small village in East Lothian, Scotland, UK, on the B1377, east of Longniddry, south-south-west of Aberlady and to the west of Garleton and north of Gladsmuir. It is close to both Redhouse Castle, Gosford House and Spittal House. Are we justified in saying that \"Spittal is a very large village in Scotland.\"? Yes, no, or maybe? No\n###\nDavid Gregory \"Dave\" Smith (born 24 July 1955) is a retired male race walker from Australia, who represented his native country at two consecutive Summer Olympics, starting in 1980 (Moscow). His best Olympic result was finishing in tenth place in the men's 20\u00a0km race at the 1984 Summer Olympics. Are we justified in saying that \"A racer born in 1975 finished tenth in the 1984 olympic 20 km men's race.\"? Yes, no, or maybe? No\n###\nAhmad Jovdat Ismayil oglu Hajiyev (June 18, 1917 - January 18, 2002) was one of the major Azerbaijani composers of the Soviet period. He is remembered for his monumental orchestral works, having been the first Azerbaijani to compose a symphony (1936). He studied under Azerbaijan's Founder of Composed Music, Uzeyir Hajibeyov and under Russian composer Dmitri Shostakovich. Are we justified in saying that \"Uzeyir Hajibeyov was the first composer to compose a musical in Russia.\"? Yes, no, or maybe?", "doc_id": 983, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15250, 23821, 32564, 13231], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier. Are we justified in saying that \"Forest Friends has aired on multiple networks.\"? Yes, no, or maybe? Yes\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s. Are we justified in saying that \"The private, commercial structure was placed on the National Register of Historic Places more than 1980 days ago.\"? Yes, no, or maybe? Yes\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. 
The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name. Are we justified in saying that \"Benedict is referenced as Jehovah blessed the Egyptian\u2019s house for Joseph\u2019s sake\"? Yes, no, or maybe? Maybe\n###\nKim Won-sik (Hangul:\u00a0\uae40\uc6d0\uc2dd , born February 15, 1993) better known by his stage name Ravi (Hangul:\u00a0\ub77c\ube44 ), is a South Korean rapper, singer-songwriter, producer, signed under Jellyfish Entertainment. He is a member of the South Korean boy group VIXX and VIXX sub-unit VIXX LR. He debuted as a solo artist on January 9, 2017, with the release of his debut mini album \"R.EAL1ZE\". Are we justified in saying that \"Kim Won-sik is a 50s baby\"? Yes, no, or maybe? No\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer. Are we justified in saying that \"In 2002, Capello produced music in the bad Los Fabulosos.\"? Yes, no, or maybe?", "doc_id": 332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6012, 19479, 11180, 5380], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Painted Thin was a Canadian hardcore punk band, formed in Winnipeg, and active from 1993 to 1999. The core of the band consisted of vocalist and guitarist Stephen Carroll and bassist and vocalist Paul Furgale, with a variety of guest musicians, including James Ash, Dan McCafferty and Jason Tait, on individual recordings. Are we justified in saying that \"Painted Thin played internationally\"? Yes, no, or maybe? Maybe\n###\nA sodium bicarbonate rocket (sometimes called an Alka-Seltzer rocket) is a model rocket fashioned from a 35mm film canister and propelled by the pressure of a gas, often carbon dioxide, generated from the reaction of an acid with sodium bicarbonate. Sodium bicarbonate rockets are often used in science classes to demonstrate principles of chemistry and physics. Are we justified in saying that \"Sodium bicarbonate are also as model to show to students some principle. \"? Yes, no, or maybe? Yes\n###\nHyde, Jekyll, Me () is a 2015 South Korean television series starring Hyun Bin and Han Ji-min. It is based on Lee Choong-ho's webtoon \"Dr. Jekyll Is Mr. Hyde\" (), which gave a romantic comedy spin on the literary character. The series aired on SBS from January 21 to March 26, 2015 on Wednesdays and Thursdays at 21:55 for 20 episodes. Are we justified in saying that \"People in South Korea watch more television during the winter months.\"? Yes, no, or maybe? Maybe\n###\nLiving on the Edge (full title Living on the Edge, the Poetic Works of G\u00e9rald Leblanc also known by its French language title L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc) is a 2005 documentary film by Canadian director of Acadian origin Rodrigue Jean. 
In this documentary, Rodrigue Jean pays tribute to his Acadian roots, focussing on the poetry of G\u00e9rald Leblanc. Are we justified in saying that \"L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc is the french title of Living On The Edge.\"? Yes, no, or maybe? Yes\n###\nStannis Baratheon is a fictional character in the \"A Song of Ice and Fire\" series of epic fantasy novels by American author George R. R. Martin, and its television adaptation \"Game of Thrones\". He is the second son of Steffon Baratheon, the lord of Storm's End, and his wife Lady Cassana Estermont, and brother to Robert and Renly. Are we justified in saying that \"Stannis Baratheon is the son of Cassana Estermont\"? Yes, no, or maybe?", "doc_id": 704, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27478, 35670, 39555, 19967], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gilford is a town in Belknap County, New Hampshire, United States. The population was 7,126 at the 2010 census. Situated on Lake Winnipesaukee, Gilford is home to Governors Island, Ellacoya State Beach, Belknap Mountain State Forest, Gunstock Mountain Ski Resort, and Bank of New Hampshire Pavilion at Meadowbrook, a seasonal outdoor concert venue. Are we justified in saying that \"Gilford is a town in Belknap County, New Hampshire, United States had four people less than 7130 living there according to the the census taken at the end of the first decade of the twenty-first century.\"? Yes, no, or maybe? Yes\n###\nAnime Speed is a megamix compilation album of \"Dancemania\"'s \"Speed\" series, released by EMI Music Japan in 2005. The album features uptempo cover remixes of popular theme songs for various anime works such as \"Dragon Ball Z\", \"Slam Dunk\" and \"Neon Genesis Evangelion\". The successor, \"Anime Speed Newtype Edition\", was released in 2006. Are we justified in saying that \"There have been several more releases by EMI Music Japan of Dancemania's anime remix albums since 2006.\"? Yes, no, or maybe? Maybe\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Edmund Golding lived in New York City.\"? Yes, no, or maybe? Maybe\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013. Are we justified in saying that \"Brandon Tyler McManus is over 20 years old\"? Yes, no, or maybe? 
Yes\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall. Are we justified in saying that \"Tuancheng Fortress is in the Northern Hemisphere.\"? Yes, no, or maybe?", "doc_id": 330, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17630, 40941, 44063, 22639], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Manos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Greek modeling agency since 2004. He participated in several international film projects as a lead main actor. Are we justified in saying that \"Manos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Americanmodeling agency since 2004.\"? Yes, no, or maybe? No\n###\nDonald Joseph Stanhouse (born February 12, 1951 in Du Quoin, Illinois) is a retired baseball pitcher who had a ten-year major league career from 1972 to 1980, 1982. He played for the Texas Rangers and Baltimore Orioles of the American League and the Montreal Expos and Los Angeles Dodgers of the National League. Are we justified in saying that \"Donald Joseph Stanhouse has never worn cleats\"? Yes, no, or maybe? Maybe\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Tinker Field is being rebuilt\"? Yes, no, or maybe? Maybe\n###\nRoland Buerk (born 1973), was a journalist working for the BBC. He was the Tokyo Correspondent for BBC News and is best known for his coverage of the 2011 T\u014dhoku earthquake and tsunami. He is the son of former BBC newsreader and current BBC Radio 4 presenter Michael Buerk. He left the BBC in mid-2012, to work for Nissan in the United Arab Emirates. Are we justified in saying that \"Roland Buerk was a journalist working and typing for the BBC.\"? Yes, no, or maybe? 
Maybe\n###\nPeter Joseph Wysocki (October 3, 1948 \u2013 June 14, 2003) was an American football linebacker who played his entire six-year career with the Washington Redskins from 1975 to 1980 in the National Football League (NFL). Wysocki previously played four seasons in the Canadian Football League (CFL) for the Hamilton Tiger-Cats, Toronto Argonauts and Saskatchewan Roughriders. Are we justified in saying that \"Peter Joseph Wysocki played for more than 3 teams\"? Yes, no, or maybe?", "doc_id": 990, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7612, 34409, 13574, 20173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999. Are we justified in saying that \"Daoud Abdel Sayed is an American director\"? Yes, no, or maybe? No\n###\nMelinda Heather \"Mindy\" Cohn (born May 20, 1966) is an American actress, voice actress, comedian and singer. She is known for her role as Natalie Green, the student of Edna Garrett (played by Charlotte Rae) in the long-running sitcom \"The Facts of Life\", and for being the voice of Velma Dinkley in the \"Scooby-Doo\" franchise from 2002 to 2015. Are we justified in saying that \"Melinda heather is an actress who was born in America before May 20,1966.\"? Yes, no, or maybe? No\n###\nThe 1934 Australian Grand Prix was a motor race held at the Phillip Island circuit in Victoria, Australia on 19 March 1934. The 200 mile race, which was organised by the Light Car Club of Australia, was the seventh Australian Grand Prix. Contested as a handicap race, it was won by Bob Lea-Wright, driving a Singer 9 Le Mans. Are we justified in saying that \"The 1934 Australian Grand Prix was less than 50 miles long.\"? Yes, no, or maybe? No\n###\nA Merry Friggin' Christmas is a 2014 American black comedy film directed by Tristram Shapeero and written by Phil Johnston. The film stars an ensemble cast featuring Joel McHale, Lauren Graham, Clark Duke, Oliver Platt, Wendi McLendon-Covey, Tim Heidecker, Candice Bergen and Robin Williams. The film was released on November 7, 2014, by Phase 4 Films. Are we justified in saying that \"A Merry Friggin Christmas has an all star cast.\"? Yes, no, or maybe? Maybe\n###\nThe Angel on the Roof: The Stories of Russell Banks (2000) is a collection of short stories by Russell Banks. It consists of a total of thirty-one previously published stories, including twenty-two stories that appeared in earlier short story collections, along with nine that were previously uncollected. Are we justified in saying that \"The Angel on the Roof consists of 30 published stories \"? 
Yes, no, or maybe?", "doc_id": 60, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31694, 22957, 17503, 27154], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staunton Mall is very expensive to maintain\"? Yes, no, or maybe? Maybe\n###\nI Am That Change is a 2014, Telugu short film directed by Sukumar and produced by actor Allu Arjun on Geetha Arts. Apart from Allu Arjun, the short film features an ensemble cast of Tanisshq Reddy, Vikram Chaitanya, Surya Ashrith, Trisha, Sri Varshini, Bharath Reddy and Sathish. Sai Karthik is the music director and Amol Rathod is the cinematographer while Prawin Pudi is the editor. Are we justified in saying that \"I Am That Change has been seen by Zack.\"? Yes, no, or maybe? Maybe\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues. Are we justified in saying that \"Opinions on politics are dangerous.\"? Yes, no, or maybe? Maybe\n###\nLiberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD). Are we justified in saying that \"The president of ACL was Jose Maria de Areilza.\"? Yes, no, or maybe? Yes\n###\nThe Enlistment Act 1970 is a statute of the Parliament of Singapore that caters for the enlistment of persons in the Singapore Armed Forces. The law repeals the Singapore Army Act and People\u2019s Defence Force Act of 1965 and is designed specifically to subject enlisted personnel under military law during the period of enlistment and service. Are we justified in saying that \"The Enlistment Act 1970 repeals the previous law and enables the Singapore army to breathe underwater\"? Yes, no, or maybe?", "doc_id": 501, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23148, 39923, 22477, 30919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH). Are we justified in saying that \"a lot of people think the IDH is bogus\"? Yes, no, or maybe? Maybe\n###\n\"I'm Living in Two Worlds\" is a song written by Jan Crutchfield, which was recorded and released by American country artist Bonnie Guitar. The song reached number nine on the \"Billboard\" Hot Country Singles chart and number ninety-nine on the \"Billboard\" Hot 100 in early 1966. \"I'm Living in Two Worlds\" became Guitar's first Country top-ten single and her first charting single since 1959. Are we justified in saying that \"jan clutchfield song was recorded,edited and released by guitar\"? Yes, no, or maybe? Maybe\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani. Are we justified in saying that \"The Pear Tree is a movie about the Iranian bourgeoisie \"? Yes, no, or maybe? Yes\n###\nKJEF-CA, channel 13, was a class A television station in Jennings, Louisiana. Owned by Townsquare Media, the station was an independent station. It was the only television station owned by Townsquare, a company that otherwise specializes exclusively in radio. Are we justified in saying that \"KJEF-CA had 500 employees\"? Yes, no, or maybe? Maybe\n###\nJames Conroy (born February 6, 1977) is an American voice actor, television writer and actor. He is known for appearing on television shows, such as \"Celebrity Deathmatch\", \"Kenny the Shark\" and \"Fetch! with Ruff Ruffman\", radio commercials and video games. He worked for companies such as WGBH, The Walt Disney Company and Discovery Channel. Are we justified in saying that \"James Conroy is currently dead.\"? Yes, no, or maybe?", "doc_id": 374, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2848, 45319, 21012, 6090], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "St. Ives Town F.C. is a football club based in St Ives, Cambridgeshire, England. They play in the Southern League Premier Division. This St Ives Town should not be confused with the Cornwall Combination team playing in St Ives, Cornwall, which is also called St Ives Town F.C. Are we justified in saying that \"St Ives is a popular name for football. \"? Yes, no, or maybe? 
Maybe\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers. Are we justified in saying that \"McColo was open in 2009.\"? Yes, no, or maybe? No\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"nobody lives in st clements\"? Yes, no, or maybe? No\n###\nThe 1997 Indian vice-presidential election was held on 16 August 1997 to elect Vice-President of India. Krishan Kant defeated Surjit Singh Barnala to become 10th Vice-President of India. At the time of the election, VP office was vacant since the incumbent, K. R. Narayanan, had already inaugurated as President following his victory in the presidential election. Are we justified in saying that \"The 1997 Indian vice-presidential election elected the 8th vp\"? Yes, no, or maybe? No\n###\nThe 2007 Internazionali BNL d'Italia was the 2007 edition of the Rome Masters tennis tournament. The men's tournament was part of the 2007 ATP Masters Series and was held on May 5-13. The women's event was a 2007 WTA Tier I Series event and was held on May 13-20. Are we justified in saying that \"The 2007 Internazionali BNL d'Italia occurred in the Southern hemisphere\"? Yes, no, or maybe?", "doc_id": 989, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41171, 5197, 29888, 14468], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Eglinton Castle estate was situated at Irvine, on the outskirts of Kilwinning, North Ayrshire, Scotland (map reference NS 3227 42200) in the former district of Cunninghame. Eglinton Castle, was once home to the Montgomeries, Earls of Eglinton and chiefs of the Clan Montgomery. Eglinton Country Park now occupies part of the site. Are we justified in saying that \"The Eglinton Castle estate was once home to the famous Montgomeries.\"? Yes, no, or maybe? Yes\n###\nAndrea Louise Riseborough (born 20 November 1981) is an English stage and film actress. Her film appearances include \"Birdman or (The Unexpected Virtue of Ignorance)\", \"Oblivion\", \"Welcome to the Punch\", \"Disconnect\", \"Shadow Dancer\", \"W.E.\", \"Brighton Rock\", \"Made in Dagenham\", \"Never Let Me Go\", \"Happy-Go-Lucky\", and \"Venus\". Are we justified in saying that \"Andrea Louise Riseborough has played in less than 8 films.\"? Yes, no, or maybe? No\n###\nEllon Castle is a scheduled monument within the town of Ellon, Aberdeenshire. Only ruins survive of the 16th-century structure that may incorporate sections from the 15th century together with 18th-century renovations. 
The ruins form a focal point in a formal 6 acre garden planted in 1745; an older Category A listed sundial dating from c. 1700 forms the centrepiece to the garden. Are we justified in saying that \"Ellon Castle has been seen by Trump.\"? Yes, no, or maybe? Maybe\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel. Are we justified in saying that \"Kasey Peters was also a member of Tr-Cities Fever.\"? Yes, no, or maybe? Yes\n###\nFifth Harmony is the self-titled third studio album by American girl group Fifth Harmony, released on August 25, 2017, through Syco Music and Epic Records. Its lead single, \"Down\", which features rapper Gucci Mane, was released on June 2, 2017. It is the group's first album following the departure of Camila Cabello in December 2016. Are we justified in saying that \"The girl group Fifth Harmony formed in two thousand eleven.\"? Yes, no, or maybe?", "doc_id": 612, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22322, 6391, 18290, 23586], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "On 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The man had depression.\"? Yes, no, or maybe? Maybe\n###\nState Route 360 (SR 360) is a state highway in the southern portion of Mineral County, Nevada, United States. The route connects the former town of Basalt to the rest of Mineral County. A road has been in the place of SR 360 since 1919, and became State Route 10 by 1929. Are we justified in saying that \"Basalt is a populous town\"? Yes, no, or maybe? Maybe\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc. Are we justified in saying that \" Ranila is a city in the Charkhi Dadri district of the Indian state of Haryana. \"? Yes, no, or maybe? No\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day. Are we justified in saying that \"Tunnel Vision has been read by Carla.\"? Yes, no, or maybe? 
Maybe\n###\nTakeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and EMI Music Publishing. It was founded by English rapper Kwasi Danquah III (commonly known as Tinchy Stryder) and EMI Music Publishing\u2019s UK president and EMI European creative president, Guy Moot, as a publishing arm solely for Danquah's music in December 2008. Are we justified in saying that \"Takeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and one other subsidiary.\"? Yes, no, or maybe?", "doc_id": 87, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11265, 13422, 27289, 25960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Matoian (born 1949) is a businessman and television industry executive. He was a vice-president of the CBS Entertainment division. He later became the president of Entertainment at Fox Broadcasting in September 1995. In 1996 he became the president of HBO. Are we justified in saying that \"John Matoian became vice-president at HBO in 1996.\"? Yes, no, or maybe? No\n###\nCastle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel. Are we justified in saying that \"Castle Wolfenstein was released during the 20th century\"? Yes, no, or maybe? Yes\n###\nO'Donnell High School is a 1A high school located in O'Donnell, Texas (USA). It is part of the O'Donnell Independent School District located in southeast Lynn County. In 2011, the school was rated \"Academically Acceptable\" by the Texas Education Agency. Are we justified in saying that \"O'Donnell High School is a 1A school.\"? Yes, no, or maybe? Yes\n###\nIn poker the term Triple Crown is used for winning a poker title on the three major poker tours: The World Series of Poker (WSOP), World Poker Tour (WPT) and up to 2016 the European Poker Tour (EPT). Since the EPT has been discontinued and rebranded as the PokerStars Championship, those wins are considered part of the crown. Are we justified in saying that \"Triple Crown is a poker term referred to winning the title in two major tours\"? Yes, no, or maybe? No\n###\n\"White as Snow\" is a song by Irish rock band U2 and the ninth track on their 2009 album \"No Line on the Horizon\". It was written from the perspective of a dying soldier serving in Afghanistan, and lasts the length of time it takes him to die. The track is based on the hymn \"Veni, veni Emmanuel\", and is the only political song on the album. Are we justified in saying that \"\"White as Snow\" is the only political song by U2.\"? 
Yes, no, or maybe?", "doc_id": 393, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41305, 15863, 25902, 11312], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Debra Hayward is a British film producer. As Head of Film at Working Title Films, Hayward frequently served as an executive producer for the company's feature films, working alongside fellow Working Title executive Liza Chasin. After producing \"Les Mis\u00e9rables\", she started her own production company; Monumental Pictures. Are we justified in saying that \"Debra Hayward speaks four languages.\"? Yes, no, or maybe? Maybe\n###\nGrantham North Services is a service area operated by Moto located on the A1 at Gonerby Moor Roundabout, four miles north of Grantham in Lincolnshire, England. The service station has a main car park and coach/lorry park, off which is a BP petrol station. Are we justified in saying that \"BP is the only business near Grantham North Services.\"? Yes, no, or maybe? Maybe\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships. Are we justified in saying that \"Angus Scott (16 August 127 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events.\"? Yes, no, or maybe? No\n###\nBuilth Castle (Welsh: \"Castell Llanfair-ym-Muallt\" ) was a castle built under King Edward I, just outside Builth Wells, Powys, Wales. At one time it was an impressive stone-built castle but all the masonry has been removed over the years and all that remains are the mound on which it stood, the ditches and embankments. Are we justified in saying that \"Builth Castle is a tourist attraction\"? Yes, no, or maybe? Maybe\n###\nThe discography of Death, a metal band, consists of seven studio albums and four live albums. Death was an American metal band founded in 1983. The band's founder, Chuck Schuldiner, is considered \"a pioneering force in death metal and grindcore\". The band ceased to exist after Schuldiner died of brain cancer in 2001, though it remains an enduring metal brand. Are we justified in saying that \"Schuldiner died from complications of chemotherapy.\"? Yes, no, or maybe?", "doc_id": 243, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42711, 9211, 17890, 1774], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Timothy Donald Cook (born November 1, 1960) is an American business executive, industrial engineer, and developer. Cook is the Chief Executive Officer of Apple Inc., previously serving as the company's Chief Operating Officer, under its founder Steve Jobs. Are we justified in saying that \"Timothy Donald Cook passed away last year.\"? Yes, no, or maybe? No\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521. Are we justified in saying that \"The City of Port Phillip is in the south and in the south-west is Port Melbourne.\"? Yes, no, or maybe? Yes\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris. Are we justified in saying that \"Vampire Diaries had an episode written by a woman. \"? Yes, no, or maybe? Yes\n###\n\"Touch Me With Your Love\" is a song by Beth Orton, released as the fourth single from 1996 album \"Trailer Park\". It contains 4 songs, and was released on C.D. and vinyl. The release peaked at #60 in the UK official singles chart. It was also released in Australia with a different track listing, and was the first release by Orton to have a promotional video made for it. Are we justified in saying that \"trailer park was only released on vinyl\"? Yes, no, or maybe? No\n###\nThe position of South African ambassador to the United States is the most prestigious and top diplomatic post in South Africa. The position was first held in March 1949, following the upgrade of South Africa's diplomatic mission to an embassy. The post has been held by many important politicians and is currently held by M. J. Mahlangu. Are we justified in saying that \"South African Ambassador was first held while Taft was president of the United States.\"? Yes, no, or maybe?", "doc_id": 268, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32658, 40886, 42473, 25256], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Williston Municipal Airport (FAA LID: X60) is a city-owned, public-use airport located two\u00a0nautical miles (4\u00a0km) southwest of the central business district of Williston, a city in Levy County, Florida, United States. Commonly referred to as Williston Airport, it is located 23 mi southwest of Gainesville Regional Airport (GNV). 
Opened in 1974 for public use, it does not have a control tower. Are we justified in saying that \"Williston Municipal Airport is in a state of disrepair\"? Yes, no, or maybe? Maybe\n###\nCarl Filip Anton Forsberg (] ; born 13 August 1994) is a Swedish professional ice hockey player. He is an alternate captain for the Nashville Predators of the National Hockey League (NHL). Forsberg was selected by the Washington Capitals in the first round (11th overall) of the 2012 NHL Entry Draft. Are we justified in saying that \"Carl Filip Anton Forsberg was selected 3rd overall in the 2011 NHL Entry Draft, but didn't sign a contract with the team that drafted him\"? Yes, no, or maybe? No\n###\nThe Little Girl Next Door is a 1912 American silent short drama directed by Lucius Henderson and written by Philip Lonergan. The film starred William Garwood and Marguerite Snow in the lead roles. Prints of the film are in the Library of Congress and other collections. Are we justified in saying that \"Many prints of the Little Girl Next Door are in the Library of Congress.\"? Yes, no, or maybe? Yes\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017. Are we justified in saying that \"King Gizzard & the Lizard Wizard were great friends\"? Yes, no, or maybe? Maybe\n###\nZero to One: Notes on Startups, or How to Build the Future is a 2014 book (release date September 16, 2014) by venture capitalist, PayPal co-founder, and early Facebook investor Peter Thiel along with Blake Masters. It is a condensed and updated version of a highly popular set of online notes taken by Masters for the CS183 class on startups taught by Thiel at Stanford University in Spring 2012. Are we justified in saying that \"The full name of the 2014 book by Peter Thiel and Blake Masters is called Notes on Startups, or How to Build the Future.\"? Yes, no, or maybe?", "doc_id": 987, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8172, 4418, 26078, 26820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maris Soule (born June 19, 1939) is an American author of romance and romantic suspense novels, mysteries, and short stories. Her latest book, \"Eat Crow and Die\", is a mystery novel. Her books feature a variety of settings and situations, including the Iditarod Trail Sled Dog Race, Search and Rescue dogs, barrel racing, dressage, and a Rhodesian Ridgeback puppy. Are we justified in saying that \"Maris Soule was a South American author.\"? Yes, no, or maybe? No\n###\nThe Emami Kolkata Open ATP Challenger Tour (formerly known as State Bank of India ATP Challenger Tour) is a professional tennis tournament played on outdoor hard courts. It is currently part of the Association of Tennis Professionals (ATP) Challenger Tour. 
It is held annually at the Bengal Tennis Association Stadium in Kolkata, India since 2014. Are we justified in saying that \"The tour was played in 2018.\"? Yes, no, or maybe? Yes\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event. Are we justified in saying that \"The 1997 Porsche Tennis Grand Prix took place in 1995\"? Yes, no, or maybe? No\n###\nThe UK Overseas Territories Conservation Forum (UKOTCF) is a UK-based non-governmental organisation which promotes coordinated conservation in the UK Overseas Territories and Crown Dependencies (UKOTs and CDs). It is a not-for-profit organisation supported by grants, donations and subscriptions, and a registered charity and company. Are we justified in saying that \"UKOTCF is not based in france.\"? Yes, no, or maybe? Yes\n###\nBest of 4Minute is the first Japanese compilation album by the South Korean girl group 4Minute. It is composed of all the Japanese tracks released by the group since their debut in Japan. It was released on September 26, 2012 in three different editions: 2 limited CD+DVD (Type A with a live event and Type B with all Japanese music videos) and a Regular edition. Are we justified in saying that \"Subsequent to their debut in Japan, 4Minute eventually released three editions of their Japanese compilation album.\"? Yes, no, or maybe?", "doc_id": 819, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8790, 26659, 42824, 3383], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Canyons is a 2013 American erotic thriller-drama film directed by Paul Schrader and written by Bret Easton Ellis. The film is set in Los Angeles and stars Lindsay Lohan, James Deen, Nolan Funk, Amanda Brooks, and Gus Van Sant. It received a limited release on August 2, 2013 at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms. Are we justified in saying that \"People at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms watched a limited release of a film called The canyons.\"? Yes, no, or maybe? Yes\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo. Are we justified in saying that \"Celly Cel is a very proliffic rapper\"? Yes, no, or maybe? Maybe\n###\nThe NBA Finals is the championship series of the National Basketball Association (NBA). 
The entrants are determined by the victors of the Eastern and Western conferences, who engage in a best-of-seven game series to determine the league champion. The winners of the Finals are awarded the Larry O'Brien Championship Trophy, which replaced the Walter A. Brown Trophy in 1983. Are we justified in saying that \"the entrants will play 7 games\"? Yes, no, or maybe? Yes\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird. Are we justified in saying that \"The North African ostrict or red-necked ostrich is the largest bird in Noth Africa that can fly.\"? Yes, no, or maybe? Maybe\n###\nArthur Charles Valerian Wellesley, 9th Duke of Wellington, (born 19 August 1945) is a British aristocrat and politician. He has served as Conservative Party Member of the European Parliament (1984\u20131989) for Surrey West and currently sits as a hereditary peer in the House of Lords since 2015. Are we justified in saying that \"The House of Lords has been around since 1945.\"? Yes, no, or maybe?", "doc_id": 331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28918, 32239, 17823, 17708], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frederick Wiseman (born January 1, 1930) is an American filmmaker, documentarian, and theatre director. His work is \"devoted primarily to exploring American institutions\". He has been called \"one of the most important and original filmmakers working today\". Are we justified in saying that \"Frederick Wiseman is disliked by some people\"? Yes, no, or maybe? Maybe\n###\nJunoon (Hindi: \u091c\u0941\u0928\u0942\u0928, translation: \"The Obsession\") is a 1978 Indian Hindi language film produced by Shashi Kapoor and directed by Shyam Benegal. The film is based on Ruskin Bond's fictional novella, \"A Flight of Pigeons\", set around the Indian Rebellion of 1857. The film's soundtrac was composed by Vanraj Bhatia, and cinematography by Govind Nihalani. Are we justified in saying that \"A flight of pigeons is a multi part novella\"? Yes, no, or maybe? Maybe\n###\nCari Elizabeth Roccaro (born July 18, 1994) is an American soccer defender from East Islip, New York. She currently plays for the United States under-20 women's national soccer team and helped the team win the 2012 FIFA Under-20 Women's World Cup held in Tokyo, Japan. She previously played for the New York Fury in the WPSL Elite. Are we justified in saying that \"Cari Elizabeth Roccaro is 30 years old this year.\"? Yes, no, or maybe? No\n###\nGreat Balls of Fire! is a 1989 American biographical film directed by Jim McBride and starring Dennis Quaid as rockabilly pioneer Jerry Lee Lewis. Based on a biography by Myra Lewis and Murray M. Silver Jr., the screenplay is written by McBride and Jack Baran. The film is produced by Adam Fields, with executive producers credited as Michael Grais, Mark Victor, and Art Levinson. 
Are we justified in saying that \"Great Balls of Fire! is a film.\"? Yes, no, or maybe? Yes\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels. Are we justified in saying that \"Louis Marson played in baseball legal as an outfielder.\"? Yes, no, or maybe?", "doc_id": 961, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2752, 15267, 40427, 25846], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Valentine is a 2001 American slasher film directed by Jamie Blanks, and starring Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. Loosely based on the novel of the same name by Tom Savage, the film follows a group of women in San Francisco who are stalked by a man whom they tormented during their childhood. Are we justified in saying that \"Valentine will have a remake.\"? Yes, no, or maybe? Maybe\n###\nHolly Weber (born September 20, 1984) is an American glamour model and actress. As a model, she has appeared in \"Maxim\", \"FHM\", \"Muscle & Fitness\", \"Glamour\", and as no. 66 on AskMen's Top 99 Most Desirable Women of 2009. She has made uncredited appearances in a number of movies and TV series. Are we justified in saying that \"Holly appeared in other shows that were similar to \"Maxim\"\"? Yes, no, or maybe? Maybe\n###\nStephen Tyrone Colbert ( , ; born May 13, 1964) is an American comedian, television host, actor, and writer. He is best known for hosting the satirical Comedy Central program \"The Colbert Report\" from 2005 to 2014, and hosting the CBS talk program \"The Late Show with Stephen Colbert\" beginning in September 2015. Are we justified in saying that \"Stephen Colbert, born May 13, 1954, became famous as a result of hosting the satirical ABC talk program, \"The Colbert Report\".\"? Yes, no, or maybe? No\n###\nVampire Vs Vampire (\u4e00\u7709\u9053\u4eba) is a 1989 Hong Kong comedy horror film directed by and starring Lam Ching-ying. The title references the interaction in the film between a jiangshi child, a creature from Chinese \"hopping\" corpse fiction, and a British vampire based on Western vampire fiction. Are we justified in saying that \"Vampire Vs Vampire is a film made in 1345\"? Yes, no, or maybe? No\n###\nThe Oakland County Child Killer (OCCK) is an unidentified serial killer responsible for the murders of four or more children, two girls and two boys, in Oakland County, Michigan, United States in 1976 and 1977. Several theories and suspects have been named in the case, but despite all these theories, the cases remain unsolved and the killer(s) have never been identified. Are we justified in saying that \"There was more than one person involved in the OCCK murders.\"? 
Yes, no, or maybe?", "doc_id": 145, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15186, 14417, 5420, 3076], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi. Are we justified in saying that \"An Indian crime film was inspired by the movie, \"The Godfather\", which was re-released 5 years later, but dubbed in Tamil.\"? Yes, no, or maybe? Yes\n###\nBen Barzman (October 12, 1910 \u2013 December 15, 1989) was a Canadian journalist, screenwriter, and novelist, blacklisted during the McCarthy Era and best known for his screenplays for the films \"Back to Bataan\" (1945), \"El Cid\" (1961), and \"The Blue Max\" (1966). Are we justified in saying that \"Ben Barzman was born more than 200 years ago.\"? Yes, no, or maybe? No\n###\nOgallala is a city in Keith County, Nebraska, United States. The population was 4,737 at the 2010 census. It is the county seat of Keith County. In the days of the Nebraska Territory, the city was a stop on the Pony Express and later along the transcontinental railroad. The Ogallala Aquifer was named after the city. Are we justified in saying that \"Ogallala is a city that was once a stop for the transcontinental railroad.\"? Yes, no, or maybe? Yes\n###\nAdam Best is a fictional character from the BBC soap opera \"EastEnders\", played by David Proud, the first adult actor with a visible disability to appear regularly in the soap. Both Proud and his character live with spina bifida. The character made his first appearance in the episode broadcast on 10 September 2009 and his last in the one broadcast on 19 July 2010. Are we justified in saying that \"david proud have a spina bifida\"? Yes, no, or maybe? Yes\n###\nTaki's Magazine, called \"Takimag\" for short, is an online magazine of politics and culture published by the Greek paleoconservative journalist and socialite Taki Theodoracopulos and edited by his daughter Mandolyna Theodoracopulos. Initially called Taki's Top Drawer, the site was redesigned and relaunched under its current title in March 2008 with a subsequent redesign in 2010. Are we justified in saying that \"Takimag was redesigned in the fourth month of 2008.\"? Yes, no, or maybe?", "doc_id": 207, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19237, 1440, 2457, 38838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "After the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal. Are we justified in saying that \"The Empire of Japan invaded and occupied the Northeast over 10 Years ago.\"? Yes, no, or maybe? Yes\n###\nConcrete Sky was the second EP released by Beth Orton, with the lead track taken from her 2002 album \"Daybreaker\". It contains four songs, and was released on CD. \"Concrete Sky\" features vocals and guitar from Ryan Adams, and was written by Beth Orton and Johnny Marr. Are we justified in saying that \"Ryan Adams is a musician. \"? Yes, no, or maybe? Yes\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg. Are we justified in saying that \"Marie Hedwig Auguste of Sulzbach was forced by her family to marry the Archduke of Austria thereby becoming an archduchess and cementing German ties with Austria.\"? Yes, no, or maybe? Maybe\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"Jim Mooty was named Most Valuable Player along with Maxie Baughan but people thought it should have been someone else.\"? Yes, no, or maybe? Maybe\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\". Are we justified in saying that \"The Brown Spectator is a student-run journal that is printed on brightly colored paper.\"? Yes, no, or maybe?", "doc_id": 455, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4120, 27675, 13765, 2305], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Reckless is the third book in the The It Girl novels by the German American author Cecily von Ziegesar. The series is ghostwritten from the original idea by Ziegesar. 
The series, aimed toward young adults, is a spin-off from the bestselling \"Gossip Girl\" series. It was released in 2006 by Little, Brown. Are we justified in saying that \"Cecily von Ziegesar did not write Reckless.\"? Yes, no, or maybe? Yes\n###\nAllen West (born October 17, 1967, Brandon, Florida) is an American death metal guitarist who has been a member of Massacre, Obituary, Six Feet Under, Lowbrow, and Southwicked. He is considered to be a pioneering figure of the death metal genre in the 1980s. Are we justified in saying that \"Allen West is not well known today\"? Yes, no, or maybe? Maybe\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home. Are we justified in saying that \"Netflix will make an adaptation for Coriolano: eroe senza patria in the future\"? Yes, no, or maybe? Maybe\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart. Are we justified in saying that \"The song \"Look at My Dab\" originally had another name.\"? Yes, no, or maybe? Yes\n###\nThe San Diego Chargers announced their 40th Anniversary Team in 2000 to honor the top players and coaches in the history of the National Football League team. The Chargers began play in 1960 as part of the American Football League. The anniversary team included 31 players and coaches voted on by fans and a media panel. The team became the Los Angeles Chargers after relocating in 2017. Are we justified in saying that \"The 40th anniversary team mostly lived in Los Angeles.\"? Yes, no, or maybe?", "doc_id": 84, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18543, 18760, 22691, 27597], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerry Kupcinet is a five-time Emmy winning director and producer. Kupcinet has directed shows such as \"Judge Judy\", \"Judge Joe Brown\", \"20/20\", \"Home\", \"That's Incredible!\", The Live Aid concert, \"The Dating Game\", \"The Richard Simmons Show\", \"Entertainment Tonight\" and many others. Are we justified in saying that \"Jerry Kupcinet has directed \"The Dating Game\"\"? Yes, no, or maybe? Yes\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups. 
Are we justified in saying that \"Mohamed Nauter has scored 59 goals in his career.\"? Yes, no, or maybe? Maybe\n###\nState Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long. Are we justified in saying that \"the route used to be shorter but was doubled out of necessity\"? Yes, no, or maybe? Maybe\n###\nThe 2016 MBC Entertainment Awards () presented by Munhwa Broadcasting Corporation (MBC), took place on December 29, 2016 at MBC Public Hall in Sangam-dong, Mapo-gu, Seoul. It was hosted by Kim Sung-joo, Jun Hyun-moo and Lee Sung-kyung. The nominees were chosen from MBC variety, talk and comedy shows that aired from December 2015 to November 2016. Are we justified in saying that \"The 2016 MBC Entertainment Awards () presented by Munhwa Broadcasting Corporation (MBC), took place on november 29, 2016 \"? Yes, no, or maybe? No\n###\nThe third season of \"Next Great Baker\" aired from November 26, 2012 to February 11, 2013. Like the previous season, this season was set at the Carlo's Bake Shop facility at Lackawanna Center in Jersey City, New Jersey. Unlike the previous two seasons, the finale for this season took place outside of the greater New York City area \u2013 in this case, in Las Vegas, Nevada at The Venetian Las Vegas. Are we justified in saying that \"Next show can be done outside of the US\"? Yes, no, or maybe?", "doc_id": 122, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27351, 25179, 11591, 10796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park. Are we justified in saying that \"Newnes railway line has never been closed.\"? Yes, no, or maybe? No\n###\nDonaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville. Are we justified in saying that \"Donaldson Center Airport is in Canada.\"? Yes, no, or maybe? No\n###\nDickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority. 
Are we justified in saying that \"Dickinson owns an airport in North Dakota\"? Yes, no, or maybe? Yes\n###\nNeil Sedaka: Italiano is a 1964 compilation album containing twelve of Neil Sedaka's Italian-language recordings. It was released in Italy by RCA Victor's Italiana studios. Of the twelve songs on the album, six were recorded by Sedaka in English. A seventh song on the album, \"A 16 Anni Tu Vuoi Amare\", is an Italian-language version of Andrea Carroll's 1963 hit, \"It Hurts To Be Sixteen\". Are we justified in saying that \"Neil Sedaka speaks Italian.\"? Yes, no, or maybe? Yes\n###\nNeilson Hubbard is an American singer-songwriter, musician and producer. His first band was called This Living Hand formed with Clay Jones. They signed to Adam Duritz's label, E Pluribus Unum. After the band split up, Hubbard went on to record three solo albums, \"The Slide Project\", \"Why Men Fail\" and \"Sing Into Me\". He also collaborated with Matthew Ryan to form the band Strays Don't Sleep. Are we justified in saying that \"\"The Slide Project\" was released under the \"E Pluribus Unum\" label.\"? Yes, no, or maybe?", "doc_id": 321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40049, 37429, 32636, 4881], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Love's Labour's Lost is a 2000 adaptation of the comic play of the same name by William Shakespeare, directed by and starring Kenneth Branagh. It was the first feature film to be made of this lesser-known comedy. Branagh's fourth film of a Shakespeare play (he did not direct the 1995 \"Othello\", although he did play Iago), \"Love's Labour's Lost\" was a box-office and critical disappointment. Are we justified in saying that \"Kenneth Branagh has directed other film that was adapted after Williams Shakespeare.\"? Yes, no, or maybe? Yes\n###\nHomebrew is a free and open-source software package management system that simplifies the installation of software on Apple's macOS operating system. Originally written by Max Howell, the package manager has gained popularity in the Ruby on Rails community and earned praise for its extensibility. Homebrew has been recommended for its ease of use as well as its integration into the command line. Are we justified in saying that \"There are other open-source software package management systems that are cheaper\"? Yes, no, or maybe? Maybe\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. Are we justified in saying that \"Amy Timberlake wrote the Art of the Deal\"? Yes, no, or maybe? No\n###\nPeter L. N. Padfield (born 1932) is a British author, biographer, historian, and journalist who specializes in naval history and in the Second World War period. His early journalism appeared under the name P. L. N. 
Padfield. As well as his non-fiction work, he has also published four novels. Are we justified in saying that \"Peter L.N. Padfield was an astute observer of literary trends. \"? Yes, no, or maybe? Maybe\n###\nDavid K. Lam is a Chinese-born American technology entrepreneur. He founded Lam Research Corporation in 1980. He presently serves as Chairman of Multibeam Corporation (Santa Clara, CA), which manufactures complementary electron beam lithography (CEBL) systems. He also heads the David Lam Group, an investor and business advisor for high-growth technology companies. Are we justified in saying that \"The David Lam Group is headed by an American born man\"? Yes, no, or maybe?", "doc_id": 47, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26610, 12308, 16951, 31142], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming. Are we justified in saying that \"Cape Vakop was chartered over 60 years ago\"? Yes, no, or maybe? Yes\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release. Are we justified in saying that \"Shameless Self-Promotion is the first album\"? Yes, no, or maybe? Yes\n###\nMetal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed. Are we justified in saying that \"Kojima directed Metal Gear.\"? Yes, no, or maybe? Yes\n###\nOlivia Genevieve Wells (born 29 April 1994 in Melbourne) is an Australian charity worker and beauty pageant titleholder who was crowned Miss Universe Australia 2013 and represented Australia at Miss Universe 2013 in Moscow, Russia on 9 November 2013. She is the first woman to be crowned Miss Universe Australia from the state of Victoria. Are we justified in saying that \"Olivia Genevieve Wells was from Moscow.\"? Yes, no, or maybe? No\n###\nBilly Jacques was a rugby union and professional rugby league footballer who played in the 1890s, and 1900s, playing representative level rugby union (RU) for Yorkshire, and at club level for Hull F.C. (Prior to the 1895\u201396 Northern Rugby Football Union season, Hull F.C. was a rugby union club), and playing club level rugby league (RL) for St. Helens, and Hull F.C. 
Are we justified in saying that \"he was the most successful rugby player in the 80's\"? Yes, no, or maybe?", "doc_id": 856, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40316, 25337, 608, 43953], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rutgers University (officially known as Rutgers, The State University of New Jersey) is an institution of higher learning with campuses across the State of New Jersey its main flagship campus in New Brunswick and Piscataway, and two other campuses in the cities of Newark and Camden, New Jersey. Are we justified in saying that \"Rutgers University has only four campuses\"? Yes, no, or maybe? Yes\n###\nThe 1941 U.S. Open was the 45th U.S. Open, held June 5\u20137 at Colonial Country Club in Fort Worth, Texas. Craig Wood, who had lost in a playoff at the U.S. Open two years earlier, finally broke through and claimed his first U.S. Open title, three strokes ahead of runner-up Denny Shute in sweltering heat. Eight years earlier, Shute had defeated him in a playoff at the 1933 British Open. Are we justified in saying that \"The 1941 U.S. Open was held on the 5th\"? Yes, no, or maybe? Yes\n###\nLena \"Lenny\" Kaligaris is a fictional character in \"The Sisterhood of the Traveling Pants\", a best-selling series of young adult novels by Ann Brashares. In the 2005 film version of the first book, and the 2008 sequel, \"The Sisterhood of the Traveling Pants 2\", she is portrayed by Alexis Bledel. Are we justified in saying that \"Ann Brashares doesn't know how to read.\"? Yes, no, or maybe? No\n###\n\"You'll Be Back\" is the seventh song from Act 1 of the musical \"Hamilton\", based on the life of Alexander Hamilton, which premiered on Broadway in 2015. Lin-Manuel Miranda wrote both the music and lyrics to the song. It is sung by Jonathan Groff in the show's original cast recording. Are we justified in saying that \"The song was first performed by Groff.\"? Yes, no, or maybe? Yes\n###\nKirill Olegovich Starkov (Russian:\u041a\u0438\u0440\u0438\u043b\u043b \u041e\u043b\u0435\u0433\u043e\u0432\u0438\u0447 \u0421\u0442\u0430\u0440\u043a\u043e\u0432, born March 31, 1987), is a professional Danish ice hockey player. He is playing for HC Red Ice in the Swiss National League B. He has previously played for CSKA Moscow, Syracuse Crunch, Youngstown Steelhounds, Red Deer Rebels, Fr\u00f6lunda HC, Timr\u00e5 IK, Esbjerg IK and IK Oskarshamn. Are we justified in saying that \"Youngstown Steelhounds is a hockey team in the Swiss National League.\"? Yes, no, or maybe?", "doc_id": 248, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35648, 34069, 28829, 41098], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "weRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo. Are we justified in saying that \"weRead is on the internet\"? Yes, no, or maybe? Yes\n###\nThe London Saturday Journal was a general interest magazine publishing short fiction and nonfiction pieces published in London, England in the Victorian era. The magazine was published by William Smith. During its existence the magazine had four volumes the last of which was issued in 1842. Are we justified in saying that \"The London Saturday Journal was the best selling magazine in London, England in 1841\"? Yes, no, or maybe? Maybe\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015. Are we justified in saying that \"the winner of the 211 red bull manny mania amateur u.s. championship is sebo walker\"? Yes, no, or maybe? Yes\n###\nLA1:TV (strictly the Lancaster University Student Television Station, often simply LA1) is a non-profit student television station at Lancaster University. It is a constitutional part of the Lancaster University Students' Union (LUSU) but is run as an independent student society. Some of LA1\u2019s current programmes include \"Good Morning Lancaster\" (GML), \"Sugar TV\", and \"Sound Booth\". Are we justified in saying that \"Good Morning Lancaster has received several primetime Emmy awards.\"? Yes, no, or maybe? Maybe\n###\nThe Highway of Hope is a 1917 American Western silent film directed by Howard Estabrook and written by Harvey Gates and Willard Mack. The film stars House Peters, Sr., Kathlyn Williams, Jim Farley and Harry De Vere. The film was released on May 17, 1917, by Paramount Pictures. Are we justified in saying that \"The Highway of Hope had five actors in it. \"? Yes, no, or maybe?", "doc_id": 864, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36152, 26174, 35530, 35480], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stuart is a city in, and the seat of, Martin County, Florida, United States. Located on Florida's Treasure Coast, Stuart is the largest of four incorporated municipalities in Martin County. The population was 15,593 in the 2010 census. It is part of the Port St. Lucie, Florida Metropolitan Statistical Area. Are we justified in saying that \"Stuart is very rich in history.\"? Yes, no, or maybe? Maybe\n###\nThe Feed icon is for indicating that a web feed is available on a web page. 
It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla. Are we justified in saying that \"The icon is normally orange, with hex code #FA9B32. The original icon was created by Stephen Horlander, a designer at Mozilla.\"? Yes, no, or maybe? No\n###\nUSS \"Christopher\" (DE-100) was a Cannon class destroyer escort built for the U.S. Navy during World War II. She served in the Atlantic Ocean and provided escort service against submarine and air attack for Navy vessels and convoys. She was named for a Navy Cross recipient, Harold Jensen Christopher, who was killed at Pearl Harbor aboard on 7 December 1941. Are we justified in saying that \"US CHRISTOPER was built for world war I\"? Yes, no, or maybe? No\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines is a for-profit organization\"? Yes, no, or maybe? No\n###\nThe 2015 Latrobe City Traralgon ATP Challenger was a professional tennis tournament played on outdoor hard court. It was the fourth edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Traralgon, Australia between 26 October \u2013 1 November 2015. Are we justified in saying that \"The 2015 Latrobe City Traralgon ATP Challenger had a different name\"? Yes, no, or maybe?", "doc_id": 687, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13925, 21128, 42139, 16695], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mannlicher\u2013Sch\u00f6nauer (sometimes Anglicized as \"Mannlicher Schoenauer,\" Hellenized as \u03a4\u03c5\u03c6\u03ad\u03ba\u03b9\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1 or \u038c\u03c0\u03bb\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1-\u03a3\u03b5\u03bd\u03ac\u03bf\u03c5\u03b5\u03c1) is a type of rotary-magazine bolt-action rifle produced by Steyr Mannlicher for the Greek Army in 1903 and later was also used in small numbers by the Austro-Hungarian armies. Post war use was for civilian use such as hunting and target practice. Are we justified in saying that \"The Mannlicher\u2013Sch\u00f6nauer killed the most people.\"? Yes, no, or maybe? Maybe\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\". Are we justified in saying that \"Linkin Park is from America.\"? Yes, no, or maybe? Yes\n###\nCity Hall Station is a station on Seoul Subway lines 1 (Blue Line) and 2 (Green Line). 
As its name suggests, Seoul City Hall is located right next to the station. Deoksugung, a historic palace of the Joseon dynasty, is on the other side of the boulevard named Taepyeongno. Are we justified in saying that \"You must take the train to get from City Hall Station to Seoul City Hall\"? Yes, no, or maybe? Maybe\n###\nFS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani. Are we justified in saying that \"Kozani Stadium is not located in Athens. \"? Yes, no, or maybe? Yes\n###\n\"Sultans of Swing\" is a song by British rock band Dire Straits from their eponymous debut album, which band frontman Mark Knopfler wrote and composed. Although it was first released in 1978, it was its 1979 re-release that caused it to become a hit in both the UK and U.S. Are we justified in saying that \"\"Sultans of Swing\" is a song by British rock band Dire Straits that was released and re-released only one years apart.\"? Yes, no, or maybe?", "doc_id": 427, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44037, 7452, 16967, 16407], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Lonely Londoners is a 1956 novel by Tamil Trinidadian author Samuel Selvon. Its publication marked the first literary work focusing on poor, working-class blacks in the beat writer tradition following the enactment of the British Nationality Act 1948. Are we justified in saying that \"The author was African American.\"? Yes, no, or maybe? Maybe\n###\nRodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets. Are we justified in saying that \"Rodrequis La'Vant Stephens lived in Georgia, and played in the NFL for multiple teams.\"? Yes, no, or maybe? Yes\n###\nThe Eolica Sarichioi Wind Farm is a proposed wind power project in Sarichioi, Tulcea County, Romania. It will consist of eight individual wind farms connected together. It will have 51 individual wind turbines with a nominal output of around 2 MW which will deliver up to 102 MW of power, enough to power over 66,700 homes, with a capital investment required of approximately US$110 million. Are we justified in saying that \"The Eolica Sarichioi Wind Farm will be profitable. \"? Yes, no, or maybe? Maybe\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis. 
Are we justified in saying that \"Escape from Suburbia: Beyond the American Dream made millions.\"? Yes, no, or maybe? Maybe\n###\nEuroprop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas. Are we justified in saying that \"Snecma and Rolls-Royce are two European aircraft engine manufacturers.\"? Yes, no, or maybe?", "doc_id": 172, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29104, 21399, 5993, 24962], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time. Are we justified in saying that \"Thomas Jefferson was the governor of Virginia \"? Yes, no, or maybe? Yes\n###\nRoderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins. Are we justified in saying that \"Roderick Dwayne \"Rod\" Higgins is fat.\"? Yes, no, or maybe? Maybe\n###\nMike Hoffman (born September 20, 1980) is an American former professional ice hockey player. After leaving the University of Connecticut in 2003, he began his first pro season playing with the Worcester IceCats in the AHL and the Peoria Rivermen of the ECHL. He signed a professional contract with the Toronto Maple Leafs in 2005, but he has never played in the National Hockey League. Are we justified in saying that \"Mike Hoffman is still as good at hockey today as he was 2 years ago.\"? Yes, no, or maybe? Maybe\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line. Are we justified in saying that \"The Takoma Langley Crossroads Transit Center is a favourite of bus drivers\"? Yes, no, or maybe? Maybe\n###\nThe Eolica Sarichioi Wind Farm is a proposed wind power project in Sarichioi, Tulcea County, Romania. It will consist of eight individual wind farms connected together. It will have 51 individual wind turbines with a nominal output of around 2 MW which will deliver up to 102 MW of power, enough to power over 66,700 homes, with a capital investment required of approximately US$110 million. Are we justified in saying that \"Sarichioi is a rural city.\"? 
Yes, no, or maybe?", "doc_id": 879, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22316, 36920, 42636, 7567], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hell's Kitchen Australia is an Australian cooking reality competition television series which premiered on the Seven Network on 6 August 2017. The series is hosted by British chef Marco Pierre White, who previously hosted two seasons of the British version of the format and appeared in rival program \"MasterChef Australia\". Are we justified in saying that \"\"MasterChef Australia\" is better than hell's kitchen\"? Yes, no, or maybe? Maybe\n###\nUniversity of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund. Are we justified in saying that \"University of Maryland Eastern Shore is a rubbish university \"? Yes, no, or maybe? Maybe\n###\nMaurice Anthony Foley (9 October 1925 \u2013 8 February 2002) was a British Labour Party politician. He was elected as Member of Parliament (MP) for West Bromwich at a by-election in 1963, and represented the constituency until his resignation from the House of Commons in 1973. His successor in the resulting by-election was the future Speaker of the British House of Commons, Betty Boothroyd. Are we justified in saying that \"Maurice Anthony Foley currently has a long list of likes.\"? Yes, no, or maybe? Maybe\n###\nLeonard \"Boogie\" Weinglass (born 1941) is a charismatic American businessman who founded retailer Merry-Go-Round, a chain of restaurants named Boogie\u2019s Diner, and whose early life was portrayed by actor Mickey Rourke in the 1982 classic American film \"Diner\". Are we justified in saying that \"Leonard Wineglass's popularity as a Google search term rose when the movie with Mickey Rourke was released.\"? Yes, no, or maybe? Maybe\n###\nBallads of Sacco & Vanzetti is a set of ballad songs, written and performed by Woody Guthrie, related to the trial, conviction and execution of Sacco and Vanzetti. The series was commissioned by Moe Asch in 1945 and recorded in 1946 and 1947. Guthrie never completed the project and was unsatisfied by the result. The project was released later in its abandoned form by Asch. Are we justified in saying that \"Moe Asch completed the Ballads of Sacco & Vanzetti after the writer, Woody Guthrie, abandoned the project.\"? Yes, no, or maybe?", "doc_id": 865, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43421, 42124, 9876, 18694], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia. Are we justified in saying that \"The Head of The Investigative Committee of Russia was born in the summertime,\"? Yes, no, or maybe? Yes\n###\nALGOL 68 (short for ALGOrithmic Language 1968) is an imperative computer programming language that was conceived as a successor to the ALGOL 60 programming language, designed with the goal of a much wider scope of application and more rigorously defined syntax and semantics. Are we justified in saying that \"ALGOL 68 is important.\"? Yes, no, or maybe? Maybe\n###\nA governorate is an administrative division of a country. It is headed by a governor. As English-speaking nations tend to call regions administered by governors either states, provinces, or colonies, the term \"governorate\" is often used in translation from non-English-speaking administrations. Are we justified in saying that \"A governorate is made up of over 1000 people.\"? Yes, no, or maybe? Maybe\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister. Are we justified in saying that \"Lou Gosset was not involved in J.D.'s Revenge\"? Yes, no, or maybe? No\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla. Are we justified in saying that \"The Feed icon was invited for the use of RSS.\"? Yes, no, or maybe?", "doc_id": 487, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35870, 32093, 5218, 21447], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nicotiana langsdorffii, Langsdorff's tobacco, is a species of the \"Nicotiana\" genus (tobacco). It is an annual plant with large leaves (up to 10 inches long) with tall 2 inch nodding long tubular bell shaped flowers that are apple green in colour, with blue anthers. \"N. langsdorfii\" lacks fragrance unlike some of the other tall species. It is grown as an ornamental garden plant. 
Are we justified in saying that \"Langsdorff's tobacco is a not a very popular green plant\"? Yes, no, or maybe? Maybe\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"Tasmania was made popular due to a television cartoon series.\"? Yes, no, or maybe? Maybe\n###\nMount Doom is a fictional volcano in J.R.R. Tolkien's Middle-earth legendarium. It is located in the northwest of the Black Land of Mordor and close to Barad-d\u00fbr. Alternative names, in Tolkien's invented language of Sindarin, include Orodruin (\"fiery mountain\") and Amon Amarth (\"mountain of fate\"). Are we justified in saying that \"Mount Doom is located in Mordor\"? Yes, no, or maybe? Yes\n###\nFinniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla. Are we justified in saying that \"There are more than 5 towns in South Australia. \"? Yes, no, or maybe? Yes\n###\nLuke Strong is a fictional character from the British ITV soap opera, \"Coronation Street\". Portrayed by Craig Kelly, the character appeared throughout 2009. Luke took over Carla Connor's share of the Underworld factory with Tony Gordon. He knew Carla's deceased husband, Paul Connor. Are we justified in saying that \"Luke Strong and Carla Connor worked together\"? Yes, no, or maybe?", "doc_id": 323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28511, 21532, 16538, 39097], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vampire Vs Vampire (\u4e00\u7709\u9053\u4eba) is a 1989 Hong Kong comedy horror film directed by and starring Lam Ching-ying. The title references the interaction in the film between a jiangshi child, a creature from Chinese \"hopping\" corpse fiction, and a British vampire based on Western vampire fiction. Are we justified in saying that \"Lam Ching-ying's favorite film is Vampire Vs Vampire\"? Yes, no, or maybe? Maybe\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex. Are we justified in saying that \"Croton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It has an ugly red and thick latex.\"? Yes, no, or maybe? Maybe\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. 
It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister. Are we justified in saying that \"Revenge is a common theme in blaxploitation movies.\"? Yes, no, or maybe? Maybe\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene. Are we justified in saying that \"\"Something from Nothing\" was the lead single from the Foo Fighters' eponymous studio album.\"? Yes, no, or maybe? No\n###\nThe third season of \"Gossip Girl,\" an American teen drama based upon the book series by Cecily von Ziegesar. Developed for television by Josh Schwartz and Stephanie Savage. Airing on The CW from September 14, 2009 to May 17, 2010 with 22 episodes. The season premiered 2.55 million viewers and a 1.4 Adults 18-49 rating, up 14% in viewers from its season two finale. Are we justified in saying that \"\"Gossip Girl\" is a teen drama that attracts adult audience as well.\"? Yes, no, or maybe?", "doc_id": 839, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32474, 25085, 16764, 20238], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada. Are we justified in saying that \"The ACT has events in New York City.\"? Yes, no, or maybe? Maybe\n###\nKaalamellam Kaathiruppen (Tamil: \u0b95\u0bbe\u0bb2\u0bae\u0bc6\u0bb2\u0bcd\u0bb2\u0bbe\u0bae\u0bcd \u0b95\u0bbe\u0ba4\u0bcd\u0ba4\u0bbf\u0bb0\u0bc1\u0baa\u0bcd\u0baa\u0bc7\u0ba9\u0bcd ; English: I Will Wait Forever ) is 1997 Tamil romance film directed by R. Sundarrajan. The film stars Vijay and Dimple in the lead roles, while R. Sundarrajan, Jaishankar, Srividya, Karan, Manivannan play other pivotal roles. The music for the film was composed by Deva and the film released on 14 January 1997. Are we justified in saying that \"Deva was one of the lead roles in the movie.\"? Yes, no, or maybe? No\n###\n\"Outro\" is a song by French electronic music artist M83, released as the final track on the group's sixth studio album, \"Hurry Up, We're Dreaming\" (2011). It is a dramatic, symphonic rock song which has evoked \"heartbreak, nostalgia, anticipation, jubilation and triumph\". Are we justified in saying that \"Outro was sung by Obama.\"? Yes, no, or maybe? Maybe\n###\nErnest Guiraud (] ; 26 June 1837 \u2013 6 May 1892) was a French composer and music teacher born in New Orleans, Louisiana. 
He is best known for writing the traditional orchestral recitatives used for Bizet's opera \"Carmen\" and for Offenbach's opera \"Les contes d'Hoffmann\" (\"The Tales of Hoffmann\"). Are we justified in saying that \"Ernest Guiraud wrote \"Carmen\"\"? Yes, no, or maybe? No\n###\nFerry County is a county located in the U.S. state of Washington. As of the 2010 census, the population was 7,551, making it the fourth-least populous county in Washington. The county seat and largest city is Republic. The county was created out of Stevens County on February 21, 1899 and is named for Elisha P. Ferry, the state's first governor. Are we justified in saying that \"U.S. state of Washington has a population bigger 7,556.\"? Yes, no, or maybe?", "doc_id": 826, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27954, 16281, 24885, 593], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. Are we justified in saying that \"Joseph Smith had a Mother.\"? Yes, no, or maybe? Yes\n###\nDual-role transvestism is the formal diagnosis used by psychologists and physicians to describe people who wear clothes of the opposite sex to experience being the opposite sex temporarily, but don't have a sexual motive or want gender reassignment surgery. The International Classification of Diseases (ICD-10) list three diagnostic criteria for \"Dual-role transvestism\" (F64.1): Are we justified in saying that \"Dual-role transvestism is what Bowie has.\"? Yes, no, or maybe? Maybe\n###\nHim & Her is a British television sitcom about a lazy twenty-something couple: Steve and Becky, who live in Walthamstow, London. It was first broadcast in the United Kingdom on BBC Three on 6 September 2010. It is written by Stefan Golaszewski and stars Russell Tovey and Sarah Solemani. The theme tune is the song \"Boom Bang-a-Bang\" by Lulu. Are we justified in saying that \"Steve and Becky combined age is 60\"? Yes, no, or maybe? No\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others. Are we justified in saying that \"Joseph Lovett won an academy award for this documentary.\"? Yes, no, or maybe? Maybe\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla. 
Are we justified in saying that \"The Feed icon was created by a team of 3 people.\"? Yes, no, or maybe?", "doc_id": 79, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13904, 32942, 21610, 44948], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o starts with a J.\"? Yes, no, or maybe? Yes\n###\nThe United Nations Peacekeepers Medal (Irish: \"An Bonn Chosant\u00f3ir\u00ed Sioch\u00e1na na N\u00e1isi\u00fan Aontaithe\" ) is awarded to those members of the Irish Defence Forces or Chaplaincy Service who have served overseas on a United Nation Mission or United Nations Mandated Mission. Are we justified in saying that \"The United Nations Peacekeepers Medal is awarded every December.\"? Yes, no, or maybe? Maybe\n###\nSaiyuki (\u6700\u904a\u8a18 , Saiy\u016bki ) is a manga series by Kazuya Minekura which was serialized in \"G-Fantasy\" from 1997 to 2002. It spawned multiple manga sequels, anime adaptations, video games and other media. The story is loosely based on the Chinese novel \"Journey to the West\". Are we justified in saying that \"Saiyuki was born to fantasy in the chinese novel\\\"? Yes, no, or maybe? Maybe\n###\nYear 493 BC was a year of the pre-Julian Roman calendar. At the time, it was known as the Year of the Consulship of Auruncus and Viscellinus (or, less frequently, year 261 \"Ab urbe condita\"). The denomination 493 BC for this year has been used since the early medieval period, when the Anno Domini calendar era became the prevalent method in Europe for naming years. Are we justified in saying that \"493 BC was 100 years ago\"? Yes, no, or maybe? No\n###\nPhichai Railway Station is a railway station located in Nai Mueang Subdistrict, Phichai District, Uttaradit. It is located 447.553\u00a0km from Bangkok Railway Station and is a class 2 railway station. It is on the Northern Line of the State Railway of Thailand. Phichai Railway Station opened as part of the Northern Line extension from Phitsanulok to Ban Dara Junction in November 1908. Are we justified in saying that \"Phichai Railway Station was under construction in 1906.\"? Yes, no, or maybe?", "doc_id": 596, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17566, 38842, 19983, 17386], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Neilson Hubbard is an American singer-songwriter, musician and producer. His first band was called This Living Hand formed with Clay Jones. They signed to Adam Duritz's label, E Pluribus Unum. After the band split up, Hubbard went on to record three solo albums, \"The Slide Project\", \"Why Men Fail\" and \"Sing Into Me\". He also collaborated with Matthew Ryan to form the band Strays Don't Sleep. Are we justified in saying that \"Hubbard created his own three solo albums prior to being in Strays Don't Sleep.\"? Yes, no, or maybe? Yes\n###\nJohnny Kidd (born Frederick Albert Heath, 23 December 1935 \u2013 7 October 1966) was an English singer and songwriter, best remembered as the lead vocalist for the rock and roll band Johnny Kidd & the Pirates. He was one of the few pre-Beatles British rockers to achieve worldwide fame, mainly for his 1960 hit, \"Shakin' All Over\". Are we justified in saying that \"Johnny Kidd also sang in German.\"? Yes, no, or maybe? Maybe\n###\nMarwin Javier Gonz\u00e1lez (born March 14, 1989) is a Venezuelan professional baseball infielder with the Houston Astros of Major League Baseball (MLB). Primarily a shortstop, Gonz\u00e1lez has appeared at every position except for pitcher and catcher for the Astros. Are we justified in saying that \"He is in his forties.\"? Yes, no, or maybe? No\n###\nMargarita la tornera (Margarita the Gatekeeper) is an opera in three acts composed by Ruperto Chap\u00ed to a libretto by Carlos Fern\u00e1ndez Shaw, based on a dramatic poem by Jos\u00e9 Zorrilla. It premiered on February 24, 1909 at the Teatro Real in Madrid in a performance conducted by the composer. An acclaimed recording of the opera came out in 1999 with Pl\u00e1cido Domingo and Elisabete Matos. Are we justified in saying that \"The inspiration behind Margarita la tornera was a poem\"? Yes, no, or maybe? Yes\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound. Are we justified in saying that \"The Bavarian Mountain Hound is found in London. \"? Yes, no, or maybe?", "doc_id": 731, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25165, 28380, 33774, 30082], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. 
Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\". Are we justified in saying that \"\"The Call\" charted in the UK.\"? Yes, no, or maybe? Maybe\n###\nGuns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin). Are we justified in saying that \"Guns of Diablo won many awards.\"? Yes, no, or maybe? Maybe\n###\nWenham Parva is a village and a civil parish in Suffolk, England. It covers the village of Little Wenham (whose ancient name it takes) and the hamlet of Wenham Grange. Located in Babergh district, it had a population of 20 in 2005, making it the joint-least populated parish in Suffolk alongside South Cove, Wangford and Wordwell. At the 2011 Census the population had increased to 185. Are we justified in saying that \"In 2005 South Cove and Wenham Parva both had small populations.\"? Yes, no, or maybe? Yes\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers. Are we justified in saying that \"McColo was founded in the 20th century.\"? Yes, no, or maybe? Maybe\n###\nSt Kilda is an inner suburb (neighbourhood) of the metropolitan area of Melbourne, Victoria, Australia, 6 km south-east of Melbourne's Central Business District. Its local government area is the City of Port Phillip. At the 2011 Census, St Kilda had a population of 17,795. Are we justified in saying that \"The City of Port Philip has a population of 17,795\"? Yes, no, or maybe?", "doc_id": 569, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23743, 45143, 3530, 24235], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Hold Me Tight\" is a rock and roll song by English rock group the Beatles from their 1963 album \"With the Beatles\". It was first recorded during the \"Please Please Me\" album session, but not selected for inclusion and re-recorded for their second album. Are we justified in saying that \"The \"Please Please Me\" album recording session lasted for a few months in 1961 \"? Yes, no, or maybe? Maybe\n###\nJaron Long (born August 28, 1991) is an American professional baseball pitcher who is with the Washington Nationals organization. Prior to playing professionally, Long played college baseball for Chandler-Gilbert Community College and Ohio State University. His father, Kevin Long, is the current hitting coach of the New York Mets and former hitting coach of the New York Yankees. Are we justified in saying that \"Long is the first in his family to go into baseball.\"? Yes, no, or maybe? 
No\n###\nLes Sept Paroles du Christ sur la Croix (composed 1859) is a musical setting of The Seven Last Words of Christ by C\u00e9sar Franck, though the name in French often refers to an equally well or better known homonymous work by Charles Gounod \"Les sept paroles de N.S. Jesus-Christ sur la croix\". Are we justified in saying that \"Les Sept Paroles du Christ sur la Croix was a successful musical written and performed in France \"? Yes, no, or maybe? Maybe\n###\nFatsia japonica(syn. \"Aralia japonica\" Thunb., \"A. sieboldii\" Hort. ex K.Koch), also glossy-leaf paper plant, fatsi, paperplant or Japanese aralia, is a species of flowering plant in the family Araliaceae, native to southern Japan, southern Korea, and Taiwan. Are we justified in saying that \"Fatsia Japonica is native to the US\"? Yes, no, or maybe? No\n###\nWalking on Sunshine is a 2014 British romantic musical comedy-drama film directed by Max Giwa and Diana Pasquini. The film features covers of songs from the 1980s and was released on 27 June 2014. It is also a debut role for singer-songwriter Leona Lewis. Are we justified in saying that \"Walking on Sunshine was the debut film for two people.\"? Yes, no, or maybe?", "doc_id": 991, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15683, 36201, 11321, 34518], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Builth Castle (Welsh: \"Castell Llanfair-ym-Muallt\" ) was a castle built under King Edward I, just outside Builth Wells, Powys, Wales. At one time it was an impressive stone-built castle but all the masonry has been removed over the years and all that remains are the mound on which it stood, the ditches and embankments. Are we justified in saying that \"The castle stood on a mound \"? Yes, no, or maybe? Yes\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city. Are we justified in saying that \"There was a census in 2010\"? Yes, no, or maybe? Yes\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy. Are we justified in saying that \"Daniel Murphy was born February 24, 1994.\"? Yes, no, or maybe? Maybe\n###\n\"Pour Me\" is a debut song recorded by American country music group Trick Pony. It was released in October 2000 as the first single from their debut album \"Trick Pony\". The song was written by group members Keith Burns, Ira Dean and Heidi Newfield with Rory Waters Beighley and Sammy Harp Wedlock. Are we justified in saying that \"\"Pour Me\" sold 500000 copies\"? Yes, no, or maybe? 
Maybe\n###\nThe Castaways Hotel and Casino, formerly the Showboat Hotel and Casino was a hotel and casino located at the north end of the Boulder Strip in Las Vegas, Nevada. The hotel consisted of a 19 story tower containing 445 rooms, a casino and an adjacent RV park. The Castaways hotel was demolished on January 11, 2006 to make way for an unknown project. Are we justified in saying that \"In the adjacent RV park there are at least 1 RV's.\"? Yes, no, or maybe?", "doc_id": 866, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34, 6247, 43439, 18874], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Joan Ganz Cooney Center (informally, the Cooney Center) is an independent, non-profit, non-partisan research and innovation group founded by Sesame Workshop in order to advance children\u2019s literacy skills and foster innovation in children\u2019s learning through digital media. Are we justified in saying that \"Sesame Workshop has put millions into the Joan Ganz Cooney Center for further development of digital media learning.\"? Yes, no, or maybe? Maybe\n###\nTango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center. Are we justified in saying that \"George Balanchine was a choreographer of ballet.\"? Yes, no, or maybe? Yes\n###\nThe Altar Valley is a 45-mile (72\u00a0km) long north-south valley, trending slightly northeast from Sasabe, Arizona on the Mexico border to the Avra Valley west of the Tucson Mountains. It is delimited by Arizona State Route 86, from east-to-west on the north separating it from the Avra Valley which then trends \"northwesterly\", merging into the plains and drainage of the Santa Cruz River. Are we justified in saying that \"Altar extends over 7900 yards across the valley\"? Yes, no, or maybe? Yes\n###\nManos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Greek modeling agency since 2004. He participated in several international film projects as a lead main actor. Are we justified in saying that \"Krystalis is engaged to a model.\"? Yes, no, or maybe? Maybe\n###\nThe Sound of Waves (\u6f6e\u9a12 , Shiosai ) is a 1954 novel by the Japanese author Yukio Mishima. It is a coming-of-age story of the protagonist Shinji and his romance with Hatsue, the beautiful daughter of the wealthy ship owner Terukichi. For this book Mishima was awarded the Shincho Prize from Shinchosha Publishing in 1954. It has been adapted for film five times. Are we justified in saying that \"The book was awarded a prize within 12 months of being published.\"? 
Yes, no, or maybe?", "doc_id": 968, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20485, 42362, 36698, 18987], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "George White's Scandals is a 1934 American musical film directed by George White and written by Jack Yellen. The film stars Rudy Vall\u00e9e, Jimmy Durante, Alice Faye, Adrienne Ames, Gregory Ratoff, Cliff Edwards and Dixie Dunbar. The film was released on March 16, 1934, by Fox Film Corporation. Are we justified in saying that \"George White is a big man\"? Yes, no, or maybe? Maybe\n###\n\"Inbetweener\" is a song by English Britpop band Sleeper, written by the band's vocalist and guitarist Louise Wener. It was the third single to be released from their debut album \"Smart\" in 1995 (see 1995 in British music). It was their breakthrough single, Are we justified in saying that \"The song Inbetweener by English Britpop band Sleeper is over 100 years old\"? Yes, no, or maybe? No\n###\nFaer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting. Are we justified in saying that \"Dungeons and dragons is a boring game\"? Yes, no, or maybe? Maybe\n###\nThe Prime Minister's XI or PM's XI (formerly Australian Prime Minister's Invitation XI) is an invitational cricket team picked by the Prime Minister of Australia for an annual match held at the Manuka Oval in Canberra against an overseas touring team. The Australian team usually includes up and coming players. Are we justified in saying that \"Cricket is a difficult sport.\"? Yes, no, or maybe? Maybe\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staunton Mall is a mall in , Virginia, United States\"? Yes, no, or maybe?", "doc_id": 130, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34240, 15817, 13344, 14565], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. 
It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Smithereens was produced by a member of another band.\"? Yes, no, or maybe? Yes\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \"The Pursuit of Happyness flopped at the box office\"? Yes, no, or maybe? Maybe\n###\n51829 Williemccool (2001 OD ) is an asteroid named for astronaut Willie McCool, who was killed in the STS-107 (\"Columbia\") space shuttle reentry disaster on February 1, 2003. 51829 Williemccool was discovered on July 21, 2001 at Palomar Observatory by the JPL Near Earth Asteroid Tracking Program. Are we justified in saying that \"\"Wherever US is, We are.\" was a slogan from 1965\"? Yes, no, or maybe? Maybe\n###\nThe Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II. Are we justified in saying that \"Astra Modelo 400 was the sidearm standard in the army of the Spanish. \"? Yes, no, or maybe? Yes\n###\nBig Bad Voodoo Daddy is a contemporary swing revival band from Southern California. Their notable singles include \"Go Daddy-O\", \"You & Me & the Bottle Makes 3 Tonight (Baby)\", and \"Mr. Pinstripe Suit\". The band played at the Super Bowl XXXIII half-time show in 1999. Are we justified in saying that \"The band played at The band played at the Super Bowl XXXII\"? Yes, no, or maybe?", "doc_id": 578, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39771, 7334, 14759, 40419], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"He attended the University of Virginia and was a leading member of three NCAA Men's Tennis Championship winning teams.\"? Yes, no, or maybe? Maybe\n###\nThe 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017. Are we justified in saying that \"The 2017 City of Onkaparinga ATP Challenger is a tennis tournament\"? Yes, no, or maybe? 
Yes\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer. Are we justified in saying that \"von Houwald loved his mother. \"? Yes, no, or maybe? Maybe\n###\nMars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added. Are we justified in saying that \"Mars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". Members of the band decided to replace Sean O'Hagan with keyboardist Katharine Gifford changing the guitar sounds for the keyboard.\"? Yes, no, or maybe? Maybe\n###\nInferno (also released with the title, Operation Cobra) is a 1997 feature film directed by Fred Olen Ray starring Don Wilson, Deepti Bhatnagar and R. Madhavan. Evan Lurie, Michael Cavanaugh and Tan\u00e9 McClure appear in other pivotal roles. Wilson plays the role of Interpol agent Kyle Connors on a mission set in India. Are we justified in saying that \"Inferno has no plot.\"? Yes, no, or maybe?", "doc_id": 174, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10237, 22357, 15259, 14194], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grotto Geyser is a fountain-type geyser located in the Upper Geyser Basin in Yellowstone National Park in the United States. Grotto Geyser is the namesake for the group of geysers that includes Grotto Fountain Geyser, South Grotto Fountain Geyser, Indicator Spring, Spa Geyser, and Rocket Geyser. Are we justified in saying that \"Rocket Geyser is part of a group of five geysers.\"? Yes, no, or maybe? Yes\n###\nSwift Rivers is a children's historical novel by Cornelia Meigs. Set initially in 1835 in Minnesota, it is a story of the early days of the logging industry, when logs were floated down the Mississippi to St. Louis. The novel, illustrated by Forrest W. Orr, was first published in 1931 and was a Newbery Honor recipient in 1933. Are we justified in saying that \"The novel was set in 1835\"? Yes, no, or maybe? Yes\n###\nCastle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel. Are we justified in saying that \"Castle Wolfenstein was first a Playstation game.\"? Yes, no, or maybe? 
No\n###\nThe San Diego Chargers announced their 40th Anniversary Team in 2000 to honor the top players and coaches in the history of the National Football League team. The Chargers began play in 1960 as part of the American Football League. The anniversary team included 31 players and coaches voted on by fans and a media panel. The team became the Los Angeles Chargers after relocating in 2017. Are we justified in saying that \"The San Diego Chargers has been visited by Clinton.\"? Yes, no, or maybe? Maybe\n###\nHere is a list of all of KF Tirana's Cup seasons from 1939 till end of most recent season. This list shows where they finished the season, how many ties won or lost, how many goals they scored and conceded, how many wins draws and losses they had throughout the season, goal difference, winning difference and number of matches played. Are we justified in saying that \"The implied list shows KF Tirana's Cup season from at least the past 90 years. \"? Yes, no, or maybe?", "doc_id": 925, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33074, 24286, 9449, 24009], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The UK Overseas Territories Conservation Forum (UKOTCF) is a UK-based non-governmental organisation which promotes coordinated conservation in the UK Overseas Territories and Crown Dependencies (UKOTs and CDs). It is a not-for-profit organisation supported by grants, donations and subscriptions, and a registered charity and company. Are we justified in saying that \"UKOTCF helps other countries with environment issues as well.\"? Yes, no, or maybe? Maybe\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards. Are we justified in saying that \"Many engineers prefer the Proteus Design Suite over other software available in the design automation world. \"? Yes, no, or maybe? Maybe\n###\nSt. Petersburg is a city in Pinellas County, Florida, United States. As of the 2015 census estimate, the population was 257,083, making it the fifth-most populous city in Florida and the largest in the state that is not a county seat (the city of Clearwater is the seat of Pinellas County). Are we justified in saying that \"St. Petersburg is a city in Alabama, United States.\"? Yes, no, or maybe? No\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards. Are we justified in saying that \"The Proteus Design Suite is used by the US army\"? Yes, no, or maybe? Maybe\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. 
The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani. Are we justified in saying that \"Golshifteh Farahani was a famous actress when The Pear Tree was released.\"? Yes, no, or maybe?", "doc_id": 179, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38543, 44864, 19240, 40788], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Midnight Tides is the fifth volume of Canadian author Steven Erikson's epic fantasy series, the \"Malazan Book of the Fallen\". Although it is part of the larger series, it has only limited references to the previous books. However, it is not a stand-alone volume as the events of the books \"Reaper's Gale\" and \"Dust of Dreams\" follow on from it. Are we justified in saying that \"Steven Erickson is a citizen of a country that is north of Brazil.\"? Yes, no, or maybe? Yes\n###\nBride of Chucky is a 1998 American supernatural comedy slasher film, the fourth installment of the \"Child's Play\" franchise and sequel to 1991's \"Child's Play 3\". The film is written by Don Mancini and directed by Ronny Yu, and stars Jennifer Tilly (who plays and voices the title character Tiffany) and Brad Dourif (who voices Chucky), as well as John Ritter, Katherine Heigl and Nick Stabile. Are we justified in saying that \"There were 4 installments of the \"Child's Play\" franchise before Bride of Chucky in 1998\"? Yes, no, or maybe? No\n###\nThe following are lists of the most populous fully defined incorporated settlements in Nigeria by population. This page consists three different tables, with different kinds of settlements; a list for \"defined cities\", listing the population, strictly within the defined city limits, a list for \"urban area\" population, and another list for the population within metropolitan areas. Are we justified in saying that \"This page consists of three of the same tables.\"? Yes, no, or maybe? No\n###\nBath Salt Zombies is a 2013 American horror comedy directed by Dustin Mills, written by Mills and Clint Weller, and starring Josh Eal, Ethan Holey, Jackie McKown, Dave Parker, and Brandon Salkil. It is about zombie attacks brought on by concentrated bath salts. Are we justified in saying that \"Bath Salt Zombies is a comedy\"? Yes, no, or maybe? Yes\n###\nAz-Zahir Ali Hakim (born June 3, 1977) is a former American football wide receiver. He played college football at San Diego State. He was drafted by the St. Louis Rams in the fourth round (96th overall) of the 1998 NFL Draft. He also was a member of the Detroit Lions, New Orleans Saints, San Diego Chargers, Miami Dolphins, and Las Vegas Locomotives. Are we justified in saying that \"Hakim was drafted by both the army and the St. Louis Rams. \"? 
Yes, no, or maybe?", "doc_id": 339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44079, 34590, 18303, 32683], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zaiga Jansone-Ivanova (born (1951--)24 1951 ) is a former Soviet Latvian tennis player and tennis coach. She was a five-time Soviet champion in women's doubles, 1973 Summer Universiade champion in women's doubles (all with Olga Morozova) and winner of the exhibition tennis event of 1968 Olympics in mixed doubles (with (Vladimir Korotkov). Are we justified in saying that \"Zaiga Jansone-Ivanova will neither or always be remembered as winner of the exhibition tennis event of 1968 Olympics.\"? Yes, no, or maybe? Maybe\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season. Are we justified in saying that \"Their 2016 season was a losing season.\"? Yes, no, or maybe? Maybe\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system. Are we justified in saying that \"Elizabeth City State University is mostly a caucasian school\"? Yes, no, or maybe? No\n###\nAz-Zahir Ali Hakim (born June 3, 1977) is a former American football wide receiver. He played college football at San Diego State. He was drafted by the St. Louis Rams in the fourth round (96th overall) of the 1998 NFL Draft. He also was a member of the Detroit Lions, New Orleans Saints, San Diego Chargers, Miami Dolphins, and Las Vegas Locomotives. Are we justified in saying that \"azzahir played wideouts in football\"? Yes, no, or maybe? Yes\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Weltenbrand was a commercial failure\"? Yes, no, or maybe?", "doc_id": 386, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37520, 17085, 44769, 1995], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "NASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi. Are we justified in saying that \"NASA John H. Glenn Research Center at Lewis Field is a NASA center between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River. \"? Yes, no, or maybe? Yes\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television. Are we justified in saying that \"Sebastian Philip Bierk was a Canadian hip hop artist\"? Yes, no, or maybe? No\n###\nJohn Garman \"J. G.\" Hertzler Jr. (born March 18, 1950) is an American actor, author, screenwriter, and activist best known for his role on \"\" as the Klingon General (and later Chancellor) Martok, whom he portrayed from 1995 until the series' end in 1999. Are we justified in saying that \"John Garman \"J. G.\" Hertzler Jr. doesn't speak the official language of the United States.\"? Yes, no, or maybe? No\n###\nThe \"Minas Geraes\" class, spelled \"Minas Gerais\" in some sources, consisted of two battleships built for the Brazilian Navy in the early twentieth century. Named \"Minas Geraes\" and \"S\u00e3o Paulo\" , the ships were intended to be Brazil's first step towards becoming an international power, and they consequently initiated a South American naval arms race. Are we justified in saying that \"There were two battleships made for a navy in the early twentieth century.\"? Yes, no, or maybe? Yes\n###\nG.I. Joe: Ninja Battles is a film that was released on DVD in late 2004, as part of the Ninja Battles set of figures. In it, the history of the Arashikage Clan, as well as the history of Snake Eyes and Storm Shadow's rivalry, are examined through a series of trials. Scenes from both \"\" and \"\" are used, with a brief period of new animation at the end of the movie. Are we justified in saying that \"The film was released in the cinema.\"? Yes, no, or maybe?", "doc_id": 497, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1404, 20968, 39523, 6755], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Concrete Sky was the second EP released by Beth Orton, with the lead track taken from her 2002 album \"Daybreaker\". It contains four songs, and was released on CD. \"Concrete Sky\" features vocals and guitar from Ryan Adams, and was written by Beth Orton and Johnny Marr. 
Are we justified in saying that \"The lead track for \"Concrete Sky\" is not an original song made for the album. \"? Yes, no, or maybe? Yes\n###\nPaul Revere ( ; December 21, 1734 O.S.May 10, 1818) was an American silversmith, engraver, early industrialist, and Patriot in the American Revolution. He is best known for his midnight ride to alert the colonial militia in April 1775 to the approach of British forces before the battles of Lexington and Concord, as dramatized in Henry Wadsworth Longfellow's poem, \"Paul Revere's Ride\" (1861). Are we justified in saying that \"Revere took years to become an accomplished silversmith.\"? Yes, no, or maybe? Maybe\n###\nThe Boulton Paul Balliol and Sea Balliol were monoplane military advanced trainer aircraft built for the Royal Air Force (RAF) and the Royal Navy Fleet Air Arm (FAA) by Boulton Paul Aircraft. Developed in the late 1940s the Balliol was designed to replace the North American Harvard trainer and used the Rolls-Royce Merlin engine, with the Sea Balliol a naval version for deck landing training. Are we justified in saying that \"The Boulton Paul Balliol and Sea Balliol were expensive aircraft\"? Yes, no, or maybe? Maybe\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg. Are we justified in saying that \"Australia had a great depression in the 20th century.\"? Yes, no, or maybe? Yes\n###\nDallas was a town in Ouray County, Colorado, United States. It lay about 3 miles (5\u00a0km) north of the present town of Ridgway at the confluence of Dallas Creek and the Uncompahgre River. A community named in tribute to the historic town bearing the name Dallas Meadows now exists at its historic location. Are we justified in saying that \"Dallas lies over 5000 yards north of Ridgway\"? Yes, no, or maybe?", "doc_id": 106, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43851, 29207, 29309, 14706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Wire is an American crime drama television series set and produced in Baltimore, Maryland. Created and primarily written by author and former police reporter David Simon, the series was broadcast by the cable network HBO in the United States. \"The Wire\" premiered on June 2, 2002, and ended on March 9, 2008, comprising 60 episodes over five seasons. Are we justified in saying that \"David Simon was a police reporter in Baltimore.\"? Yes, no, or maybe? Yes\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant. Are we justified in saying that \"The USS Voyager does not have many aliens on it\"? Yes, no, or maybe? 
Maybe\n###\nWalking on Sunshine is a 2014 British romantic musical comedy-drama film directed by Max Giwa and Diana Pasquini. The film features covers of songs from the 1980s and was released on 27 June 2014. It is also a debut role for singer-songwriter Leona Lewis. Are we justified in saying that \"The film Walking on Sunshine was released in Los Angeles, California on June 27, 2014.\"? Yes, no, or maybe? Maybe\n###\nAnsar al-Sharia in Tunisia (\"\"Supporters of Islamic Law in Tunisia\"\") is a radical Islamist group that operates in Tunisia. It has around 1,000 people as part of the movement. It has been listed as a terrorist group by the Tunisian government as well by the United Nations, the UAE, the United Kingdom and the United States. Some of its members may be linked to the 2015 Sousse attacks. Are we justified in saying that \"Sharia Law will become a dominate force withing twenty years!\"? Yes, no, or maybe? Maybe\n###\nG.I. Joe: Ninja Battles is a film that was released on DVD in late 2004, as part of the Ninja Battles set of figures. In it, the history of the Arashikage Clan, as well as the history of Snake Eyes and Storm Shadow's rivalry, are examined through a series of trials. Scenes from both \"\" and \"\" are used, with a brief period of new animation at the end of the movie. Are we justified in saying that \"G.I. Joe: Ninja Battles was released on DVD more than 2002 years ago.\"? Yes, no, or maybe?", "doc_id": 923, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1855, 13658, 6688, 31193], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Trois sonneries de la Rose+Croix (\"Three Sonneries of the Rose+Cross\") is a piano composition by Erik Satie, first published in 1892, while he was composer and chapel-master of the Rosicrucian \"Ordre de la Rose-Croix Catholique, du Temple et du Graal \", led by S\u00e2r Jos\u00e9phin P\u00e9ladan. Are we justified in saying that \"S\u00e2r Jos\u00e9phin P\u00e9ladan was not a rosicrucian.\n\"? Yes, no, or maybe? No\n###\nAm\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics. Are we justified in saying that \"Am\u00e9lie Simone Mauresmo won two Grand Slam singles titles at age of 17.\"? Yes, no, or maybe? Maybe\n###\nSc\u00e8nes de ballet is a ballet made by New York City Ballet balletmaster John Taras to Stravinsky's eponymous music from 1944. The premiere took place June 22, 1972, as part of the City Ballet's Stravinsky Festival at the New York State Theater, Lincoln Center. Are we justified in saying that \"Scenes de ballet is the only ballet made by New York City balletmaster John Taras\"? Yes, no, or maybe? Maybe\n###\nThe AIR Charts are the official sales charts for Australian independent music released by Australian owned, independent record labels. 
Presented by AIR, the Australian Independent Record Labels Association, the charts are calculated according to official sales figures provided by the ARIA Charts, which includes legal MP3 download sales. Are we justified in saying that \"The AIR Charts has verified sales data.\"? Yes, no, or maybe? Yes\n###\nNicola or Niccolo Massaro (died 1704) was an Italian painter of the late-Baroque period, active in his native city of Naples. He painted mainly marine vedute and landscapes in the style of his master, Salvatore Rosa. One of his colleagues was Marzio Masturzo. One of his pupils was Gaetano Martoriello, and Massaro's son's Girolamo and Gennaro. Are we justified in saying that \"Salvatore Rosa started painting before Nicola.\"? Yes, no, or maybe?", "doc_id": 639, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31953, 30838, 5175, 26861], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character. Are we justified in saying that \"Adrianne Palicki looked beautiful in the wonder woman costume.\"? Yes, no, or maybe? Maybe\n###\nThe Thebaid ( ; Latin: \"Th\u0113ba\u00efs\") is a Latin epic in 12 books written in dactylic hexameter by Publius Papinius Statius (AD c. 45 \u2013 c. 96). The poem deals with the Theban cycle and treats the assault of the seven champions of Argos against the city of Thebes. Are we justified in saying that \"Statius wrote at least one book\"? Yes, no, or maybe? Yes\n###\nThe Multiwavelength Atlas of Galaxies is a textbook and atlas of 35 well studied galaxies (including our Galaxy) authored by Glen Mackie of the Centre for Astrophysics & Supercomputing, Swinburne University of Technology. It was originally published in 2011 by Cambridge University Press. Are we justified in saying that \"The Multiwavelength Atlas of Galaxies studies more galaxies than any other book.\"? Yes, no, or maybe? Maybe\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups. Are we justified in saying that \"Corn smut is actually a disease of a corn plant, a disease that is eaten in Mexico\"? Yes, no, or maybe? Yes\n###\nMaurice Anthony Foley (9 October 1925 \u2013 8 February 2002) was a British Labour Party politician. He was elected as Member of Parliament (MP) for West Bromwich at a by-election in 1963, and represented the constituency until his resignation from the House of Commons in 1973. His successor in the resulting by-election was the future Speaker of the British House of Commons, Betty Boothroyd. 
Are we justified in saying that \"Maurice Foley was the Speaker of the British house of commons before Betty Boothroyd.\"? Yes, no, or maybe?", "doc_id": 884, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40701, 7036, 4580, 26029], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael Tunn (born 18 January 1974) is an Australian radio announcer and television presenter. He was hired by Australia's national youth station Triple J in 1990 at the age of 17, making him Australia's youngest professional radio presenter at the time. Are we justified in saying that \"Michael Tunn wasn't the youngest professional radio presenter in 1909.\"? Yes, no, or maybe? No\n###\nThe Arboretum Curie, also known as the Arboretum du Col des Trois Soeurs, is a small arboretum located at 1470 metres altitude in the Col des Trois Soeurs near La Panouse, Loz\u00e8re, Languedoc-Roussillon, France. It was created circa 1975 to study conifers suitable for reforestation, and according to Arbez et al., now contains 77 taxa (primarily conifers). Are we justified in saying that \"The Arboretum Curie contains over 77 taxa at an elevation of 1470 metres in the country of France. Its purpose is to research a number of conifers for their possible reforestation properties as of the year 1975.\"? Yes, no, or maybe? Yes\n###\nThe Fondation Prince Pierre was established by Prince Rainier III of Monaco in February 1966 to promote culture and the arts through the creation and the awarding of prizes. Prince Rainier III created the foundation in tribute to his father, Pierre de Polignac a great patron of the arts. Are we justified in saying that \"Prince Rainier III created the foundation in tribute to a great patron of the arts.\"? Yes, no, or maybe? Yes\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\" Are we justified in saying that \"People really like using Flatbush Avenue to get out of queens\n\"? Yes, no, or maybe? Maybe\n###\nCharles Farrell (August 9, 1900 \u2013 May 6, 1990) was an American film actor of the 1920s silent era and into the 1930s, and later a television actor. Farrell is probably best recalled for his onscreen romances with actress Janet Gaynor in more than a dozen films, including \"7th Heaven\", \"Street Angel\", and \"Lucky Star\". Are we justified in saying that \"Farrell and Gaynor were romantically involved.\"? Yes, no, or maybe?", "doc_id": 697, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17163, 31232, 2099, 31480], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Geoffrey Zakarian (born July 25, 1959) is an American Iron Chef, restaurateur, television personality and author. He is the executive chef of several restaurants in New York City, Atlantic City and Miami. He is featured on several television programs on the Food Network, including \"Chopped\" and in 2011, \"The Next Iron Chef\", where he won the right to join \"Iron Chef America\". Are we justified in saying that \"Geoffrey Zakarian was born July 1, 1955\"? Yes, no, or maybe? No\n###\nCity Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle. Are we justified in saying that \"City Mall is a large mall in Jordan that has been open for over a decade. There are many films shown in the cinema here. Some of the films are American.\"? Yes, no, or maybe? Yes\n###\nThe Whitechapel murders were committed in or near the impoverished Whitechapel district in the East End of London between 3 April 1888 and 13 February 1891. At various points some or all of these eleven unsolved murders of women have been ascribed to the notorious unidentified serial killer known as Jack the Ripper. Are we justified in saying that \"Jack the Ripper was famous for robbing people\"? Yes, no, or maybe? No\n###\nAlbert Levitt (March 14, 1887 \u2013 June 18, 1968) was a judge, law professor, attorney, and candidate for political office. While he was a memorable teacher at Washington and Lee University, and as judge of the United States District Court for the Virgin Islands ordered that woman voters must be registered, he later came to hold what some thought were eccentric views on religion. Are we justified in saying that \"The United States influences judicial rulings in the Virgin Islands\"? Yes, no, or maybe? Yes\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide. Are we justified in saying that \"The show was released the year after 2006.\"? Yes, no, or maybe?", "doc_id": 369, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31425, 12540, 8939, 14427], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The shooting of John Crawford III occurred on August 5, 2014. 
Crawford was a 22-year-old African-American man shot to death by Beavercreek police officer Sean Williams, in a Walmart store in Beavercreek, Ohio, near Dayton, while holding a toy BB gun. Are we justified in saying that \"The shooting of John Crawford III did not occur in Dayton, Ohio.\"? Yes, no, or maybe? Yes\n###\n\"Eve\" is an American television sitcom that was broadcast on United Paramount Network (UPN) from September 15, 2003, to May 11, 2006. A total of 66 episodes of \"Eve\" were broadcast over three seasons. Created by Meg DeLoatch, the series follows New York City fashion designer Shelly Williams (Eve) through her relationship with physical therapist J.T. Hunter (Jason George). Are we justified in saying that \"Episodes of Eve aired in the month of December.\"? Yes, no, or maybe? Maybe\n###\nDeliver Us Tour was a concert tour by band Darkest Hour, taking place from late 2007, in support of their fifth studio album \"Deliver Us\" and finishing in December 2008. The tour started shortly after the Undoing Ruin Tour ended, earlier in December 2006. Are we justified in saying that \"Deliver Us Tour was performed in large venues.\"? Yes, no, or maybe? Maybe\n###\nCoraz\u00f3n Valiente (\"Fearless Heart\"), originally known as \"Ca\u00eddas del Cielo\", is a Spanish-language telenovela produced by United States-based television network Telemundo Studios, Miami, featuring an ensemble cast. Adriana Fonseca, Ximena Duque, Jos\u00e9 Luis Res\u00e9ndez and Fabi\u00e1n R\u00edos starred as the main protagonists, with Aylin Mujica and Manuel Landeta starred as the main antagonists. Are we justified in saying that \"Coraz\u00f3n Valiente setting was in a Hospital.\"? Yes, no, or maybe? Maybe\n###\nThe 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament. Are we justified in saying that \"In the year that equals one thousand times two plus ten plus one, the Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium. \"? Yes, no, or maybe?", "doc_id": 529, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41227, 4851, 31612, 31804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \"Chris Gardner conceived a child with a woman before the events of The Pursuit of Happyness.\"? Yes, no, or maybe? Maybe\n###\nJon L. Luther is an American foodservice industry executive. He was the chairman and chief executive officer of Dunkin' Brands. 
Luther is the Chairman of the Board of the Culinary Institute of America and Arby's Restaurant Group, and a director at Six Flags Entertainment Corporation, Wingstop Restaurants, and Tempur Sealy International. Are we justified in saying that \"Jon Luther is the Chairman of the Board of many companies, he may be a shareholder in others as well\"? Yes, no, or maybe? Maybe\n###\nBig Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism. Are we justified in saying that \"The population is now over 5 thousand\"? Yes, no, or maybe? Maybe\n###\nRubicon Drinks Ltd (formerly Rubicon Products Ltd) is a soft drink manufacturer based in Milton Keynes, UK. The company specialises in exotic soft drinks, and produces a variety of different flavours of drink, many of which contain sugar or artificial sweeteners, and which come in both still and sparkling varieties. In the United Kingdom, their drinks are bottled and distributed by A.G. Barr plc. Are we justified in saying that \"Rubicon Drinks is the 2nd largest soda distributor in the UK. \"? Yes, no, or maybe? Maybe\n###\nJoe Fryer is an American journalist and storyteller working for NBC News as a west coast correspondent based at the NBC News West Coast Bureau in Universal City, California. Fryer joined NBC News in 2013 as a part-time correspondent and officially joined NBC News as a full-time correspondent on October 21, 2013. Are we justified in saying that \"ABC News West Coast Bureau is located in Universal City, CA.\"? Yes, no, or maybe?", "doc_id": 42, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32839, 6917, 36712, 33989], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "RAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family. Are we justified in saying that \"The station was renamed less than 100 years ago\"? Yes, no, or maybe? Yes\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances. Are we justified in saying that \"Spaceballs has a cameo appearance by Rudy De Luca\"? Yes, no, or maybe? Yes\n###\nThe Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. 
It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line. Are we justified in saying that \"The Takoma Langley Crossroads Transit Center has always been the largest of its kind.\"? Yes, no, or maybe? Maybe\n###\nThe Program in Creative Writing, more commonly known as the Iowa Writers' Workshop, at the University of Iowa in Iowa City, Iowa, is a much-celebrated graduate-level creative writing program in the United States. Writer Lan Samantha Chang is its director. Graduates earn a Master of Fine Arts (MFA) degree in Creative Writing. Are we justified in saying that \"The Program in Creative Writing is well-renowned.\"? Yes, no, or maybe? Yes\n###\nJames Hagan (21 January 1918 \u2013 26 February 1998), known as Jimmy Hagan, was an English football player and manager born in Washington, County Durham, England. He played between 1938 and 1958 for Sheffield United and once for England. As manager he had his greatest successes with S.L. Benfica in the early 1970s. Are we justified in saying that \"Sheffield United was formed in nineteen hundred twenty one.\"? Yes, no, or maybe?", "doc_id": 430, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20429, 37644, 7066, 14135], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"the Lord's Cricket Ground on 25 June 1983 was filled to maximum capacity.\"? Yes, no, or maybe? Maybe\n###\nSuper Show 6 - Super Junior World Tour Concert Album is Super Junior's sixth live recorded album, released on 6 November 2015. This album contains two CDs with 36 live recordings from the Super Show 6 concerts held on September 19\u201321, 2014 at the Olympic Gymnastics Arena located in Seoul, South Korea. Are we justified in saying that \"the Albums was recorded during the 2014 olympics\"? Yes, no, or maybe? Maybe\n###\nJulian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality. Are we justified in saying that \"Julian Ricardo Marley is a well-known musician. \"? Yes, no, or maybe? Maybe\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. 
The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing. Are we justified in saying that \"The Charter Township of Lansing is only a movie.\"? Yes, no, or maybe? No\n###\nThe Whitechapel murders were committed in or near the impoverished Whitechapel district in the East End of London between 3 April 1888 and 13 February 1891. At various points some or all of these eleven unsolved murders of women have been ascribed to the notorious unidentified serial killer known as Jack the Ripper. Are we justified in saying that \"The last murder committed by Jack the Ripper occurred on February 13th, 1891.\"? Yes, no, or maybe?", "doc_id": 294, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32006, 7621, 37861, 43898], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shilpa Shukla is an Indian theatre, television and Bollywood film actor from Bihar. She is known for her roles in the 2007 sports drama \"Chak De! India\" and the 2013 neo-noir film \"B.A. Pass\", for which she was awarded the Filmfare Critics Award for Best Actress. Are we justified in saying that \"B.A. Pass is an example i the neo-noir genre.\"? Yes, no, or maybe? Yes\n###\nChristopher Seton Abele (born January 28, 1967) is an American businessman and Democratic Party politician. He is the current Milwaukee County Executive. Abele is the son of American businessman John Abele, the co-founder of Boston Scientific. Abele serves as a trustee of the Argosy Foundation, a charitable trust established with an endowment from his father. Are we justified in saying that \"Christopher Seton Abele (born January 28, 1967) is an American businessman and Republic Party politician\"? Yes, no, or maybe? No\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\". Are we justified in saying that \"Greg Lazarus is the pen name of Nigerian husband-and-wife writing duo.\"? Yes, no, or maybe? No\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"He was born in the 5th month of the year\"? Yes, no, or maybe? Yes\n###\nWarrant Officer Kenji Yanagiya (\u67f3\u8c37 \u8b19\u6cbb , Yanagiya Kenji , March 1919 \u2013 February 29, 2008) was a member of the Imperial Japanese Navy's Zero fighter aces who fought the Battle of Solomon Islands in October 1942 \u2013 June 1943. He is best known as the only escort fighter pilot of the Yamamoto mission to survive the war. Are we justified in saying that \"Warrant Officer Kenji Yanagiya did not learn to fly until 1950.\"? 
Yes, no, or maybe?", "doc_id": 402, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18907, 24813, 37729, 26916], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mary Isobel Downer, Lady Downer (13 December 1924 \u2013 14 October 2014) was a prominent South Australian patron, wife of federal MP and high commissioner Sir Alexander \"Alick\" Downer, and mother of Liberal Party leader, Australian Foreign Minister and high commissioner Alexander Downer. Are we justified in saying that \"Mary Isobel Downer, Lady Downer never loved her husband federal MP and high commissioner Sir Alexander \"Alick\" Downer\"? Yes, no, or maybe? Maybe\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. Are we justified in saying that \"Osella is a millionaire.\"? Yes, no, or maybe? Maybe\n###\nDwight Yoakam is an American country music singer-songwriter. Since his debut single, \"Honky Tonk Man\" in 1986, he has released 46 singles, including two \"Billboard\" Hot Country Songs number one singles, as well as 4 number ones in Canada. In addition to having two number one singles in the United States, Yoakam also has thirteen Top 10 singles on the country chart. Are we justified in saying that \" Dwight Yoakam has released more than 18 singles\"? Yes, no, or maybe? Yes\n###\nThe Thebaid ( ; Latin: \"Th\u0113ba\u00efs\") is a Latin epic in 12 books written in dactylic hexameter by Publius Papinius Statius (AD c. 45 \u2013 c. 96). The poem deals with the Theban cycle and treats the assault of the seven champions of Argos against the city of Thebes. Are we justified in saying that \"The poem deals with the Theban cycle.\"? Yes, no, or maybe? Yes\n###\nPaolo Romano, also known as Paolo Tuccone and as Paolo di Mariano di Tuccio Taccone was an Italian early Renaissance sculptor and goldsmith. Giorgio Vasari in his \"Lives of the Most Excellent Painters, Sculptors, and Architects\" recounts that Paolo Romano was a modest man whose sculpture was far superior to that of his boastful contemporary Mino del Reame. Are we justified in saying that \"Paolo Romano and Giorgio Vasari co-authored \"Lives of the Most Excellent Painters, Sculptors, and Architects\" .\"? Yes, no, or maybe?", "doc_id": 573, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35858, 35892, 14325, 93], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits. Are we justified in saying that \"Many fans will buy this box set\"? Yes, no, or maybe? Maybe\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"The Colorado Avalanche is an organization in the NHL.\"? Yes, no, or maybe? Yes\n###\nEuroprop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas. Are we justified in saying that \"Europrop is a solo venture.\"? Yes, no, or maybe? No\n###\nThe Forum Shops at Caesars (also known as simply The Forum) is a major 636000 sqft shopping mall connected to Caesars Palace on the Las Vegas Strip in Las Vegas, Nevada. Measured in terms of sales per square foot, it is the highest grossing mall in the United States. Are we justified in saying that \"The Forum Shops is in Nevada.\"? Yes, no, or maybe? Yes\n###\nThe Last Boy Scout is a 1991 American action comedy film directed by Tony Scott, starring Bruce Willis, Damon Wayans, Chelsea Field, Noble Willingham, Taylor Negron and Danielle Harris. The film was released in the United States on December 13, 1991. Are we justified in saying that \"The Last Boy Scout is a 1991 American action comedy film directed by Tony Scott, starring ONLY Bruce Willis, Damon Wayans, Chelsea Field, Noble Willingham, Taylor Negron and Danielle Harris.\"? Yes, no, or maybe?", "doc_id": 913, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32156, 17948, 35608, 43823], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County. Are we justified in saying that \"It is considered a city\"? Yes, no, or maybe? Yes\n###\n\"Break the World\" is the lead single by alternative rock band Nine Lashes from their third album, \"From Water to War\". It was released on October 29, 2013 by Tooth & Nail Records. The song was the No. 1 \"Billboard\" Christian Rock song on January 25, 2014 chart. 
Are we justified in saying that \"\"Break the World\" was never a popular song, and didn't gather much attention.\"? Yes, no, or maybe? No\n###\nShadowgun Legends is an upcoming first-person shooter video game developed and published by Madfinger Games for Android and iOS devices. It is the 3rd primary installment of the Shadowgun series, a sequel to the original Shadowgun and Shadowgun Deadzone, both being multiple award-winning games from 2011 and 2012 respectively. Are we justified in saying that \"Shadowgun Legends will win many awards\"? Yes, no, or maybe? Maybe\n###\nFarrell Temata ( 1944 \u2013 26 April 2013) was a New Zealand rugby union player and coach. He was a prop who played 44 times for the Waikato provincial rugby union team and later was the side's assistant coach from 1992 to 1994. He was assistant coach of the Chiefs Super Rugby team from 2004 to 2006. Are we justified in saying that \"Farrell Temata gave commands.\"? Yes, no, or maybe? Yes\n###\nPhilips Classics Records was started in the 1980s as the new classics record label for Philips Records. It was successful with artists including Alfred Brendel, Sir John Eliot Gardiner, Sir Neville Marriner and the Academy of St. Martin in the Fields, Mitsuko Uchida, Julian Lloyd Webber, Sir Colin Davis and Andr\u00e9 Rieu. Are we justified in saying that \"It wasn't popular.\"? Yes, no, or maybe?", "doc_id": 951, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31023, 38589, 14697, 1192], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Boxcar Bertha is a 1972 American romantic crime drama film directed by Martin Scorsese. It is a loose adaptation of \"Sister of the Road\", a pseudo-autobiographical account of the fictional character Bertha Thompson, written by Ben L. Reitman. It was Scorsese's second feature film. Are we justified in saying that \"1972 was the year the Dolphins won the Super Bowl\"? Yes, no, or maybe? Maybe\n###\nFraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator. Are we justified in saying that \"Wishart is an expert at analyzing football.\"? Yes, no, or maybe? Maybe\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band. Are we justified in saying that \"What Happens Next had bad members.\"? Yes, no, or maybe? Maybe\n###\nThe Attorney () is a 2013 South Korean courtroom drama film directed and co-written by Yang Woo-suk, in his directorial debut (Yang was previously a film producer and webtoon author). 
With 11,375,954 tickets sold and a revenue of \u20a982.9 billion , \"The Attorney\" became the 8th best-selling Korean film of all time, and the second highest-grossing Korean film of 2013. Are we justified in saying that \"The Attorney is the highest-grossing Korean film of 2013.\"? Yes, no, or maybe? No\n###\nThe Exterminating Angel (Spanish: El \u00e1ngel exterminador ), is a 1962 surrealist film, written and directed by Luis Bu\u00f1uel, starring Silvia Pinal, and produced by her then-husband Gustavo Alatriste. Sharply satirical and allegorical, the film contains a view of human nature suggesting \"mankind harbors savage instincts and unspeakable secrets\". Are we justified in saying that \"Silvia Pinal starred in several films in her acting career.\"? Yes, no, or maybe?", "doc_id": 919, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18398, 16362, 15870, 14844], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A Song to Remember is a 1945 Columbia Pictures Technicolor biographical film which tells a fictionalised life story of Polish pianist and composer Fr\u00e9d\u00e9ric Chopin. Directed by Charles Vidor, the film starred Cornel Wilde (as Chopin), Merle Oberon (as George Sand), Paul Muni (as J\u00f3zef Elsner), Stephen Bekassy (as Franz Liszt), and Nina Foch. Are we justified in saying that \"Cornel Wilde has acted as a musician before.\"? Yes, no, or maybe? Yes\n###\n2009, Year of Us is the third extended play (EP) by South Korean boy group Shinee. It consists of six tracks and it incorporates alternative rock and hip-hop music genres. The digital version of the album was released on October 19, 2009, with a physical release on October 22. The title track, \"Ring Ding Dong\" was released on October 14, 2009 through various music sites. Are we justified in saying that \"Year of Us was released on one day only. \"? Yes, no, or maybe? No\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Chris McKendry has more than one sister\"? Yes, no, or maybe? Maybe\n###\nEdwin John Ellis (1848 \u2013 1916) was a British poet and illustrator. He is now remembered mostly for the three-volume collection of the works of William Blake he edited with W. B. Yeats. It is now criticised, however, for weak scholarship, and preconceptions. Are we justified in saying that \" He is now remembered mostly for the three-volume collection of the works of W. B. Yeats he edited with William Blake.\"? Yes, no, or maybe? No\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. 
The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Destiny was selected for the Academy Awards.\"? Yes, no, or maybe?", "doc_id": 633, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36773, 8544, 5287, 11421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Westbury Senior High School is a ninth-through-12th-grade school of the Westbury Union Free School District (USFD No. 1), the district covering the village of Westbury, New York, USA. Its current building, located on Post Road at Jericho Turnpike in Old Westbury, New York (just north of Westbury Village), reached its 50th anniversary in 2008. Are we justified in saying that \"Westbury Senior High School is a very bad school\"? Yes, no, or maybe? Maybe\n###\nMars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added. Are we justified in saying that \"Sean O'Hagan switched to keyboard after he left\"? Yes, no, or maybe? Maybe\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central. Are we justified in saying that \"The Daily Show doesn't air on Fridays\"? Yes, no, or maybe? Yes\n###\nArthur Tyde is an American software entrepreneur and private investigator based in San Francisco and SE Asia. He has been an advocate for Open Source software since founding the first Linux Users Group in the San Francisco / Silicon Valley Area. (BALUG). Are we justified in saying that \"Arthur Tyde has never spent time in SE Asia.\"? Yes, no, or maybe? No\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system. Are we justified in saying that \"Elizabeth City State University has post graduate programs.\"? Yes, no, or maybe?", "doc_id": 256, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8737, 15460, 21490, 41961], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis. Are we justified in saying that \"Bj\u00f8rn Lomborg has founded other non-profits besides The Copenhagen Consensus Center.\"? Yes, no, or maybe? Maybe\n###\nCecilia Makiwane Hospital (CMH) is a large, provincial, government funded hospital situated in the Mdantsane township of East London, Eastern Cape in South Africa. It is a tertiary teaching hospital and forms part of the East London Hospital Complex with Frere Hospital. It is named after Cecilia Makiwane, the first African woman to become a professional nurse in South Africa. Are we justified in saying that \"Frere hospital is a teaching hospital.\"? Yes, no, or maybe? Maybe\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp. Are we justified in saying that \"sydney population was 4208 in january 1792\"? Yes, no, or maybe? Maybe\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys. Are we justified in saying that \"Scott Morriss is 45 years old.\"? Yes, no, or maybe? No\n###\nBrown University is a private Ivy League research university in Providence, Rhode Island, United States. Founded in 1764 as the College in the English Colony of Rhode Island and Providence Plantations, Brown is the seventh-oldest institution of higher education in the United States and one of the nine colonial colleges chartered before the American Revolution. Are we justified in saying that \"Brown was founded before the All star Yankees game.\"? Yes, no, or maybe?", "doc_id": 700, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43055, 42562, 44258, 464], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. 
Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia. Are we justified in saying that \"There are no members from New York\"? Yes, no, or maybe? Yes\n###\nCape Verde is a volcanic archipelago situated above an oceanic rise that puts the base of the islands 2 km above the rest of the seafloor. Cape Verde has been identified as a hotspot and it has been argued that a mantle plume might be underneath it causing the volcanic activity and associated geothermal anomalies. Are we justified in saying that \"capo verde activity is due to a mantle plume\"? Yes, no, or maybe? Maybe\n###\nDerailed is a 2005 American crime thriller film based on the novel of the same name by James Siegel. The film is directed by Mikael H\u00e5fstr\u00f6m and stars Clive Owen, Jennifer Aniston, Vincent Cassel, Giancarlo Esposito, David Morrissey, RZA and Xzibit. This was also the first film to be released by The Weinstein Company in the United States. The film is set in Chicago. Are we justified in saying that \"The novel is set in Chicago.\"? Yes, no, or maybe? Maybe\n###\nThe Blackwater Lightship is a 2004 Hallmark Hall of Fame TV movie adaptation of the novel \"The Blackwater Lightship\" by Colm T\u00f3ib\u00edn. It aired on CBS on February 4, 2004. The movie stars Angela Lansbury, Gina McKee, Sam Robards, Dianne Wiest, and Keith McErlean. Lansbury received an Emmy nomination for it in 2004. Are we justified in saying that \"\"The Blackwater Lightship\" movie was an adaption of the novel of the same name.\"? Yes, no, or maybe? Yes\n###\nPublic Domain Day is an observance of when copyrights expire and works enter into the public domain. This legal transition of copyright works into the public domain usually happens every year on 1 January based on the individual copyright laws of each country. Are we justified in saying that \"Public Domain Day always happens on January 1st.\"? Yes, no, or maybe?", "doc_id": 5, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23291, 36550, 6076, 13272], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marwin Javier Gonz\u00e1lez (born March 14, 1989) is a Venezuelan professional baseball infielder with the Houston Astros of Major League Baseball (MLB). Primarily a shortstop, Gonz\u00e1lez has appeared at every position except for pitcher and catcher for the Astros. Are we justified in saying that \"Gonz\u00e1lez played baseball in high school.\"? Yes, no, or maybe? Maybe\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse. Are we justified in saying that \"Once Upon a Time premiered over 6 years ago\"? Yes, no, or maybe? Yes\n###\nThe Good Night is a 2007 romantic comedy film written and directed by Jake Paltrow. 
The film stars his sister Gwyneth Paltrow, Pen\u00e9lope Cruz, Martin Freeman, Danny DeVito, Simon Pegg and others. The movie takes place in London and New York City, where a former pop star (Freeman) who now writes commercial jingles for a living experiences a mid-life crisis. Are we justified in saying that \"Actor Martin Freeman plays the lead role in a movie that features him alongside Danny DeVito.\"? Yes, no, or maybe? Yes\n###\nWooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton). Are we justified in saying that \"Wooden Leather received no airplay\"? Yes, no, or maybe? Maybe\n###\nThe 2017 Congolese police decapitation attacks occurred on 23 March 2017 in the DR Congo. About 40 police officers were ambushed then decapitated. Six police officers were released. All of the surviving police spoke the Tshiluba language. The Kamwina Nsapu terrorist group attacked the police convoy. Are we justified in saying that \"The Kamwina Nsapu group spoke the Tshiluba language.\"? Yes, no, or maybe?", "doc_id": 500, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5411, 35938, 1567, 28625], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Living on the Edge (full title Living on the Edge, the Poetic Works of G\u00e9rald Leblanc also known by its French language title L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc) is a 2005 documentary film by Canadian director of Acadian origin Rodrigue Jean. In this documentary, Rodrigue Jean pays tribute to his Acadian roots, focussing on the poetry of G\u00e9rald Leblanc. Are we justified in saying that \"G\u00e9rald Leblanc wrote poetry.\"? Yes, no, or maybe? Yes\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. Are we justified in saying that \"Adriano Correia Claro has been divorced\"? Yes, no, or maybe? Maybe\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe. Are we justified in saying that \"The show was not an animation\"? Yes, no, or maybe? No\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality. 
Are we justified in saying that \"Marcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first and only regular voice of Minnie Mouse. \"? Yes, no, or maybe? Maybe\n###\nThe 2013 Canadian Olympic Curling Trials were held from December 1 to 8 at the MTS Centre in Winnipeg, Manitoba. The event is also known and advertised as the Tim Hortons Roar of the Rings. The winners of the men's and women's events were chosen to represent Canada at the 2014 Winter Olympics. Are we justified in saying that \"Tim Horton did not donate any amount of money to curling trials\"? Yes, no, or maybe?", "doc_id": 690, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5688, 9428, 14819, 44864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\". Are we justified in saying that \"Daraar is an Indian family entertainment movie.\"? Yes, no, or maybe? No\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh. Are we justified in saying that \"Telephone Shilpa Sangstha failed to launch any devices following 2011\"? Yes, no, or maybe? Maybe\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area. Are we justified in saying that \"The Pikes Peak Center for the Performing Arts is a concert auditorium in Colorado Springs, Colorado. \"? Yes, no, or maybe? Yes\n###\nBride of Chucky is a 1998 American supernatural comedy slasher film, the fourth installment of the \"Child's Play\" franchise and sequel to 1991's \"Child's Play 3\". The film is written by Don Mancini and directed by Ronny Yu, and stars Jennifer Tilly (who plays and voices the title character Tiffany) and Brad Dourif (who voices Chucky), as well as John Ritter, Katherine Heigl and Nick Stabile. Are we justified in saying that \"There were 4 installments of the \"Child's Play\" franchise before Bride of Chucky in 1998\"? Yes, no, or maybe? No\n###\nThe J.J. Deal and Son Carriage Factory was the largest factory built in Jonesville, Michigan. It is the only 19th century factory remaining in the City. It is located at 117 West Street. On August 1, 2012, the building was added to the National Register of Historic Places. Are we justified in saying that \"The J.J. 
Deal and Son Carriage Factory is the only building from the 1700s that stands to this day in Jonesville, Michigan.\"? Yes, no, or maybe?", "doc_id": 771, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34739, 4449, 1659, 6565], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tight is the debut album by the American rock band Mindless Self Indulgence. The album was originally released on April 20, 1999 through Uppity Cracker Recording Group. After having been out of print for many years, the album was reissued as Tighter on April 26, 2011 through The End Records. The reissue features updated artwork and packaging, 12 previously unreleased tracks, and a bonus DVD. Are we justified in saying that \"Tight was the first album for Mindless Self Indulgence.\"? Yes, no, or maybe? Yes\n###\nAllium campanulatum is a species of wild onion known by the common name dusky onion or Sierra onion. This is a flowering plant native to the western United States from southeastern Washington and northern Oregon to southern California, and western Nevada. The dusky onion grows in foothills and mountains, especially in dry areas, such as chaparral habitats. Are we justified in saying that \"Allium campanulatum is found more frequently in southeastern Washington than in northern Oregon.\"? Yes, no, or maybe? Maybe\n###\nA Qualified Person Responsible for Pharmacovigilance, or QPPV, is an individual named by a pharmaceutical company as the main person responsible for ensuring that the company (the product's Marketing Authorisation Holder or MAH) meets its legal obligations for the monitoring of the safety of a medicinal product on the market. Are we justified in saying that \"A QPPV monitors safety\"? Yes, no, or maybe? Yes\n###\nBoy Meets Girl is an ITV comedy-drama television miniseries starring Rachael Stirling and Martin Freeman. In the show, Danny Reed (Freeman) is struck by lightning. When he wakes up from the attack, he is inside the body of a woman, fashion journalist Veronica Burton (Stirling). Written by David Allison, the series began on 1 May 2009. Are we justified in saying that \"Boy Meets Girl won an emmy\"? Yes, no, or maybe? Maybe\n###\n\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982). Are we justified in saying that \"King of the Jungle was released in Korea.\"? Yes, no, or maybe?", "doc_id": 412, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19318, 43966, 18265, 8312], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Roderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins. Are we justified in saying that \"Higgins was born the last day of the month.\"? Yes, no, or maybe? Yes\n###\n\"September\" is the third and final single from Daughtry's second album \"Leave This Town\" (2009). This song was co-written by Chris Daughtry and Josh Steely. It was first released June 1, 2010 through RCA Records. The mid-tempo ballad is inspired by Chris's childhood memories growing up with his brother in a small town in North Carolina. Are we justified in saying that \"September is also a month.\"? Yes, no, or maybe? Yes\n###\nJersey Boys is a 2014 American biographical musical drama film directed and produced by Clint Eastwood based on the Tony Award winning jukebox musical of the same name. The film tells the story of the musical group The Four Seasons. The film was released in the United States on June 20, 2014. The film received mixed reviews from critics and grossed $67 million worldwide. Are we justified in saying that \"The musical never won a Tony Award.\"? Yes, no, or maybe? No\n###\nAvani Modi is an Indian model and film actress, a well-known face in Indian movies and theatre plays in Gujarati theatre She made her Bollywood debut in Madhur Bhandarkar's drama film \"Calendar Girls\", which is scheduled to release on 25 September 2015. The movie is based upon the story of five girls and their journey as an annual calendar girl. Are we justified in saying that \"Avani Modi is an Indian model and film actress that is mostly known for her acting.\"? Yes, no, or maybe? Maybe\n###\nManos Krystalis (Greek: \u039c\u03ac\u03bd\u03bf\u03c2 \u039a\u03c1\u03c5\u03c3\u03c4\u03ac\u03bb\u03b7\u03c2; born November 30, 1989) is a Greek actor, performer, model and businessman who became particularly popular in Greece as a model while he worked as a runway model for a top Greek modeling agency since 2004. He participated in several international film projects as a lead main actor. Are we justified in saying that \"Manos Krystalis was a lead main actor before 2004.\"? Yes, no, or maybe?", "doc_id": 980, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6018, 9509, 20012, 11315], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robin Weaver is an English actress well known for playing the role of recurring character Pamela Cooper in the E4 sitcom \"The Inbetweeners\" and its feature-length films, \"The Inbetweeners Movie\" and \"The Inbetweeners 2\". She has also appeared in several TV commercials. 
Are we justified in saying that \"The TV commercials starring Robin Weaver had the character Pamela Cooper. \"? Yes, no, or maybe? Maybe\n###\nO'Sullivan Army Heliport (ICAO: KCSL,\u00a0FAA LID: CSL) is a U.S. Army heliport at Camp San Luis Obispo in San Luis Obispo County, California, United States. It is located just off California State Route 1, northwest of the city of San Luis Obispo, about halfway between it and Morro Bay. O'Sullivan AHP has one helipad designated H1 with a 2,430 by 75\u00a0ft (741 by 23\u00a0m) asphalt surface. Are we justified in saying that \"The Heliport can hold 20 helicopters.\"? Yes, no, or maybe? Maybe\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Weltenbrand is a darkwave band from Liechtenstein formed in 1990\"? Yes, no, or maybe? No\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy. Are we justified in saying that \"Jacob Murphy was born 24 February 1992.\"? Yes, no, or maybe? Maybe\n###\nLarry Ruvo (1946) is the VP/GM of Southern Wine and Spirits of Nevada. He is a wine expert and philanthropist. He founded the Keep Memory Alive foundation and The Lou Ruvo Brain Institute. He serves on the American Gaming Association Board of Directors and is a member of the Gaming Hall of Fame (2005). He is married and has three daughters. Are we justified in saying that \"Larry Ruvo only has female children\"? Yes, no, or maybe?", "doc_id": 48, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43191, 17462, 9235, 7554], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Suntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles. Are we justified in saying that \"Sadhu Kokila is not part of the main cast of Suntaragaali.\"? Yes, no, or maybe? Yes\n###\nMetal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed. 
Are we justified in saying that \"Metal Gear Solid was released for Nintendo.\"? Yes, no, or maybe? No\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"The film was released in the summer \"? Yes, no, or maybe? Yes\n###\nThe Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise). Are we justified in saying that \"Ivor and Josiane Wood were married in France\"? Yes, no, or maybe? Maybe\n###\nMarwin Javier Gonz\u00e1lez (born March 14, 1989) is a Venezuelan professional baseball infielder with the Houston Astros of Major League Baseball (MLB). Primarily a shortstop, Gonz\u00e1lez has appeared at every position except for pitcher and catcher for the Astros. Are we justified in saying that \"He is in his twenties.\"? Yes, no, or maybe?", "doc_id": 869, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11990, 25164, 31462, 78], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi. Are we justified in saying that \"Aatank Hi Aatank is a tv show.\"? Yes, no, or maybe? No\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km . Are we justified in saying that \"Jiaozhou Bay Bridge holds a world record.\"? Yes, no, or maybe? Yes\n###\nThe 1941 Cabo San Lucas hurricane is considered one of the worst tropical cyclones on record to affect Cabo San Lucas. The hurricane was first reported on September\u00a08 off the coast of Mexico. It slowly moved northwestward while intensifying. After peaking in intensity, it entered the Gulf of California, and weakened rapidly. It dissipated on September\u00a013. Are we justified in saying that \"The hurricane weakened rapidly after entering the Gulf of California\"? Yes, no, or maybe? Yes\n###\n\"Trap Queen\" is the debut single by American rapper Fetty Wap from his self-titled debut album (2015). 
Following its online premiere in March 2014, it was released independently on April 22, 2014 before being re-released in conjunction with 300 Entertainment on December 15, 2014. The song was well received by critics who praised the vocals and production for being cheery and infectious. Are we justified in saying that \"The song was enjoyed by critics.\"? Yes, no, or maybe? Yes\n###\nHoang Anh Gia Lai \u2013 Arsenal JMG Academy is a football academy in Pleiku, Gia Lai Province, Tay Nguyen of Vietnam. The academy is a built as cooperation between Arsenal Football Club, JMG Academy and the Vietnamese privately owned Hoang Anh Gia Lai Corporation. This football academy is the first one in Vietnam so far. It is also a feeder club to Hoang Anh Gia Lai in the V-League. Are we justified in saying that \"Hoang Anh Gia Lai is in the northern hemisphere\"? Yes, no, or maybe?", "doc_id": 431, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30085, 28488, 9289, 19802], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "McColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers. Are we justified in saying that \"McColo was a small firm.\"? Yes, no, or maybe? Maybe\n###\nJeffrey B. Miller, AA, BS, MPA, was the former commissioner of the Pennsylvania State Police. Miller, a native of Harrisburg, Pennsylvania, served in that position from March 24, 2003, after being confirmed by the Pennsylvania State Senate, until August 8, 2008. Are we justified in saying that \"Jeffrey B. Miller was a Harrisburg, Pennsylvania, native. \"? Yes, no, or maybe? Yes\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. Are we justified in saying that \"Keystone has never been called by another name.\"? Yes, no, or maybe? Maybe\n###\nRobert Louis Boozer (April 26, 1937 \u2013 May 19, 2012) was an American professional basketball player in the National Basketball Association (NBA). He won a gold medal in the 1960 Summer Olympics and won an NBA Championship as a member of the Milwaukee Bucks in 1971. Are we justified in saying that \"Robert Louis Boozer won a gold medal in the 1960 Summer Olympics and won and NBA Championship in 1971 and regretted them both.\"? Yes, no, or maybe? Maybe\n###\nPunjab Control of Organised Crime Act, (PCOCA) is law to be enacted by Punjab state in India to combat organised crime. 
It is in process of approval as the Punjab Cabinet has yet not given its approval on account of few reservations about various clauses of the Act.The Act is designed on the pattern of Maharashtra Control of Organised Crime Act enacted in 1999. Are we justified in saying that \"Punjab Control of Organised Crime Act has no impact as of yet, as it has not yet passed all of the legislative hurdles required for act to be implemented\"? Yes, no, or maybe?", "doc_id": 23, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21503, 6814, 7796, 43631], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records. Are we justified in saying that \"Tafoya will move on to create more hip hop collectives across the pacific north west.\"? Yes, no, or maybe? Maybe\n###\n\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child. Are we justified in saying that \"The overall mood of the story was joyous\"? Yes, no, or maybe? Maybe\n###\nEditing In the Mi(d)st is a ballet made by Miriam Mahdaviani to Oliver Knussen's \"The Way to Castle Yonder\" and excerpts from his \"Music for a Puppet Court\" and Aaron Jay Kernis' \"Overture in Feet and Meters\". The premiere took place June 21, 2002, as part of New York City Ballet's Diamond Project V at the New York State Theater, Lincoln Center. Are we justified in saying that \"The ballet was first viewed in the summer of 2002.\"? Yes, no, or maybe? Yes\n###\nLori-Jane Powell (born November 8, 1971) is a retired Canadian racquetball player from Prince Albert, Saskatchewan. Powell was Canadian Champion five times: thrice in singles and twice in doubles. She was forced to retire from competition in 2006 due to a right knee injury. Are we justified in saying that \"Lori-Jane Powell was born in the eleventh month of the year.\"? Yes, no, or maybe? Yes\n###\nDickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority. Are we justified in saying that \"You cannot land a private jet at Dickinson Theodore Roosevelt Regional Airport.\"? 
Yes, no, or maybe?", "doc_id": 217, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3194, 10990, 42284, 35308], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ryan Potter (born September 12, 1995) is an American actor and martial artist. Beginning his career as a professional actor at the age of 15, Potter is perhaps best known for his starring role as Mike Fukanaga in \"Supah Ninjas\" and for voicing Hiro Hamada in \"Big Hero 6\" (2014). Are we justified in saying that \"Ryan Potter (born September 12, 1998) is an American actor and martial artist.\"? Yes, no, or maybe? No\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area. Are we justified in saying that \"Colorado is not in Canada.\"? Yes, no, or maybe? Yes\n###\nThe American Textile History Museum (ATHM), located in Lowell, Massachusetts, was founded as the Merrimack Valley Textile Museum (MVTM) in North Andover, Massachusetts in 1960 by Caroline Stevens Rogers. ATHM told America\u2019s story through the art, science, and history of textiles. In June 2016, the museum closed. Are we justified in saying that \"The American Textile History Museum is in the New England region.\"? Yes, no, or maybe? Yes\n###\nMarcellite Wall (n\u00e9e Garner; July 3, 1910 \u2013 July 26, 1993) was an American artist and voice actress. She is most remembered as the first regular voice of Minnie Mouse during her time working at Walt Disney Productions and has been partially credited with defining Minnie's personality. Are we justified in saying that \"Wall is not remembered.\"? Yes, no, or maybe? No\n###\n\"Treme\" is an American television drama series created by David Simon and Eric Overmyer. It premiered on HBO on April 11, 2010. The series follows the interconnected lives of a group of New Orleanians in the wake of Hurricane Katrina. Episode titles are primarily taken from a blues or jazz song. The series concluded on December 29, 2013, after four seasons and 36 episodes. Are we justified in saying that \"The show had 8 episodes per season, typically.\"? Yes, no, or maybe?", "doc_id": 121, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6492, 36322, 36452, 44375], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. 
Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"The 2002 Indian vice-presidential election was held in August.\"? Yes, no, or maybe? Yes\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event. Are we justified in saying that \"It was the 220th edition of the tournament and was held from 5 October through 13 October 1997.\"? Yes, no, or maybe? No\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage. Are we justified in saying that \"KDMD covers local news events.\"? Yes, no, or maybe? Maybe\n###\nCapital Place Office Tower is a skyscraper at Jalan Jenderal Gatot Subroto in South Jakarta, Indonesia. The tower is part of Capital Place complex, which also occupies by Four Seasons Hotel . The office tower is a 215.1 meter tall, has 48 floors above & 6 floors below the ground. Are we justified in saying that \"The Four Seasons Hotel is taller than the Capital Place Office Tower.\"? Yes, no, or maybe? Maybe\n###\nFor Those Who Think Young is a 1964 beach party film shot in Techniscope, directed by Leslie H. Martinson and featuring James Darren, Pamela Tiffin, Paul Lynde, Tina Louise, Bob Denver, Nancy Sinatra, Robert Middleton, Ellen Burstyn (billed as Ellen McRae), Claudia Martin and Woody Woodbury. Are we justified in saying that \"For Those Who Think Young was shot on Techniscope beach as a 1964 party film directed by Leslie Martinson.\"? Yes, no, or maybe?", "doc_id": 150, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38934, 26689, 21162, 28255], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Youth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class. Are we justified in saying that \"Youth in Guatemala mostly come from single parent families\"? Yes, no, or maybe? 
Maybe\n###\nThe National List (Italian: \"Lista Nazionale\" ) also known as \"Listone\" (literally \"Big List\") was a Fascist and nationalist coalition of political parties in Italy put together for the 1924 general election, and led by Benito Mussolini, Prime Minister of Italy and leader of the National Fascist Party. Are we justified in saying that \"There was a general election in 1924\"? Yes, no, or maybe? Yes\n###\nStephen R. \"Steve\" Bissette (born March 14, 1955) is an American comics artist, editor, and publisher with a focus on the horror genre. He is known for working with writer Alan Moore and inker John Totleben on the DC comic book \"Swamp Thing\" in the 1980s. Are we justified in saying that \"Steve Bissette is also known for other comic book projects.\"? Yes, no, or maybe? Maybe\n###\nThe Oakland Athletics' 1985 season involved the A's finishing 4th in the American League West with a record of 77 wins and 85 losses. While the Athletics' on-field performance continued to disappoint, the debut of slugger Jose Canseco gave fans a measure of hope. Are we justified in saying that \"In 1985 the Oakland A's were the 4th best team in the American league\"? Yes, no, or maybe? Yes\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis. Are we justified in saying that \"Escape from Suburbia: Beyond the American Dream was written right after The End of Suburbia\"? Yes, no, or maybe?", "doc_id": 520, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24262, 2716, 27975, 10355], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups. Are we justified in saying that \"Mohamed Naufer once scored more than 5 goals in one match\"? Yes, no, or maybe? Maybe\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician. Are we justified in saying that \"Amor a la Mexicana is the fourth studio album by Thalia\"? Yes, no, or maybe? No\n###\nDr. Donald N. Sills was a Baptist minister and one of the founders of George Wythe College, and previous chairman of the George Wythe Foundation Board of Trustees. He served as the first president of George Wythe College (now known as George Wythe University), and was succeeded by Oliver DeMille. 
Are we justified in saying that \"Dr. Donald N. Sills was 58 when he found the george wythe college\"? Yes, no, or maybe? Maybe\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"Zuikaku was sunk during the Battle of Leyte Gulf, but not before helping to bring the United States into the Pacific War.\"? Yes, no, or maybe? Yes\n###\nThe Metal Men are a group of superheroes that appear in DC Comics. The characters first appeared in \"Showcase\" #37 and were created by writer Robert Kanigher and penciller Ross Andru. Debuting in the Silver Age of Comic Books, the characters have appeared in comic books and other \"DC Comics\"-related products such as animated television series, clothing, figurines and trading cards. Are we justified in saying that \"Ross Andru created the look of the Metal Men superheroes that appear in DC Comics. \"? Yes, no, or maybe?", "doc_id": 90, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42862, 28650, 6141, 16881], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Hyundai Xcent is an A-segment sedan by Hyundai Motor Company. Based on the Hyundai Grand i10, the Xcent is manufactured by Hyundai Motor India Limited in Chennai. It made its debut on February 4, 2014, three days ahead of its world premiere at the Auto Expo 2014. Are we justified in saying that \"The Hyundai Xcent has been produced for 6 years.\"? Yes, no, or maybe? No\n###\nStannis Baratheon is a fictional character in the \"A Song of Ice and Fire\" series of epic fantasy novels by American author George R. R. Martin, and its television adaptation \"Game of Thrones\". He is the second son of Steffon Baratheon, the lord of Storm's End, and his wife Lady Cassana Estermont, and brother to Robert and Renly. Are we justified in saying that \"Stannis Baratheon is the older brother to Robert and Renly.\"? Yes, no, or maybe? Maybe\n###\nDarrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include: Are we justified in saying that \"Darrell Abbott was only known by one nick\"? Yes, no, or maybe? No\n###\nClay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan. Are we justified in saying that \"Clay County has a soccer team.\"? Yes, no, or maybe? 
Maybe\n###\nCan't Touch Us Now is the eleventh studio album by the British band Madness, released on their Lucky 7 Records label through Universal Music Catalogue (UMC) on 28 October 2016. The album marked the return of founder member Mark Bedford but the departure of Cathal Smyth (Chas Smash). Are we justified in saying that \"Can't Touch Us Now was released within the past 10,000 days.\"? Yes, no, or maybe?", "doc_id": 265, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32635, 19175, 31299, 33454], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation. Are we justified in saying that \"The 89th Medium Tank Battalion was first used in winter\"? Yes, no, or maybe? No\n###\nThe Flag of the City of Scarborough, Ontario was officially dedicated on August 19, 1969, by then-Mayor Albert Campbell at a special ceremony in Thomson Memorial Park. It was designed by local painter Doris McCarthy (1910\u20132010) in the spring of 1968, who was presented with the idea by her friend Albert Campbell. Are we justified in saying that \"The Flag of the City of Scarborough was officially dedicated in 1969.\"? Yes, no, or maybe? Yes\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons. Are we justified in saying that \"Jules Buisseret won The Grand Prix des Fronti\u00e8res.\"? Yes, no, or maybe? Maybe\n###\nThe Takahashi River (\u9ad8\u6881\u5ddd , Takahashi-gawa ) is a Class A major river in the western part of Okayama Prefecture. It acts as the main drainage for the Takahashi River Drainage System, and is one of the three main drainage rivers in Okayama Prefecture (the others being the Yoshii River and the Asahi River). Are we justified in saying that \"The River is a Class A minor river.\"? Yes, no, or maybe? No\n###\nNantwich Town Football Club is a semi-professional football club based in Nantwich, Cheshire, England. The club was founded in 1884 and is nicknamed \"The Dabbers\", a reference to the town's tanning industry. The club is currently a member of the Northern Premier League Premier Division, the seventh tier in the English football league system, with home matches played at the Weaver Stadium. Are we justified in saying that \"There are seven tiers in the English football league system.\"? 
Yes, no, or maybe?", "doc_id": 695, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40292, 43103, 44075, 37502], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart. Are we justified in saying that \"\"Timeless: Begins\" peaked at #8 on the Gaon album chart in 2012.\"? Yes, no, or maybe? Yes\n###\nGwinnett County Public Schools is a school district operating in Gwinnett County, Georgia, United States. GCPS is the largest school system in Georgia, with 139 schools and an estimated enrollment of 178,000 students for the 2016-2017 year. GCPS is estimated to be the 14th largest school district in the U.S. The district has its headquarters in an unincorporated area near Suwanee. Are we justified in saying that \"Gwinnett County Public Schools has people who can smell.\"? Yes, no, or maybe? Yes\n###\nVampires appear throughout Stephen King's fictional multiverse. They appear in the novels \"'Salem's Lot\", \"\", \"\", and \"\"; the short stories \"One for the Road\", \"The Night Flier\", \"Popsy\", and \"The Little Sisters of Eluria\"; and are mentioned in a number of other stories. Marvel Comics' \"The Dark Tower: End-World Almanac\" includes a detailed entry on their categorization. Are we justified in saying that \"Stephen King mentions his previous works in the newer things he writes.\"? Yes, no, or maybe? Maybe\n###\nWanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001. Are we justified in saying that \"Wanker Records is a sucessful record label \"? Yes, no, or maybe? Maybe\n###\nChristelyn Karazin is an American writer, columnist, and blogger on the subject of interracial dating, particularly black women dating outside their race. She hosts the blog \"Beyond Black & White\" and has written for \"Woman's Day\", \"Ebony\", \"Jet\", and Reuters. Karazin attended Loyola Marymount University, where she wrote for \"The Los Angeles Loyolan\". Are we justified in saying that \"Christelyn Karazin has written for ten magazines.\"? Yes, no, or maybe?", "doc_id": 212, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18770, 13285, 38395, 39594], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990). Are we justified in saying that \"Zale Dalen is a film and television director. He is not proud of his film the hounds of Notre Dame\"? Yes, no, or maybe? Maybe\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region. Are we justified in saying that \"ABC Western Victoria was sometimes hard to hear.\"? Yes, no, or maybe? Maybe\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage. Are we justified in saying that \"KDMD was not always owned by Ketchikan Television LLC.\"? Yes, no, or maybe? Maybe\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"Nashville West was covered by aerosmith.\"? Yes, no, or maybe? Maybe\n###\nThe 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title. Are we justified in saying that \"The 2011 Sudirman Cup was held more than 6667 days ago.\"? Yes, no, or maybe?", "doc_id": 486, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35802, 24986, 42734, 26385], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Leventhorpe Academy is a mixed, 11-19 secondary school and sixth form in the historic market town of Sawbridgeworth, Hertfordshire. The school became a business and Enterprise Academy in August 2011. 
The intake at age 11 is drawn mainly from the pleasant and prosperous towns of Sawbridgeworth and Bishop's Stortford and from the surrounding villages. Are we justified in saying that \"Leventhorpe Academy receives children at age 11.\"? Yes, no, or maybe? Yes\n###\nThe 1999 IAAF Grand Prix Final was the fifteenth edition of the season-ending competition for the IAAF Grand Prix track and field circuit, organised by the International Association of Athletics Federations. It was held on 11 September at the Olympic Stadium in Munich, Germany. Are we justified in saying that \"The IAFF Grand Prix Final in 1999 was held in Munich, Germany at the Olympic Stadium, and the track and field circuits season ending competition was coming to a close that day, a year still away from the sixteenth edition on that 11 September day.\"? Yes, no, or maybe? Yes\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums). Are we justified in saying that \"The Asteroids Galaxy Tour could only perform live with a five-piece\"? Yes, no, or maybe? Yes\n###\nForever the Moment () is a 2008 South Korean film. It is a fictionalized account of the South Korea women's handball team which competed in the 2004 Summer Olympics. The Korean title translates as \"The Best Moment in Our Lives,\" and it is believed to be the first film that revolves around the sport of handball. Are we justified in saying that \"Forever the Moment is a very wel lthough tof film\"? Yes, no, or maybe? Maybe\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"Allen Weiner has taught civics at Stanford Law.\"? Yes, no, or maybe?", "doc_id": 545, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9724, 42258, 43120, 32334], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Last Exorcism Part II is a 2013 American supernatural drama horror film co-written and directed by Ed Gass-Donnelly. It stars Ashley Bell, Julia Garner, Spencer Treat Clark, David Jensen, Tarra Riggs, Louis Herthum, and Muse Watson. It is a sequel to 2010's \"The Last Exorcism\", and released on March 1, 2013. Are we justified in saying that \"The Last Exorcism Part II became a cult classic\"? Yes, no, or maybe? Maybe\n###\nTadpoles is the third album by the Bonzo Dog Band. It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\". 
Are we justified in saying that \"Tadpoles won an oscar\"? Yes, no, or maybe? Maybe\n###\nSvensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord. Are we justified in saying that \"The next celebration in after 1941 was held in 1943.\"? Yes, no, or maybe? Yes\n###\nLarry Ruvo (1946) is the VP/GM of Southern Wine and Spirits of Nevada. He is a wine expert and philanthropist. He founded the Keep Memory Alive foundation and The Lou Ruvo Brain Institute. He serves on the American Gaming Association Board of Directors and is a member of the Gaming Hall of Fame (2005). He is married and has three daughters. Are we justified in saying that \"Larry Ruvo has never drunk a beer.\"? Yes, no, or maybe? Maybe\n###\nPrincess Masako Takeda (\u6052\u4e45\u738b\u5983\u660c\u5b50\u5185\u89aa\u738b , Tsunehisa \u014chi Masako naishinn\u014d ) , born Masako, Princess Tsune (\u5e38\u5bae\u660c\u5b50\u5185\u89aa\u738b , Tsune-no-miya Masako Naishinn\u014d , 30 September 1888 \u2013 8 March 1940) , was the tenth child and sixth daughter of Emperor Meiji of Japan and one of his consorts, Sono Sachiko. Are we justified in saying that \"Princess Takeda had five older sisters.\"? Yes, no, or maybe?", "doc_id": 599, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25349, 30389, 42869, 42309], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Demoniac were a heavy metal band from New Zealand formed in Wellington in 1993 by singer and bass player Lindsay Dawson, guitarist Sam Totman and Drummer Steve Francis. They later moved to London, UK. Three of the members went on to form DragonForce. Their rather unusual musical style has often been labeled as \"blackened power metal\". Are we justified in saying that \"Demoniac were formed over 10 years ago\"? Yes, no, or maybe? Yes\n###\nThe Market towns of Telemark and Aust-Agder counties (Norwegian: \"Kj\u00f8pstedene i Telemark og Aust-Agder fylker\" ) was an electoral district for parliamentary elections in Norway. It comprised the market towns (Norwegian: \"kj\u00f8psteder\" ) of Brevik, Krager\u00f8, Notodden, Porsgrunn and Skien in Telemark county and Arendal, Grimstad and Ris\u00f8r in Aust-Agder county. Are we justified in saying that \"Telemark has a population over 5000\"? Yes, no, or maybe? Maybe\n###\nAnastasija Sevastova (born 13 April 1990) is a professional tennis player from Latvia. Having retired in 2013 due to recurring injuries, Sevastova returned to competition in 2015 and became known for her campaign at the 2016 US Open, where she defeated third-seeded Garbi\u00f1e Muguruza as well as Johanna Konta en route to her first ever Grand Slam quarterfinal. Are we justified in saying that \"Seveastova passed away in 2016.\"? Yes, no, or maybe? 
No\n###\nWonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character. Are we justified in saying that \"NBC did not show a Wonder Woman show.\"? Yes, no, or maybe? Yes\n###\nThe Deputy Assistant to the President for National Security Affairs, also known as the Deputy National Security Advisor, is a member of the Executive Office of the President of the United States and the United States National Security Council, serving as deputy to the President's National Security Advisor. Are we justified in saying that \"The Deputy Assistant to the President for National Security Affairs, also known as the Deputy National Security Advisor, is a member of the Oval Office of the President of the United States.\"? Yes, no, or maybe?", "doc_id": 167, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41873, 37077, 26082, 28325], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "New Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"V. S. Reid is intelligent.\"? Yes, no, or maybe? Maybe\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit. Are we justified in saying that \"Sandstone can be found near the top of a mountain in the Spring Mountain range.\"? Yes, no, or maybe? Yes\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars. Are we justified in saying that \"X X X X is a Japanese beer brand.\"? Yes, no, or maybe? No\n###\nGlenn Martin Christopher Francis Quinn (May 28, 1970 \u2013 December 3, 2002) was an Irish actor in television and film, known for playing Mark Healy in the American sitcom \"Roseanne\", and Doyle, a half-demon, on \"Angel\", a spin-off series of \"Buffy the Vampire Slayer\". Are we justified in saying that \"Glenn Martin Christopher Francis Quinn is an Irish actor living in America.\"? Yes, no, or maybe? Maybe\n###\nHoward Culver (June 4, 1918 - August 4, 1984) was an American radio and television actor, best known as hotel clerk Howie Uzzell during the entire run of TV's \"Gunsmoke\". 
On radio he starred in the title role of the Western adventure series \"Straight Arrow\", which aired on Mutual from May 6, 1948 to June 21, 1951. Are we justified in saying that \"His radio show was on the air for over 3 years.\"? Yes, no, or maybe?", "doc_id": 410, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23968, 41307, 41910, 4960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ralph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Malone sat on the bench often while playing for the Cleveland Browns.\"? Yes, no, or maybe? Maybe\n###\nThe second series of the British television sketch comedy premiered on BBC Two on 21 July 2005. This series included six episodes with the concluding episode broadcast on 25 August 2005. A Christmas Special followed the second series and was screened on BBC Two on 20 December 2005. Are we justified in saying that \"The series ended on 21 July 2005\"? Yes, no, or maybe? No\n###\nGreivis Josu\u00e9 V\u00e1squez Rodr\u00edguez (born January 16, 1987) is a Venezuelan professional basketball player who last played for the Brooklyn Nets of the National Basketball Association (NBA). He was drafted in 2010 after a U.S. college career with the University of Maryland men's basketball team. V\u00e1squez finished second on the Terrapins' all-time scoring list, with 2,171 career points. Are we justified in saying that \"Greivis Josu\u00e9 V\u00e1squez Rodr\u00edguez was born in the winter of 1987.\"? Yes, no, or maybe? Yes\n###\nLex Talionis Fraternitas, Inc. Sodalitas Ducum Futurorum is an exclusive fraternal organization of Filipino jurists, legal practitioners and law students founded on September 29, 1969 at the San Beda College of Law. A chapter in the Ateneo de Davao University School of Law was established in 1974. In 1983, the Securities and Exchange Commission granted the incorporation of the fraternity. Are we justified in saying that \"Filipino jurists are members in the Lex Talionis Fraternitas, Inc.\"? Yes, no, or maybe? Yes\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street. Are we justified in saying that \"A civil parish and village are the same thing.\"? 
Yes, no, or maybe?", "doc_id": 65, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18700, 5755, 13478, 29216], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Lonely Londoners is a 1956 novel by Tamil Trinidadian author Samuel Selvon. Its publication marked the first literary work focusing on poor, working-class blacks in the beat writer tradition following the enactment of the British Nationality Act 1948. Are we justified in saying that \"The Lonely Londoners is a book\"? Yes, no, or maybe? Yes\n###\nAmandil is a fictional character from J.R.R. Tolkien's Middle-earth legendarium. Amandil was a Lord of And\u00fani\u00eb, succeeding his father N\u00famendil upon his death. Amandil is most noted for being the father of Elendil, founder of the N\u00famen\u00f3rean Realms in Exile. Are we justified in saying that \"Amandil is related to Elendil.\"? Yes, no, or maybe? Yes\n###\nA symphonic song cycle can either refer to a symphony composed of separate movements played consecutively or to a set of symphonic works linked by theme, common composer, or common conductor. A symphonic cycle should not be confused with the closely related song cycle. Are we justified in saying that \"A symphonic cycle is not the same as a song cycle.\"? Yes, no, or maybe? Yes\n###\nThe 1989 European Cup Winners' Cup Final was a football match contested between Barcelona of Spain and Sampdoria of Italy. It was the final match of the 1988\u201389 European Cup Winners' Cup and the 29th European Cup Winners' Cup Final. The final was held at Wankdorf Stadium in Bern, Switzerland, on 10 May 1989. Barcelona won the match 2\u20130 thanks to goals by Julio Salinas and Luis L\u00f3pez Rekarte. Are we justified in saying that \"This was Sampdoria's first loss in the cup.\"? Yes, no, or maybe? Maybe\n###\nSusan Peters (born Suzanne Carnahan; July 3, 1921 \u2013 October 23, 1952) was an American film, stage, and television actress. After studying acting with Austrian theatre director Max Reinhardt, she appeared in several uncredited bit parts before earning a minor supporting part in \"Santa Fe Trail\" (1940). Her supporting role in \"Tish\" led to Peters signing a contract with Metro-Goldwyn-Mayer in 1942. Are we justified in saying that \"Peters was alive in 1953.\"? Yes, no, or maybe?", "doc_id": 651, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17217, 13438, 25142, 42686], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "State Route 204 (SR 204) is part of Maine's system of numbered state highways, located in Hancock County. 
It runs from State Route 3 in Trenton, passing State Route 184 in Lamoine, and ending at the intersection with Seal Point and Marlboro Beach roads. The route is 6.5 mi long. Are we justified in saying that \"State Route 204 is not fun to drive on\"? Yes, no, or maybe? Maybe\n###\nCastle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel. Are we justified in saying that \"The 1981 game Castle Wolfenstein also spawned a later follow-up \"? Yes, no, or maybe? Yes\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961. Are we justified in saying that \"After the severing of relations in 1961, the Cuban Embassy in Washington, DC was shuttered.\"? Yes, no, or maybe? Maybe\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name. Are we justified in saying that \"Benoit is the female form of the name.\"? Yes, no, or maybe? No\n###\nMineral County Airport (FAA LID: 9S4) is a county-owned public-use airport located two nautical miles (3.7 km) southeast of the central business district of Superior, a town in Mineral County, Montana, United States. According to the FAA's National Plan of Integrated Airport Systems for 2011-2015, it is categorized as a \"general aviation\" facility. Are we justified in saying that \"Mineral County Airport (FAA LID: 9S4) will neither nor always be a county-owned public-use airport.\"? Yes, no, or maybe?", "doc_id": 995, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5389, 42690, 25753, 32566], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Living on the Edge (full title Living on the Edge, the Poetic Works of G\u00e9rald Leblanc also known by its French language title L'extr\u00eame fronti\u00e8re, l'oeuvre po\u00e9tique de G\u00e9rald Leblanc) is a 2005 documentary film by Canadian director of Acadian origin Rodrigue Jean. In this documentary, Rodrigue Jean pays tribute to his Acadian roots, focussing on the poetry of G\u00e9rald Leblanc. Are we justified in saying that \"Rodrigue Jean is an Acadian Canadian. \"? Yes, no, or maybe? Yes\n###\nHim & Her is a British television sitcom about a lazy twenty-something couple: Steve and Becky, who live in Walthamstow, London. It was first broadcast in the United Kingdom on BBC Three on 6 September 2010. It is written by Stefan Golaszewski and stars Russell Tovey and Sarah Solemani. 
The theme tune is the song \"Boom Bang-a-Bang\" by Lulu. Are we justified in saying that \"Steve and Becky each had 3 jobs\"? Yes, no, or maybe? Maybe\n###\nCooper Manning (born March 6, 1974) is the host for the show \"The Manning Hour\" for Fox Sports. He is the oldest son of former professional football quarterback Archie Manning, and the older brother of former professional football quarterback Peyton Manning and current New York Giants quarterback Eli Manning. Are we justified in saying that \"Cooper Manning hates his brothers\"? Yes, no, or maybe? Maybe\n###\nResil B. Mojares is a Filipino ambassador, historian, and critic of Philippine literature. He has a Ph.D. in Literature from the University of the Philippines, Diliman. A retired Professor at the University of San Carlos (USC) in Cebu City, he was a founding director (1975\u201396) of USC's Cebuano Studies Center, a pioneering local studies center in the Philippines. Are we justified in saying that \"Resil B. Mojares will run for President in 2020\"? Yes, no, or maybe? Maybe\n###\nBrian Wardle (born October 9, 1979) is an American college basketball coach and the current men's basketball coach at Bradley University. He was an assistant at Marquette from 2003\u20132005 and UW-Green Bay from 2005\u20132010. After the 2009-2010 season, Wardle was named head coach at UW-Green Bay. Upon his hiring, Wardle became the youngest head coach in NCAA Division I basketball. Are we justified in saying that \"Brian Wardle has never told a group of people what to do \"? Yes, no, or maybe?", "doc_id": 93, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43981, 18646, 19301, 41064], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"Lloyd Cole is from a country whose capital is London.\"? Yes, no, or maybe? Yes\n###\nJake McGing (born 22 May 1994) is a professional Australian footballer who plays as a central defender for Central Coast Mariners FC in the A-League. On 11 August 2015, he made his professional senior debut for Central Coast Mariners FC in the 2015 FFA Cup against Wellington Phoenix FC. Are we justified in saying that \"Jake McGing (born 12 May 1994) is a professional Australian footballer \"? Yes, no, or maybe? No\n###\nChristopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records. 
Are we justified in saying that \"Sleep has thought about retiring.\"? Yes, no, or maybe? Maybe\n###\nKing Mongkut's University of Technology Thonburi (KMUTT or KMUT Thonburi, ) is an engineering and technology academy in Thailand, focusing on teaching, researching as well as serving industry. It is located in Thung Khru District, Bangkok and was founded on April 18, 1960. Are we justified in saying that \"The teachers at King Mongkut's University of Technology Thonburi teach teachers.\"? Yes, no, or maybe? Maybe\n###\nMax & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes. Are we justified in saying that \"Max & Shred has never aired outside the U.S. or Canada.\"? Yes, no, or maybe?", "doc_id": 63, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41939, 6339, 39796, 33349], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity. Are we justified in saying that \"Leveraging productivity and creativity are two of Kdan Mobile Software's functions.\"? Yes, no, or maybe? Yes\n###\nAllium campanulatum is a species of wild onion known by the common name dusky onion or Sierra onion. This is a flowering plant native to the western United States from southeastern Washington and northern Oregon to southern California, and western Nevada. The dusky onion grows in foothills and mountains, especially in dry areas, such as chaparral habitats. Are we justified in saying that \"Alliam campanulatum is a variety of potato.\"? Yes, no, or maybe? No\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"Every member of Nashville West was happy the band broke up.\"? Yes, no, or maybe? Maybe\n###\nThe 2007 Porsche Tennis Grand Prix was a tennis tournament played on indoor hard courts. It was the 30th year of Porsche Tennis Grand Prix, and was part of the Tier II Series of the 2007 WTA Tour. 
It took place at the Porsche Arena in Stuttgart, Germany, from October 1 through October 7, 2007 Are we justified in saying that \"The 2008 Porsche Tennis Grand Prix took place at the Porsche Arena in Stuttgart, Germany, from October 2 through October 9, 2008.\"? Yes, no, or maybe? Maybe\n###\nMidnight Tides is the fifth volume of Canadian author Steven Erikson's epic fantasy series, the \"Malazan Book of the Fallen\". Although it is part of the larger series, it has only limited references to the previous books. However, it is not a stand-alone volume as the events of the books \"Reaper's Gale\" and \"Dust of Dreams\" follow on from it. Are we justified in saying that \"Steven Erikson has written different series.\"? Yes, no, or maybe?", "doc_id": 926, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36242, 18967, 14770, 28602], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Tinker Field was torn down\"? Yes, no, or maybe? Yes\n###\nBaker College Preparatory High School (also known as Baker College Prep) is a public four-year charter high school located in the South Chicago neighborhood on the far south side of Chicago, Illinois. It is operated by the Noble Network of Charter Schools. It shares its campus with Bowen High School. Baker is named for civil and human rights activist Ella Baker. Are we justified in saying that \"Bowen High School is also know as Bowen College Prep a two year private that is named after an activist Baker. \"? Yes, no, or maybe? No\n###\nThe Zora Neale Hurston House was the home of author Zora Neale Hurston in Fort Pierce, Florida. It was originally located at 1734 School Court but was moved north 500 feet in 1995 to 1734 Avenue L to allow for expansion of Lincoln Park Academy, the school at which Hurston taught. On December 4, 1991, it was designated as a U.S. National Historic Landmark. Are we justified in saying that \"The Zora Neale Hurston House was moved North by flatbed truck.\"? Yes, no, or maybe? Maybe\n###\nBlood Red Shoes are an alternative rock duo from Brighton, England consisting of Laura-Mary Carter and Steven Ansell. They have released four full-length albums, \"Box of Secrets\" (2008), \"Fire Like This\" (2010), \"In Time to Voices\" (2012), and \"Blood Red Shoes\" (2014) as well as several EPs and a number of singles. In 2014, they founded their own label, Jazz Life. Are we justified in saying that \"Blood Red Shoes did better after founding their own labe.\"? Yes, no, or maybe? 
Maybe\n###\nCupid Car Club, also known as Cupid Car Club M.P., was a short-lived American post-hardcore band consisting of Ian Svenonius on vocals, James Canty on drums, Steve Gamboa on guitar (all of which were previously members of Nation of Ulysses and later went on to form The Make-Up), and Kim Thompson (of The Delta 72) on bass and vocals. Are we justified in saying that \"Cupid Car Club was a hardcore band\"? Yes, no, or maybe?", "doc_id": 124, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4134, 36215, 13865, 8252], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2016 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the second edition of the tournament which was part of the 2016 ATP Challenger Tour. It took place in Happy Valley, Australia between 2\u201310 January 2016. Are we justified in saying that \"All professional tennis tournaments are played on grass courts.\"? Yes, no, or maybe? No\n###\nAlexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia. Are we justified in saying that \"Alexander Ivanovich Bastrykin had his finaces santioned\"? Yes, no, or maybe? Maybe\n###\nThe Ligier JS17 was a Formula One car designed by G\u00e9rard Ducarouge and Michel Beaujon for use by the Ligier team during the season. Powered by a Talbot-badged Matra V12, the JS17 was driven to two Grand Prix wins by Jacques Laffite. It was updated to JS17B specification for the season until it was replaced later that year by the JS19. Are we justified in saying that \"The Ligier JS17 is a person.\"? Yes, no, or maybe? No\n###\nCross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart. Are we justified in saying that \"The album Timeless: Begins peaked at the eighth position on the Gaon Weekly Album chart\"? Yes, no, or maybe? Yes\n###\nHearts of Stone is the fifth studio album by American rock band Stoneground, released in 1978 on Warner Bros. Produced by Bob Gaudio, it marked Stoneground's return to a major label, having released their previous album, \"Flat Out\" (1976), on their own label. \"Prove It\" was released as the first single from \"Hearts of Stone\". Are we justified in saying that \"Stoneground released their first studio album in 1970 \"? 
Yes, no, or maybe?", "doc_id": 460, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43763, 19358, 35951, 32571], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Third Option is Vince Flynn's third novel, and the 2nd to feature Mitch Rapp, an American agent that works for the CIA as an operative for a covert counterterrorism unit called the \"Orion Team\". The first in the Mitch Rapp series American Assassin, was written later, but was a prologue to Kill Shot. Are we justified in saying that \"Flynn writes about real institutions among other topics\"? Yes, no, or maybe? Yes\n###\nDuncan Ley is an Australian playwright, actor, theatrical producer and director who has also written for Sydney's Motion Picture Company. His play \"In Cold Light\" is currently in the production phase as it is turned into a feature film by Peter Slee Productions. Are we justified in saying that \"Duncan Ley is an Australian playwright, actor, theatrical producer and director who has also written for Sydney's Motion Picture Company. whose only play is \"In Cold Light\"\"? Yes, no, or maybe? Maybe\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. Are we justified in saying that \"Adriano Correia Claro will switch teams\"? Yes, no, or maybe? Maybe\n###\nThe 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns. Are we justified in saying that \"The German 88mm cannon was found to be very difficult to use.\"? Yes, no, or maybe? Maybe\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) . Are we justified in saying that \"Sabanci University was open for enrollment in 1994.\"? Yes, no, or maybe?", "doc_id": 379, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41192, 38341, 21063, 34810], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Emigrante del Mundo\" is the debut single of Lucenzo. It was released in France initially in 2007 and a second time in 2010 after the success of the kuduro music promoted by Lucenzo's new hits. It also appears in the debut album of Lucenzo of the same title \"Emigrante del Mundo\". Are we justified in saying that \"\"Emigrante del Mundo\" was released on two separate occasions. \"? Yes, no, or maybe? Yes\n###\nAlexander Ivanovich Bastrykin (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0418\u0432\u0430\u0301\u043d\u043e\u0432\u0438\u0447 \u0411\u0430\u0441\u0442\u0440\u044b\u0301\u043a\u0438\u043d , born August 27, 1953 in Pskov) is a Russian official, former First Deputy Prosecutor General of Russia, and former Chairman of The Investigative Committee of the Prosecutor General's Office. Since January 15, 2011, he is the Head of The Investigative Committee of Russia. Are we justified in saying that \"Pskov is located next to a Russian mountain range.\"? Yes, no, or maybe? Maybe\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"Most of the house is brick\"? Yes, no, or maybe? Maybe\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery. Are we justified in saying that \"The Anchor Bankside is a comic book store in London\"? Yes, no, or maybe? No\n###\nRefried beans (Spanish: \"frijoles refritos\") is a dish of cooked and mashed beans and is a traditional staple of Mexican and Tex-Mex cuisine, although each cuisine has a different approach when making the dish. Refried beans are also popular in many other Latin American countries. Are we justified in saying that \"Refried beans is also known as frijoles refritis in Spanish\"? Yes, no, or maybe?", "doc_id": 609, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12815, 17960, 11815, 10680], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Edward Annis (born February 2, 1980), is a Canadian professional wrestler, better known by his ring name Teddy Hart. He is currently working on the American independent circuit. He has also wrestled for AAA, the short-lived Wrestling Society X, Jersey All Pro Wrestling, and Dragon Gate USA. 
He operates a wrestling school in Edmonton. He is the son of Georgia Hart and wrestler B.J. Annis. Are we justified in saying that \"Edward Annis has wrestled in 4 different wrestling associations\"? Yes, no, or maybe? Yes\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp. Are we justified in saying that \"The Town was named after a person.\"? Yes, no, or maybe? Yes\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer. Are we justified in saying that \"G\u00f6tz Freiherr von Houwald met Celine Dion.\"? Yes, no, or maybe? Maybe\n###\nCamping 3 is a 2016 French comedy film directed by Fabien Onteniente. It is a sequel to the 2010 film \"Camping 2\". The film was a box office success, having grossed over US$24.2 million in France, becoming the second highest-grossing domestic film in 2016, with 3,228,313 tickets sold. Are we justified in saying that \"The film Camping 3 sold over 3 million tickets in France in 2016.\"? Yes, no, or maybe? Yes\n###\nLee Scott Wolosky (born July 17, 1968) is the former U.S. Special Envoy for Guantanamo Closure. He served under the last three U.S. Presidents in significant national security positions, and was on leave as a Partner at Boies, Schiller & Flexner LLP. On July 14 2016, President Obama accorded Wolosky the personal rank of Ambassador. Are we justified in saying that \"Wolosky was 48 years old when he was appointed Ambassador.\"? Yes, no, or maybe?", "doc_id": 931, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12035, 44815, 17145, 34864], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmhurst is a residential neighborhood in the southernmost part of Oakland, California. Originally a separate town, it was annexed by Oakland in 1909, and today is considered part of East Oakland. It lies at an elevation of 39 feet (12 m). It contains the Eastmont Town Center. Are we justified in saying that \"Elmhurst lies at an elevation of 13 meters.\"? Yes, no, or maybe? No\n###\nDr. Edward Vivian Scobie (1918 \u2013 14 November 1996) was a Dominican-born journalist, magazine publisher and historian. He is best known for his research into the black history of Western Europe and his 1972 seminal book \"Black Britannia: A History of Blacks in Britain\". Are we justified in saying that \"Dr. Edward Vivian Scobie was a journalist and eventually became a politician \"? Yes, no, or maybe? 
No\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Witchhunt Records is a record company that has produced albums in the darkwave genre of music\"? Yes, no, or maybe? Yes\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport. Are we justified in saying that \"the big planes operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe\"? Yes, no, or maybe? Maybe\n###\nThe 2009 British Speedway Championship was the 2009 edition of the British Speedway Championship. The Final took place on 20 May at Wimborne Road in Poole, England. The Championship was won by Chris Harris, who beat Edward Kennett, Tai Woffinden and Lee Richardson in the final heat. It was the second time Harris had won the title. Are we justified in saying that \"Chris Harris won his first British Speedway Championship beating Edward Kennett for the second time. \"? Yes, no, or maybe?", "doc_id": 103, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38087, 3114, 5812, 12632], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis. Are we justified in saying that \"Jose Celestino Mutis. gets visits from people around the world.\"? Yes, no, or maybe? Maybe\n###\nGiovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg. Are we justified in saying that \"Giovanni Ferrero is an Italian soccer fan. \"? Yes, no, or maybe? Maybe\n###\nJusticia umbrosa (Brazilian plume, yellow jacobinia; syn. \"Adhatoda umbrosa\" Ness, and \"Justicia aurea\" Schltdl.) is an ornamental shrub native of Cerrado vegetation of Brazil. This plant may be propagated by herbaceous stem cutting, and it can usually get to 1,50 - 2,50 m tall. They flourish in the shade, and will not do well if overwatered. 
Are we justified in saying that \"Justicia umbrosa thrives best in direct sunlight conditions\"? Yes, no, or maybe? No\n###\nThe Raid on Le Havre was a two-day naval bombardment of the French port of Le Havre early in July 1759 by Royal Navy forces under Rear-Admiral George Rodney during the Seven Years' War, which succeeded in its aim of destroying many of the invasion barges being gathered there for the planned French invasion of Great Britain. Are we justified in saying that \"France planned to invade Great Britain by sea\"? Yes, no, or maybe? Yes\n###\nClay County is a county located in the U.S. state of Tennessee. As of the 2010 census, the population was 7,861. Its county seat and only incorporated city is Celina. Clay County is named in honor of American statesman Henry Clay, member of the United States Senate from Kentucky and United States Secretary of State in the 19th century. Its current mayor is Dale Reagan. Are we justified in saying that \"As of the census after two thousand nine, the population was 7,861. Its county seat and only incorporated city is Celina.\"? Yes, no, or maybe?", "doc_id": 707, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [724, 13841, 27077, 25673], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rudyard Kipling's The Jungle Book is a 1994 live-action American adventure film co-written and directed by Stephen Sommers, produced by Edward S. Feldman and Raju Patel, from a story by Ronald Yanover and Mark Geldman. It is the second film adaptation by The Walt Disney Company of the Mowgli stories from \"The Jungle Book\" and \"The Second Jungle Book\" by Rudyard Kipling. Are we justified in saying that \"Rudyard Kipling's The Jungle Book debuted in the year preceding 1995\"? Yes, no, or maybe? Yes\n###\nNatalya Eduardovna Andrejchenko, Honored Artist of the RSFSR (1984) (Russian: \u041d\u0430\u0442\u0430\u0301\u043b\u044c\u044f \u042d\u0434\u0443\u0430\u0301\u0440\u0434\u043e\u0432\u043d\u0430 \u0410\u043d\u0434\u0440\u0435\u0301\u0439\u0447\u0435\u043d\u043a\u043e ; born May 3, 1956) is an actress. Her most famous roles include the title character in \"Mary Poppins, Goodbye\" and Lyuba in \"Wartime Romance\". Are we justified in saying that \"Natalya Andrejchenko was Mary Poppins.\"? Yes, no, or maybe? Yes\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Nathan MacKinnon was born in the year 1995. \"? Yes, no, or maybe? Yes\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums). 
Are we justified in saying that \"Emperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. After splitting up in 2001, they reunited from 2005 to 2007 for a few festival dates and brief US tours, and again in 2013 to 2014. The group was founded by Ihsahn and Samoth.\"? Yes, no, or maybe? Yes\n###\nBlack Wind, White Land is a 1993 documentary film, researched and produced by the founders of the Chernobyl Children's Project International and explores the Chernobyl nuclear disaster of 1986 and its consequences for the handicapped development of the people in Belarus, Russia and Ukraine. The film was directed by Gene Kerrigan and produced by Ali Hewson, the wife of U2's singer Bono. Are we justified in saying that \"Bono sang in Black Wind, White Land.\"? Yes, no, or maybe?", "doc_id": 858, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40745, 7470, 13094, 24671], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Cover on My Heart\" is a pop ballad performed by Guy Sebastian and is the third single from his third album \"Closer to the Sun\". Sebastian announced that this song was the album's third single in April 2007. The single was released on 28 July 2007 in Australia, set by his record label Sony BMG Australia. Sebastian performed the song on various programmes such as \"Sunrise\" and \"Rove Live\". Are we justified in saying that \"Guy Sebastian is a nice guy.\"? Yes, no, or maybe? Maybe\n###\nTarget Field is a baseball park in the historic warehouse (or North Loop) district of downtown Minneapolis. It is the home ballpark of the Minnesota Twins, the state's Major League Baseball (MLB) franchise. It also has served as the occasional home of Minnesota Golden Gophers baseball, and other local and regional baseball events. Are we justified in saying that \"The Minnesota Twins have never lost a game at Target Field\"? Yes, no, or maybe? Maybe\n###\nClear Hearts Grey Flowers is the second full-length and final album by Jack Off Jill. Produced by Chris Vrenna of Nine Inch Nails/Tweaker, it was released in July 2000 on the now-defunct label Risk Records. After \"Clear Hearts, Grey Flowers\" the band formally split up and moved on to establish other projects. Are we justified in saying that \"Risk Records released Clear Hearts Grey Flowers before they went defunct.\"? Yes, no, or maybe? Yes\n###\nKalavu (Kannada: \u0c95\u0cb3\u0cb5\u0cc1) is a 2013 Kannada movie based on Dr KY Narayanaswamy's novel of the same title. The movie is the directorial debut of Ravi M who has worked with the production of the Hollywood film Inferno . Two French films, \"Blue Mountains\" and \"Child in Pondicherry\", launched his career in art direction. The film stars Umashree, Kari Subbu, Hulagappa Kattimani and others. Are we justified in saying that \"\"Blue Mountains\" was filmed before \"Child in Pondicherry\".\"? Yes, no, or maybe? Maybe\n###\nBudapest Gypsy Symphony Orchestra is a Hungarian symphony orchestra of Romani (Gypsy) musicians. 
It emphasizes works by composers inspired by Hungarian folk music including Johannes Brahms, Vittorio Monti, Piotr Tcha\u00efkovski, Johann Strauss and Johann Strauss II. The orchestra has been performing for Are we justified in saying that \"Budapest Gypsy Symphony Orchestra consists of ethnically diverse musicians\"? Yes, no, or maybe?", "doc_id": 753, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30321, 31805, 15437, 15999], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "City Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle. Are we justified in saying that \"City Mall is the largest mall in Jordan\"? Yes, no, or maybe? Maybe\n###\nIntervilles was a French comedy game show first broadcast in 1962. The show was aired since July 17, 1962 on RTF, then on ORTF. After stopping for 20 years, it reappeared on July 10, 1985 on FR3, then from July 4, 1986 to September 6, 1999 on TF1. France 3 aired the show since July 5, 2004, then France 3 from June 23, 2006 to August 26, 2009. Are we justified in saying that \"Intervilles is a popular French comedy game show\"? Yes, no, or maybe? Maybe\n###\nThe Last Exorcism Part II is a 2013 American supernatural drama horror film co-written and directed by Ed Gass-Donnelly. It stars Ashley Bell, Julia Garner, Spencer Treat Clark, David Jensen, Tarra Riggs, Louis Herthum, and Muse Watson. It is a sequel to 2010's \"The Last Exorcism\", and released on March 1, 2013. Are we justified in saying that \"There was only a writer for this movie\"? Yes, no, or maybe? No\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey. Are we justified in saying that \"Ballymena United Football Club consists of ethnically diverse players\"? Yes, no, or maybe? Maybe\n###\nNico is the third studio album by American rock band Blind Melon, released in 1996 by Capitol Records. The album was released after lead singer Shannon Hoon's cocaine overdose that resulted in his death in 1995. The album was named for his daughter, Nico Blue, and the proceeds arising from album sales were placed in a college trust for her. It features Are we justified in saying that \"Lead singer Shannon Hoon was dead in 2000.\"? Yes, no, or maybe?", "doc_id": 398, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17028, 28374, 29148, 15396], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hundreds of ancient stone religious monuments lie on the island of Java. Known as \"candi\" in Indonesian, they date from the early classical period of Javanese civilisation, beginning in the first part of the 8th century CE and ending after 900 CE. The majority were built between 780 CE and 860 CE, even though the civilisation that created them existed for many centuries. Are we justified in saying that \"All of the monuments were made after 900 CE.\"? Yes, no, or maybe? No\n###\nGirilal Jain (1924 \u2013 19 July 1993), was an Indian journalist. He served as the editor of The Times of India from 1978 till 1988. He was sympathetic to Hindu nationalism and authored books on the subject, the best known of which, \"The Hindu Phenomenon\", was published posthumously. The Government of India awarded him the civilian honour of the Padma Bhushan in 1989. Are we justified in saying that \"The Times of India covered the award ceremony for the Padma Bhushan.\"? Yes, no, or maybe? Maybe\n###\nCranborne Priory was a priory in Cranborne in Dorset, England. The priory church survives as Cranborne's parish church, the Church of St\u00a0Mary and St\u00a0Bartholomew, and is a Grade I listed building, with parts of the building dating back to the 12th century. Are we justified in saying that \" Cranborne Priory is the church that I belong to of the 12th century with the building, which leads into the parish\"? Yes, no, or maybe? Maybe\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley. Are we justified in saying that \"Colorz of Rage featured hip-hop star Redman and the singer Cheryl as well as other stars.\"? Yes, no, or maybe? Maybe\n###\nSamuel Bronston (Samuel Bronshtein, March 26, 1908, Bessarabia \u2013 January 12, 1994, Sacramento, California) was a Bessarabian-born American film producer, film director, and a nephew of socialist revolutionary figure, Leon Trotsky. He was also the petitioner in a U.S. Supreme Court case that set a major precedent for perjury prosecutions when it overturned his conviction. Are we justified in saying that \"Samuel Bronston was not born in the United States.\"? Yes, no, or maybe?", "doc_id": 395, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10837, 32737, 33818, 43981], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Anlo Youth Organisation (also known as the Anlo Youth Association) was a political party that existed in the Gold Coast and later Ghana. It campaigned for the Ewe people under British rule to stay within Ghana after independence. 
It ended by merging with other parties to form a united opposition to the Convention People's Party. Are we justified in saying that \"The Anlo Youth Association's views were different than those of the Convention People's Party.\"? Yes, no, or maybe? Yes\n###\nSonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m). Are we justified in saying that \"Sonnette is home to many people.\"? Yes, no, or maybe? Maybe\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles. Are we justified in saying that \"Sadhu Kokila is one of the main cast in Suntaragaali \"? Yes, no, or maybe? No\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"Lloyd Cole is from a country whose capital is London.\"? Yes, no, or maybe? Yes\n###\nThe Australia national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the South Africa national cricket team. The tournament was won by England. Australia were captained by Syd Gregory. Are we justified in saying that \"The England national cricket team lost every game of the 1912 Triangular Tournament\"? Yes, no, or maybe?", "doc_id": 115, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34070, 789, 21925, 12824], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Boonie Bears III is a 2016 Chinese animated adventure comedy film directed by Ding Liang and Lin Yongchang. The film is the third installment in the \"Boonie Bears\" film series based on the animated series of the same name, following the 2015 film \"\". It was released in China on January 16, 2016. It will be followed by \"\", scheduled for release in 2017. Are we justified in saying that \"Boonie Bears III debut at number 1\"? Yes, no, or maybe? Maybe\n###\nA Moment to Remember (; lit. \"Eraser in My Head\") is a 2004 South Korean film based on the 2001 Japanese television drama \"Pure Soul\". 
It stars Son Ye-jin and Jung Woo-sung and follows the theme of discovery in a relationship and the burdens of loss caused by Alzheimer's disease. Are we justified in saying that \"In the film, the characters played by Son Ye-jin and Jung Woo-sung break up\"? Yes, no, or maybe? Maybe\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc. Are we justified in saying that \"Ranila has houses.\"? Yes, no, or maybe? Yes\n###\nThe Bowling Green Falcons men's basketball team is the basketball team that represent Bowling Green State University in Bowling Green, Ohio. The school's team currently competes in the Mid-American Conference. The team last played in the NCAA Division I Men's Basketball Tournament in 1968. The Falcons are now coached by Michael Huger, their 17th head coach. Are we justified in saying that \"Bowling Green was founded in 1950\"? Yes, no, or maybe? Maybe\n###\nBabes in Arms is a 1937 musical comedy with music by Richard Rodgers, lyrics by Lorenz Hart and book by Rodgers and Hart. It concerns a group of small-town Long Island teenagers who put on a show to avoid being sent to a work farm by the town sheriff when their actor parents go on the road for five months in an effort to earn some money by reviving vaudeville. Are we justified in saying that \"Rodgers and Hart wrote a funny musical \"? Yes, no, or maybe?", "doc_id": 470, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31000, 27639, 10809, 5706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement. Are we justified in saying that \"Princess Caroline of Gloucester died from measles\"? Yes, no, or maybe? Maybe\n###\nKimberly Ane Peirce (born September 8, 1967) is an American feature film director, best known for her debut feature film, \"Boys Don't Cry\" (1999). Her second feature, \"Stop-Loss\", was released by Paramount Pictures in 2008. Her most recent feature film, \"Carrie\", was released on October 18, 2013. Are we justified in saying that \"Kimberly Ane Peirce has directed more than one film\"? Yes, no, or maybe? Yes\n###\nRuth Pryor (1906-2001) was a Chicago ballet dancer and instructor, and the first American ballerina to dance the role of the Swan Queen in Swan Lake, in 1930. She was known for \"her feat of whirling thirty-six times a minute on her toes,\" according to the Purple Parrot of Northwestern University. 
Are we justified in saying that \"Pryor began dancing at age 6\"? Yes, no, or maybe? Maybe\n###\nThe Emami Kolkata Open ATP Challenger Tour (formerly known as State Bank of India ATP Challenger Tour) is a professional tennis tournament played on outdoor hard courts. It is currently part of the Association of Tennis Professionals (ATP) Challenger Tour. It is held annually at the Bengal Tennis Association Stadium in Kolkata, India since 2014. Are we justified in saying that \"The Emami Kolkata Open ATP Challenger Tour is mostly known as the State Bank of India ATP Challenger Tour.\"? Yes, no, or maybe? Maybe\n###\nThe 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title. Are we justified in saying that \"The tournament was postponed due to a terror threat\"? Yes, no, or maybe?", "doc_id": 181, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11101, 23085, 40301, 10316], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Orange, Red, Yellow is a 1961 Color Field painting by Mark Rothko. It sold at Christie's for $86.882.500 on May 8, 2012. The seller was the estate of David Pincus and the sale price represents a record nominal price for Post-War / contemporary art at public auction and for Rothko works in general. Are we justified in saying that \"If you look at Orange, Red, Yellow long enough you will go blind.\"? Yes, no, or maybe? Maybe\n###\nJustin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL). Are we justified in saying that \"He was a popular player\"? Yes, no, or maybe? Maybe\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944. Are we justified in saying that \"Tippet was a prolific recording artist before meeting Holst.\"? Yes, no, or maybe? Maybe\n###\nKXST is a radio station licensed to North Las Vegas, Nevada, broadcasting to the Las Vegas, Nevada area on 1140 AM. The station is owned by CBS Radio, and broadcasts a sports talk format as part of the CBS Sports Radio network. The station's studios are located in the unincorporated Clark County area of Spring Valley, while its transmitter is near Nellis Air Force Base. Are we justified in saying that \"kxst broadcasts a sports talk format\"? Yes, no, or maybe? 
Yes\n###\nDave Dennis (born 20 January 1986 in Sydney) is a national representative rugby union footballer who plays professionally for the Exeter Chiefs He was educated at Richmond High School in Sydney, when he played in the Australian Schoolboys Rugby team in 2004. His primary position is blindside flanker. He can also play No.8. Are we justified in saying that \"Dave Dennis spent most of his life in New Zealand\"? Yes, no, or maybe?", "doc_id": 962, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27370, 409, 24386, 10469], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "WJMF-LP is a low-power television station in Jackson, Mississippi. The station, which currently operates on Channel 6, is owned by Rainey Radio. The station currently acts as a radio station broadcasting a Oldies & Classic Hits format as \"EZ 87.7\", taking advantage of that station's audio signal on 87.75 MHz FM. Are we justified in saying that \"Rainey Radio owns more radio stations in the area.\"? Yes, no, or maybe? Maybe\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart. Are we justified in saying that \"Early Mac's debut EP was released four years ago by Foolay Ent., LLC.\"? Yes, no, or maybe? No\n###\nFatsia japonica(syn. \"Aralia japonica\" Thunb., \"A. sieboldii\" Hort. ex K.Koch), also glossy-leaf paper plant, fatsi, paperplant or Japanese aralia, is a species of flowering plant in the family Araliaceae, native to southern Japan, southern Korea, and Taiwan. Are we justified in saying that \"Fatsia japonica is only native to Japan.\"? Yes, no, or maybe? No\n###\nThe 2007 Grand National (known as the John Smith's Grand National for sponsorship reasons) was the 160th official annual running of the world-famous Grand National steeplechase which took place at Aintree Racecourse near Liverpool, England, on 14 April 2007 and attracted the maximum permitted field of forty competitors for a total prize money of \u00a3700,000 including \u00a3399,140 to the winner. Are we justified in saying that \"There was one winner of the 160th official annual Grand National steeplechase at Aintree Racecourse who won \u00a3399,140 out of the total prize money of \u00a3700,000.\"? Yes, no, or maybe? Yes\n###\nThe National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy. Are we justified in saying that \"The Sisters of Mercy are a Christian organization.\"? 
Yes, no, or maybe?", "doc_id": 319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28636, 17519, 28076, 21825], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Laura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture. Are we justified in saying that \"Alexander Theroux loved to eat pizza.\"? Yes, no, or maybe? Maybe\n###\nSpittal is a hamlet or small village in East Lothian, Scotland, UK, on the B1377, east of Longniddry, south-south-west of Aberlady and to the west of Garleton and north of Gladsmuir. It is close to both Redhouse Castle, Gosford House and Spittal House. Are we justified in saying that \"Spittal is an urban city with millions of people.\"? Yes, no, or maybe? No\n###\nThe 2011 Sudirman Cup was the twelfth tournament of the Sudirman Cup. It was held from May 22\u201329, 2011 in Qingdao, China. According to the Badminton World Federation (BWF) 32 teams have confirmed their participation, for the first time twelve teams competed in the elite group to battle for the title. Are we justified in saying that \"May 23, 2011 was a sunny day in Qingdao.\"? Yes, no, or maybe? Maybe\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc. Are we justified in saying that \"ranila is a village in southern japan\"? Yes, no, or maybe? No\n###\nAbdessadeq Cheqara (1931 \u2013 October 31, 1998) (in Arabic: \u0639\u0628\u062f \u0627\u0644\u0635\u0627\u062f\u0642 \u0634\u0642\u0627\u0631\u0629) was a Moroccan singer of traditional Andalusian classical music and Moroccan folk music. Known as the \"grand master of al-Ala (Andalusian music)\", he was also a violin and oud virtuoso. Are we justified in saying that \"He was well-regarded in his field.\"? Yes, no, or maybe?", "doc_id": 850, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19635, 1688, 42593, 32602], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shitanshu Hargovindbhai Kotak (born 19 October 1972 in Rajkot) was an Indian first-class cricketer. 
A left-handed batsman, he has been a prolific run scorer for Saurashtra. Now he is the coach of Saurastra Cricket Team & soon will join Gujarat Lions IPL team as Assistant Coach. Are we justified in saying that \"Shitanshu Hargovindbhai Kotak is a thin man\"? Yes, no, or maybe? Maybe\n###\nDenis Villeneuve (] ; born October 3, 1967) is a French Canadian film director and writer. He is a four-time recipient of the Canadian Screen Award (formerly Genie Award) for Best Direction, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. The first three films also won the Academy of Canadian Cinema and Television Award for Best Motion Picture. Are we justified in saying that \"Villeneuve has never won an award.\"? Yes, no, or maybe? No\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars. Are we justified in saying that \"X X X X is served cold or warm\"? Yes, no, or maybe? Maybe\n###\nThe Combat Box was a tactical formation used by heavy (strategic) bombers of the U.S. Army Air Forces during World War II. The combat box was also referred to as a \"staggered formation\". Its defensive purpose was in massing the firepower of the bombers' guns, while offensively it concentrated the release of bombs on a target. Are we justified in saying that \"The Combat Box was an extremely successful tactical formation.\"? Yes, no, or maybe? Maybe\n###\nTobias Svantesson (born April 1, 1963, in Malmo, Sweden), is a former professional tennis player from Sweden. He enjoyed most of his tennis success while playing doubles. During his career he won 2 doubles titles. He achieved a career-high doubles ranking of World No. 65 in 1991. His career high world ranking in singles was no 89. Are we justified in saying that \"Tobias Svantesson has played other than doubles, despite them being where he had the most success.\"? Yes, no, or maybe?", "doc_id": 702, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1567, 33519, 6787, 23697], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "David Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe. Are we justified in saying that \"The show was not an animation\"? Yes, no, or maybe? No\n###\nPiazza Colonna is a piazza at the center of the Rione of Colonna in the historic heart of Rome, Italy. It is named for the marble Column of Marcus Aurelius, which has stood there since AD 193. The bronze statue of Saint Paul that crowns the column was placed in 1589, by order of Pope Sixtus V. 
The Roman Via Lata (now the Via del Corso) runs through the piazza's eastern end, from south to north. Are we justified in saying that \"The Column of Marcus Aurelius was in Rome, Italy in 1589.\"? Yes, no, or maybe? Yes\n###\nAn experience point (often abbreviated to exp or XP) is a unit of measurement used in tabletop role-playing games (RPGs) and role-playing video games to quantify a player character's progression through the game. Experience points are generally awarded for the completion of quests, overcoming obstacles and opponents, and for successful role-playing. Are we justified in saying that \"An experience point is gained through harassing other players\"? Yes, no, or maybe? No\n###\nThe final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"The final of the 1983 Prudential Cup was the most exciting game of the century.\"? Yes, no, or maybe? Maybe\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961. Are we justified in saying that \"The building was originally constructed over 1999 days ago.\"? Yes, no, or maybe?", "doc_id": 61, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28058, 20191, 827, 38693], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Joseph Eppele (born August 12, 1987) is a professional Canadian football offensive lineman for the Ottawa Redblacks of the Canadian Football League. He was drafted second overall by the Toronto Argonauts in the 2010 CFL Draft, being the first offensive lineman taken while being ranked fifth overall by the CFL's Amateur Scouting Bureau. He played college football for the Washington State Cougars. Are we justified in saying that \"Joseph was born in the summer of 1987. \"? Yes, no, or maybe? Yes\n###\nBel Ami (; also known as \"Pretty Boy\", and \"'Pretty Man\", is a South Korean romantic comedy television series starring Jang Keun-suk, IU, Lee Jang-woo and Han Chae-young. Based on the same-titled 17-volume manhwa by Chon Kye-young, it aired on KBS2 from November 20, 2013 to January 9, 2014 on Wednesdays and Thursdays at 21:55 for 16 episodes. Are we justified in saying that \"Bel Ami had a very short run. \"? Yes, no, or maybe? Yes\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. 
Guillermo del Toro, Don Murphy, and Joel Silver executive produced. Are we justified in saying that \"Vincenzo Natali Directed splice.\"? Yes, no, or maybe? Yes\n###\nThe November 2004 San Francisco general elections were held on November 2, 2004, in San Francisco, California. The elections included seven seats to the San Francisco Board of Supervisors, four seats to the San Francisco Community College Board, four seats to the San Francisco Board of Education, and fourteen San Francisco ballot measures. Are we justified in saying that \"There was a recount in the elections.\"? Yes, no, or maybe? Maybe\n###\nAnne Frank: The Diary of a Young Girl is an original radio play by author Meyer Levin (1905\u20131981). It was adapted from Levin\u2019s original stage dramatization of the same name, adapted from \"The Diary of a Young Girl\", Anne Frank's diary. It aired on CBS on September 18, 1952, the eve of Rosh Hashanah, to critical acclaim, and again in November 1952. Are we justified in saying that \"The even of Rash Hashanah was the first time it was aired. \"? Yes, no, or maybe?", "doc_id": 982, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40680, 10317, 37293, 27512], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Kindred is a 1987 American horror film directed by Jeffrey Obrow and Stephen Carpenter. Obrow also produced the film and co-wrote it along with Carpenter, Earl Ghaffari and John Penney. The film stars David Allen Brooks, Amanda Pays and Rod Steiger. It was released on January 9, 1987 and grossed just over $2 million. Are we justified in saying that \"Kindred is written by at least 4 people\"? Yes, no, or maybe? Yes\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"The Shokaku class aircraft carriers were part of the reason the United States was brought into the Pacific War.\"? Yes, no, or maybe? Yes\n###\nVampire Vs Vampire (\u4e00\u7709\u9053\u4eba) is a 1989 Hong Kong comedy horror film directed by and starring Lam Ching-ying. The title references the interaction in the film between a jiangshi child, a creature from Chinese \"hopping\" corpse fiction, and a British vampire based on Western vampire fiction. Are we justified in saying that \"Vampire Vs Vampire was released 11 years prior to the next century.\"? Yes, no, or maybe? Yes\n###\nMoody 4B is an instrumental album released by jazz musician James Moody. The album was released in 2010 on IPO Recordings, Moody's fifth release on the label, and was produced by Michael Patterson, Bill Sorin was executive producer. It won the 2011 Grammy Award for Best Jazz Instrumental Album, Individual or Group. 
Are we justified in saying that \"The 2011 Grammy Award for Best Jazz Instrumental Album goes to James Moody for his exceptional instrumental album Moody 4B, released a year earlier.\"? Yes, no, or maybe? Yes\n###\nDuncan Ley is an Australian playwright, actor, theatrical producer and director who has also written for Sydney's Motion Picture Company. His play \"In Cold Light\" is currently in the production phase as it is turned into a feature film by Peter Slee Productions. Are we justified in saying that \"Duncan Ley is from the Northern Hemisphere.\"? Yes, no, or maybe?", "doc_id": 105, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12182, 31556, 15861, 33904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\". Are we justified in saying that \"Aniket Vishwasrao got a role only because he knew the director\"? Yes, no, or maybe? Maybe\n###\nNewlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005. Are we justified in saying that \"Newlyweds was a show about a boyband member and his wife.\"? Yes, no, or maybe? Yes\n###\nJesco White, also known as the \"Dancing Outlaw\" (born July 30, 1956) is an American folk dancer and entertainer. He is best known as the subject of three American documentary films that detail his desire to follow in his famous father's footsteps while dealing with depression, drug addiction, alcoholism, and the poverty that permeates much of rural Appalachia. Are we justified in saying that \"poverty was the main reason of jesco white drug addiction and alcoholism\"? Yes, no, or maybe? Maybe\n###\nMax & Shred is an American sitcom created by Josh Greenbaum and Ben McMillan. The series stars Jonny Gray, Jake Goodman, Saara Chaudry, Emilia McCarthy, Jean-Michel Le Gal, and Siobhan Murphy. The series premiered on Nickelodeon in the United States on October 6, 2014, and on YTV in Canada on October 7, 2014. The series ended on March 31, 2016, with a total of 34 episodes. Are we justified in saying that \"Max & Shred appeared on Nickelodeon and YTV at the same time. \"? Yes, no, or maybe? Yes\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C. Are we justified in saying that \"The Luton Town Ladies Football Club partnered with the Luton Town F.C. the year after it was founded.\"? 
Yes, no, or maybe?", "doc_id": 254, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7022, 26970, 40726, 20848], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Drifters is a British sitcom that stars Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke as three female friends who live in Leeds following their graduation from university. All three actresses had previously appeared together in \"The Inbetweeners Movie\". Four series were broadcast, between 2013 and 2016. Are we justified in saying that \"Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke have all worked together before.\"? Yes, no, or maybe? Yes\n###\nBeastie Boys were an American hip hop group from New York City, formed in 1981. For the majority of their career, the group consisted of Michael \"Mike D\" Diamond (vocals, drums), Adam \"MCA\" Yauch (vocals, bass) and Adam \"Ad-Rock\" Horovitz (vocals, guitar). Are we justified in saying that \"Beastie Boys were a bad American hip hop group.\"? Yes, no, or maybe? Maybe\n###\nAnnabelle's Affairs is a 1931 American pre-Code romantic comedy film directed by Alfred L. Werker and starring Victor McLaglen, Jeanette MacDonald and Roland Young. The film is based on the play \"Good Gracious Annabelle\" by Clare Kummer. It is the only one of MacDonald's films to be considered lost. It was well received by critics, but did not perform well at the box office. Are we justified in saying that \"Jeannette MacDonald made a lot of bad movies.\"? Yes, no, or maybe? Maybe\n###\n\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel. Are we justified in saying that \"kiss and tell was the best song on the album\"? Yes, no, or maybe? Maybe\n###\nIn tabletop role-playing games, the character race represents the people to which a player character (PC) or a non-player character (NPC) belongs. \"People\" is to be taken in the broader sense, and may encompass ethnic groups, species, nationality or social groups. Are we justified in saying that \"\"people\" could mean elves in tabletop rpg games\"? Yes, no, or maybe?", "doc_id": 876, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14925, 38422, 5802, 45059], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wireshark is a free and open source packet analyzer. 
It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues. Are we justified in saying that \"Wireshark is used for communications protocol development.\"? Yes, no, or maybe? Yes\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain. Are we justified in saying that \"The Santa Cova Funicular is not in England.\"? Yes, no, or maybe? Yes\n###\nCherry, Harry & Raquel! is a 1970 softcore exploitation film produced and directed by American film director Russ Meyer. Following the success of \"Vixen!\" (1968), the film is notable for the first appearance of actor (and Meyer regular) Charles Napier playing Harry Thompson, a California border sheriff and marijuana smuggler who makes a reappearance in 1975's \"Supervixens\". Are we justified in saying that \"Charles Napier first appeared in the film \"Vixen!\" in the 60's.\"? Yes, no, or maybe? Yes\n###\nA madrigal is a secular vocal music composition of the Renaissance and early Baroque eras. Traditionally, polyphonic madrigals are unaccompanied; the number of voices varies from two to eight, and most frequently from three to six. It is quite distinct from the Italian Trecento madrigal of the late 13th and 14th centuries, with which it shares only the name. Are we justified in saying that \"A madrigal is a secular vocal music composition distinct from the Italian Trecento madrigal of the 15th century.\"? Yes, no, or maybe? No\n###\nCooper Manning (born March 6, 1974) is the host for the show \"The Manning Hour\" for Fox Sports. He is the oldest son of former professional football quarterback Archie Manning, and the older brother of former professional football quarterback Peyton Manning and current New York Giants quarterback Eli Manning. Are we justified in saying that \"Cooper Manning was born before his father retired from professional football. \"? Yes, no, or maybe?", "doc_id": 361, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7721, 18103, 6011, 7370], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Resorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming. Are we justified in saying that \"Resorts Casino Tunica has gone through a couple name changes over the years.\"? Yes, no, or maybe? 
Yes\n###\nThe Lord of the Rings: The Fellowship of the Ring is a 2001 New Zealand-American epic high fantasy adventure film directed by Peter Jackson based on the first volume of J. R. R. Tolkien's \"The Lord of the Rings\" (1954\u20131955). It is the first installment in \"The Lord of the Rings series\", and was followed by \"\" (2002) and \"\" (2003), based on the second and third volumes of \"The Lord of the Rings\". Are we justified in saying that \"J. R. R. Tolkien wrote the scripts for the The Lord of the Rings trilogy films as well as The Lord of the Rings books.\"? Yes, no, or maybe? Maybe\n###\nThe 1983 Summer Universiade, also known as the 1983 World University Games or XII Summer Universiade, took place in Edmonton, Alberta, Canada between July 1 and 12, 1983. Over 2400 athletes from 73 countries participated. It was the first time Canada hosted these Games. Edmonton also hosted the 1978 Commonwealth Games. Are we justified in saying that \"The 1983 World University Games took place in Edmonton, Alberta, Canada during winter\"? Yes, no, or maybe? No\n###\nB&Q plc is a British multinational DIY and home improvement retailing company, headquartered in Eastleigh, England, United Kingdom and is a wholly owned subsidiary of Kingfisher plc. Founded by Richard Block and David Quayle in 1969 originally as Block & Quayle, the retail chain offers over 40,000 products across 300 stores and online. Are we justified in saying that \"B&Q plc is founded by Richard Block and Donald Trump\"? Yes, no, or maybe? No\n###\nThe Amboy Dukes were an American rock band formed in 1964 in Detroit, Michigan, best known for their one hit single \"Journey to the Center of the Mind\". The band's name comes from the title of a novel by Irving Shulman. In the UK the group's records were released under the name of The American Amboy Dukes because of the existence of a British group with the same name. Are we justified in saying that \"Shulman died in 1964\"? Yes, no, or maybe?", "doc_id": 424, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4914, 25048, 32512, 34051], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kathryn Jane Calder (born June 17, 1982) is a Canadian indie rock musician, who performs as a solo artist, and is a member of the band The New Pornographers. She is a former member of Immaculate Machine. Calder started with The New Pornographers by filling in for Neko Case for live performances and was made a permanent member in 2006. Are we justified in saying that \"Neko's absence left a void in the band that fans feel Kathryn has not adequately filled.\"? Yes, no, or maybe? Maybe\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University. 
Are we justified in saying that \"The Old Time Gospel Hour Quartet performed twice a week.\"? Yes, no, or maybe? No\n###\nRa\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. Are we justified in saying that \"Osella was born in 1985.\"? Yes, no, or maybe? No\n###\nThe Last Boy Scout is a 1991 American action comedy film directed by Tony Scott, starring Bruce Willis, Damon Wayans, Chelsea Field, Noble Willingham, Taylor Negron and Danielle Harris. The film was released in the United States on December 13, 1991. Are we justified in saying that \"It was a canadian movie\"? Yes, no, or maybe? No\n###\nA semi-automatic pistol is a type of pistol that is semiautomatic, meaning it uses the energy of the fired cartridge to cycle the action of the firearm and advance the next available cartridge into position for firing. One cartridge is fired each time the trigger of a semi-automatic pistol is pulled; the pistol's \"disconnector\" ensures this behavior. Are we justified in saying that \"A semi-automatic pistol can fire cartridges in rapid succession by holding down the trigger.\"? Yes, no, or maybe?", "doc_id": 682, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25861, 16292, 23838, 38132], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The Most Valuable Player was given to less than three players.\"? Yes, no, or maybe? Yes\n###\nCarol Ann Crawford (February 22, 1934 \u2013 August 10, 1982), also known as Carol Stolkin and Carol Ross, was an American backgammon and bridge player from Buffalo, New York who spent many years in Detroit, Michigan.. In 1973, she became the second woman to win the world backgammon championships. Are we justified in saying that \"Carol Ann Crawford never won.\"? Yes, no, or maybe? No\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups. Are we justified in saying that \"Mohamed Izzadeen Mohamed Naufer has won an AFC challenge cup.\"? Yes, no, or maybe? 
Maybe\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996. Are we justified in saying that \"Trainspotting shows explicit drug use.\"? Yes, no, or maybe? Maybe\n###\nCorn crab soup is a dish found in Chinese cuisine, American Chinese cuisine, and Canadian Chinese cuisine. The soup is actually cream of corn soup with egg white and crab meat or imitation crab meat added. It is most likely of southern Chinese origin. Are we justified in saying that \"corn crab is a goumet dish \"? Yes, no, or maybe?", "doc_id": 656, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28775, 5818, 25614, 18959], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode. Are we justified in saying that \"Kristen Bell was in the episode \"Fight or Flight\" on the show \"Heroes\".\"? Yes, no, or maybe? Yes\n###\nJusticia umbrosa (Brazilian plume, yellow jacobinia; syn. \"Adhatoda umbrosa\" Ness, and \"Justicia aurea\" Schltdl.) is an ornamental shrub native of Cerrado vegetation of Brazil. This plant may be propagated by herbaceous stem cutting, and it can usually get to 1,50 - 2,50 m tall. They flourish in the shade, and will not do well if overwatered. Are we justified in saying that \"Justicia umbrosa will wilt in the shade\"? Yes, no, or maybe? No\n###\nSusan Lynch (born 5 June 1971) is a Northern Irish actress. A three-time IFTA Award winner, she also won the British Independent Film Award for Best Supporting Actress for the 2003 film, \"16 Years of Alcohol\". Her other film appearances include \"Waking Ned\" (1998), \"Nora\" (2000), \"Beautiful Creatures\" (2000), and \"From Hell\" (2001). Are we justified in saying that \"Susan Lynch filmed multiple films in 2000\"? Yes, no, or maybe? Maybe\n###\nRonald Francis Arias (born November 30, 1941) is a former senior writer and correspondent for \"People magazine\" and \"People en Espa\u00f1ol\". He is also a highly regarded author whose novel \"The Road to Tamazunchale\" has been recognized as a milestone in Chicano literature. Are we justified in saying that \"He has only written non-fiction.\"? Yes, no, or maybe? No\n###\nEMP Merchandising also known as EMP Merchandising Handelsgesellschaft mbH, Large Popmerchandising, and Sweden Rock Shop is a German-based music mail order and merchandising store. The company distributes a quarterly catalog to customers. 
In a 2003 report the Osnabr\u00fcck Chamber of Commerce considered the company to be the largest mail order business for Heavy Metal and Hard Rock music in Germany. Are we justified in saying that \"EMP Merchandising was founded in 2003.\"? Yes, no, or maybe?", "doc_id": 192, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20268, 34067, 10923, 36809], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christmas Eve is the day before Christmas Day, the festival commemorating the birth of Jesus of Nazareth. Christmas Day is observed around the world, and Christmas Eve is widely observed as a full or partial holiday in anticipation of Christmas Day. Together, both days are considered one of the most culturally significant celebrations in Christendom and Western society. Are we justified in saying that \"Christmas Eve and day are the most important holidays in Western Society.\"? Yes, no, or maybe? Maybe\n###\nBoonie Bears III is a 2016 Chinese animated adventure comedy film directed by Ding Liang and Lin Yongchang. The film is the third installment in the \"Boonie Bears\" film series based on the animated series of the same name, following the 2015 film \"\". It was released in China on January 16, 2016. It will be followed by \"\", scheduled for release in 2017. Are we justified in saying that \"Boonie Bears III was dubbed for an english version\"? Yes, no, or maybe? Maybe\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium. Are we justified in saying that \"moulton grew up in a four story mansion\"? Yes, no, or maybe? Maybe\n###\nUniversity Church of England Academy is a secondary school located in Ellesmere Port, Cheshire. It was formed in 2009 by the merger of Ellesmere Port Specialist School of Performing Arts (located at Woodchurch Lane) and Cheshire Oaks High School (located at Stanney Lane). Are we justified in saying that \"University Church of England Academy is a very clean school\"? Yes, no, or maybe? Maybe\n###\nUdinese Channel is a subscription-based channel, entirely dedicated to the Italian football team Udinese Calcio. The channel offers Udinese Calcio fans exclusive interviews with players and staff, full matches, including replays of all Serie A, Coppa Italia, and UEFA Cup games, in addition to vintage matches, footballing news, and other themed programming. Are we justified in saying that \"The channel features baseball once a month.\"? Yes, no, or maybe?", "doc_id": 790, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14301, 32350, 10527, 36827], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Globacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide. Are we justified in saying that \"GLO operates in less than 20 countries.\"? Yes, no, or maybe? Yes\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"Slender mint is native to Tasmania, Queensland, New South Wales, Victoria, and South Australia.\"? Yes, no, or maybe? Yes\n###\n\"Emigrante del Mundo\" is the debut single of Lucenzo. It was released in France initially in 2007 and a second time in 2010 after the success of the kuduro music promoted by Lucenzo's new hits. It also appears in the debut album of Lucenzo of the same title \"Emigrante del Mundo\". Are we justified in saying that \"Lucenzo wrote more than one song.\"? Yes, no, or maybe? Yes\n###\n\"Inbetweener\" is a song by English Britpop band Sleeper, written by the band's vocalist and guitarist Louise Wener. It was the third single to be released from their debut album \"Smart\" in 1995 (see 1995 in British music). It was their breakthrough single, Are we justified in saying that \"The song Inbetweener by English Britpop band Sleeper is over 3 years old\"? Yes, no, or maybe? Yes\n###\nMinoo Mumtaz (born Malikunnisa Ali on 26 April 1942) is a former Indian film actress. She is the sister of India's ace comedian Mehmood Ali and part of the Mehmood Ali film family. Minoo Mumtaz appeared in many Hindi films of the 1950s and 1960s, mostly as a dancer and character actress. Are we justified in saying that \"Malikunnisa Ali was born more than 1942 years ago.\"? Yes, no, or maybe?", "doc_id": 222, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3185, 22207, 39962, 40773], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points. Are we justified in saying that \"At least four games have been played in Memorial Stadium.\"? Yes, no, or maybe? 
Yes\n###\nStand-In is a 1937 American comedy film directed by Tay Garnett and starring Leslie Howard, Joan Blondell, and Humphrey Bogart. The picture was produced by the independent Walter Wanger, and released by United Artists. It is set in Hollywood and parodies many aspects of the film industry during the Classical Era. Are we justified in saying that \"The film was very popular\"? Yes, no, or maybe? Maybe\n###\nThe Key is a 1958 British war film set in 1941 during the Battle of the Atlantic. It was based on the 1951 novel \"Stella\" by Jan de Hartog (later republished as \"The Distant Shore\" and \"The Key\") and was directed by Sir Carol Reed. William Holden, Sophia Loren and Trevor Howard starred in the production. Are we justified in saying that \"The Key was Williams Holden first appearance in a film.\"? Yes, no, or maybe? Maybe\n###\nLois Cleveland Chiles (born April 15, 1947) is an American actress and former fashion model known for her roles as Dr. Holly Goodhead in the 1979 James Bond film \"Moonraker\", and as a hit and run driver in 1987's \"Creepshow 2\", as well as such films as \"The Great Gatsby\", \"The Way We Were\", \"Death on the Nile\" and \"Broadcast News\". Are we justified in saying that \"Lois was a fashion model in America.\"? Yes, no, or maybe? Yes\n###\nMichael Tunn (born 18 January 1974) is an Australian radio announcer and television presenter. He was hired by Australia's national youth station Triple J in 1990 at the age of 17, making him Australia's youngest professional radio presenter at the time. Are we justified in saying that \"Michael Tunn was hired by Australia's national youth station Triple J 13 years after he was born.\"? Yes, no, or maybe?", "doc_id": 785, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17869, 27679, 6160, 14783], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents. Are we justified in saying that \"Kapp Heights had a population of less than 1000 in 2010.\"? Yes, no, or maybe? Yes\n###\nThe 2011 Atlantic Sun Conference Baseball Tournament was held at Ken Dugan Field at Stephen Lee Marsh Stadium on the campus of Lipscomb University in Nashville, TN from May 25 through 28. Belmont won its first tournament championship to earn the Atlantic Sun Conference's automatic bid to the 2011 NCAA Division I Baseball Tournament. Are we justified in saying that \"Belmont plays in the NCAA Division II. \"? Yes, no, or maybe? No\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. 
Berg's documentary film \"Deliver Us from Evil\" in 2006. Are we justified in saying that \"Oliver was a very good priest.\"? Yes, no, or maybe? No\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards. Are we justified in saying that \"Jake Deckard has won more than two awards.\"? Yes, no, or maybe? Maybe\n###\nSilver Bow County is a county in the State of Montana. As of the 2010 census, the population was 34,200. Its county seat is Butte. In 1977, the city and county governments consolidated to form the single entity of Butte-Silver Bow. Additionally, the town of Walkerville is a separate municipality from Butte and is within the county. Are we justified in saying that \"Silver Bow County borders South Dakota.\"? Yes, no, or maybe?", "doc_id": 92, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4657, 39455, 9619, 19280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sophie Charlene Akland Monk (born 14 December 1979) is an English-born Australian singer, songwriter, actress, model and radio personality. Monk was a member of the girl group Bardot and released a solo album called \"Calendar Girl\" (2003). She has appeared in films such as \"Date Movie\" (2006), \"Click\" (2006), and \"Spring Breakdown\" (2009). Are we justified in saying that \"Sophie never appeared in movies.\"? Yes, no, or maybe? No\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport. Are we justified in saying that \"Gulf Air has been used by dan.\"? Yes, no, or maybe? Maybe\n###\nEMP Merchandising also known as EMP Merchandising Handelsgesellschaft mbH, Large Popmerchandising, and Sweden Rock Shop is a German-based music mail order and merchandising store. The company distributes a quarterly catalog to customers. In a 2003 report the Osnabr\u00fcck Chamber of Commerce considered the company to be the largest mail order business for Heavy Metal and Hard Rock music in Germany. Are we justified in saying that \"There are no larger heavy metal mail order businesses in Germany.\"? Yes, no, or maybe? Yes\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"Weiner Still works at stanford\"? Yes, no, or maybe? 
Yes\n###\nHakea microcarpa , commonly known as small-fruit hakea is a flowering plant in the family Proteaceae and is endemic to eastern Australia. It is a spreading shrub, often growing in woodlands, heathlands and near swamps in montane areas of eastern Australia. Are we justified in saying that \"the plant can grow in mountains\"? Yes, no, or maybe?", "doc_id": 484, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44865, 14245, 3362, 11591], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Airline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya. Are we justified in saying that \"No person on the continent of Africa has become a certified Captain of the Boeing 787.\"? Yes, no, or maybe? No\n###\nJulia Goldani Telles (born March 18, 1995) is an American actress and ballet dancer. She is best known for her supporting role as Whitney Solloway on the Showtime original series \"The Affair\" and as Sasha Torres on the short-lived ABC Family series \"Bunheads\". Are we justified in saying that \"Julia Goldani Telles have been in an ABC Family Seris.\"? Yes, no, or maybe? Yes\n###\nThe Col de la Croix Fry (1467 m ) is a mountain pass located in the Cha\u00eene des Aravis, between Manigod and La Clusaz in the Haute-Savoie department of France. The road over the col is used occasionally by the Tour de France cycle race with the tour crossing the pass on Stage 19 of the 2013 Tour. At the summit is the village of La Croix Fry. Are we justified in saying that \"The Tour de France is a cycle race.\"? Yes, no, or maybe? Yes\n###\nDickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority. Are we justified in saying that \"Dickinson owns an airport in North Dakota\"? Yes, no, or maybe? Yes\n###\n\"Look at Me (When I Rock Wichoo)\" is a song by American indie rock band Black Kids, taken from their debut album \"Partie Traumatic\". It was released in the UK by Almost Gold Recordings on September 8, 2008 and debuted on the Top 200 UK Singles Chart at number 175. Are we justified in saying that \"Look at Me (When I Rock Wichoo) ends with an O.\"? Yes, no, or maybe?", "doc_id": 948, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6177, 16902, 8805, 906], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The European Democrat Union (EDU) is one of the three European wings of the International Democrat Union, along with the European People's Party (EPP) and the Alliance of European Conservatives and Reformists (AECR). Its members include Christian democratic, liberal conservative, and conservative political parties. It is only a nominal sub-entity of the IDU, since it ceased its activities in 2002. Are we justified in saying that \"The conservative Political Party is a member of the EDU\"? Yes, no, or maybe? Yes\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. The animation is based on the award-winning children's picture book by Peter Collington. Are we justified in saying that \"The Angel and the Soldier Boy was the 13th song by Clannad.\"? Yes, no, or maybe? No\n###\nThe Canyons is a 2013 American erotic thriller-drama film directed by Paul Schrader and written by Bret Easton Ellis. The film is set in Los Angeles and stars Lindsay Lohan, James Deen, Nolan Funk, Amanda Brooks, and Gus Van Sant. It received a limited release on August 2, 2013 at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms. Are we justified in saying that \"Nolan funk stared in a movie that had a limited release on the month of august 2013.\"? Yes, no, or maybe? Yes\n###\nHenry Gabriel Murphy (1903\u20132001) was an American businessman, sportsman and Major League Baseball club owner. From June 1950 through April 1984, he was a minority stockholder in the Washington Senators/Minnesota Twins franchise of the American League. Are we justified in saying that \"Murphy was a Major League Baseball club player from 1950-1984.\"? Yes, no, or maybe? No\n###\nGeorge White's Scandals is a 1934 American musical film directed by George White and written by Jack Yellen. The film stars Rudy Vall\u00e9e, Jimmy Durante, Alice Faye, Adrienne Ames, Gregory Ratoff, Cliff Edwards and Dixie Dunbar. The film was released on March 16, 1934, by Fox Film Corporation. Are we justified in saying that \"George White's Scandals was released more than 1934 seconds ago.\"? Yes, no, or maybe?", "doc_id": 986, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14283, 7248, 31053, 36117], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Flatline\" is the ninth episode of the eighth series of the British science fiction television programme \"Doctor Who\", written by Jamie Mathieson, and directed by Douglas Mackinnon. The episode stars Peter Capaldi and Jenna Coleman, with Joivan Wade and Christopher Fairbank guest starring. 
The episode received critical acclaim, with particular praise directed at Coleman's performance. Are we justified in saying that \"Flatline was loved by many people. \"? Yes, no, or maybe? Yes\n###\nMartin John Christopher Freeman (born 8 September 1971) is an English actor, who became known for portraying Tim Canterbury in the original UK version of sitcom mockumentary \"The Office\", Dr. John Watson in the British crime drama \"Sherlock\", Bilbo Baggins in Peter Jackson's \"The Hobbit\" film trilogy, and Lester Nygaard in the dark comedy-crime drama TV series \"Fargo\". Are we justified in saying that \"Martin John Christopher Freeman did not play Bilbo Baggins in \"The Hobbit\".\"? Yes, no, or maybe? No\n###\nAmelio Robles Avila was a colonel during the Mexican Revolution. He was born a woman with the name of Amelia Robles \u00c1vila on November 3, 1889 in Xochipala, Guerrero. His father was named Casimiro Robles and his mother Josefa \u00c1vila. His father was a wealthy farmer who owned 42 acres of land and owned a small Mezcal factory. Are we justified in saying that \"The colonel was born Amelio and changed his name to Amelia.\"? Yes, no, or maybe? No\n###\nMentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"Mentha diemenica is a popular species of mint from Tasmania and very good in cocktails.\"? Yes, no, or maybe? Maybe\n###\nEngine is the second album by American Music Club. It was jointly released by Frontier and Grifter in the US and by Zippo in the UK and Europe in 1987. The 1998 Warner Bros. Records reissue added three additional tracks from the same period. The artwork for the Zippo UK release features an incorrect track listing, putting the songs in the wrong order. Are we justified in saying that \"American Music Club released an album in 1986.\"? Yes, no, or maybe?", "doc_id": 970, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33563, 14965, 2731, 20138], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Nutty Professor is a 1963 American science fiction-romantic comedy film produced, directed, co-written (with Bill Richmond) and starring Jerry Lewis. The score was composed by Walter Scharf. The film is a parody of Robert Louis Stevenson's \"Dr. Jekyll and Mr. Hyde\". Are we justified in saying that \"It was a made up story\"? Yes, no, or maybe? Maybe\n###\nCamping 3 is a 2016 French comedy film directed by Fabien Onteniente. It is a sequel to the 2010 film \"Camping 2\". The film was a box office success, having grossed over US$24.2 million in France, becoming the second highest-grossing domestic film in 2016, with 3,228,313 tickets sold. Are we justified in saying that \"One of the highest earning films and with a high number of tickets sold Camping 3 made lots of money.\"? Yes, no, or maybe? 
Yes\n###\nDenis Villeneuve (] ; born October 3, 1967) is a French Canadian film director and writer. He is a four-time recipient of the Canadian Screen Award (formerly Genie Award) for Best Direction, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. The first three films also won the Academy of Canadian Cinema and Television Award for Best Motion Picture. Are we justified in saying that \"Denis Villeneuve is a French Canadian film director and writer who won the Canadian Screen Award for writing four times, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013.\"? Yes, no, or maybe? No\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808 Are we justified in saying that \"The house is standing on a place that is now called Warren Hall Farm.\"? Yes, no, or maybe? Yes\n###\nNBA 2K9 is a basketball simulation video game developed by Visual Concepts and published by 2K Sports. It is the tenth installment in the \"NBA 2K\" franchise and the successor to \"NBA 2K8\". It was released in 2008 for PlayStation 2, PlayStation 3, Xbox 360, and Microsoft Windows. Kevin Garnett is the cover athlete of the game. \"NBA 2K9\" is the predecessor to \"NBA 2K10\" in the \"NBA 2K\" series. Are we justified in saying that \"NBA 2K10 is the last in the \"NBA 2K\" series.\"? Yes, no, or maybe?", "doc_id": 492, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8660, 18999, 21025, 19250], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart. Are we justified in saying that \"The band Cross Gene does not contain any female band members.\"? Yes, no, or maybe? Yes\n###\nQuetta: A City of Forgotten Dreams is an upcoming Pakistani drama film directed by Murtaza Chaudary, written by Faysal Chaudary and co-produced by Faysal Chaudary, Sana Bucha under the Production banner \"Filmsaaz\", \"Sana Bucha Productions\". The film star Asal Din Khan, Abdullah Ghaznavi, Ali Karimi, Fayaz Hussain and Danyal Ali in lead roles. Are we justified in saying that \"The film star Asal Din Khan had roles in over 20 films.\"? Yes, no, or maybe? Maybe\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. 
The animation is based on the award-winning children's picture book by Peter Collington. Are we justified in saying that \"The Angel and the Soldier Boy is not an animated movie.\"? Yes, no, or maybe? No\n###\nThe Albany Great Danes men's lacrosse team represents the University at Albany in NCAA Division I men's college lacrosse. Albany currently competes in the America East Conference and plays its home games on John Fallon Field. The team has reached the NCAA Men's Lacrosse Championship tournament nine times. The Great Danes are currently coached by Scott Marr. Are we justified in saying that \"The lacrosse team represents the University at Albany in NCAA Division 2\"? Yes, no, or maybe? No\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty. Are we justified in saying that \"Kilpatrick was a police officer\"? Yes, no, or maybe?", "doc_id": 11, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37792, 36852, 1268, 27036], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Humans Need Not Apply is a 2014 short Internet documentary film, directed, produced, written, and edited by CGP Grey. The film focuses on the future of the integration of automation into economics, as well as the impact of this integration to the worldwide workforce. It was released online as a YouTube video. Are we justified in saying that \"Humans Need Not Apply is a 2014 short Internet documentary film about economics in the last century.\"? Yes, no, or maybe? No\n###\nShannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University. Are we justified in saying that \"Shannon Kelley wants to coach SMU.\"? Yes, no, or maybe? Maybe\n###\nGrimsby Town Football Club is a professional football club based in the town of Cleethorpes, Lincolnshire, England, that competes in League Two, the fourth-highest division overall in the English football league system. Nicknamed \"the Mariners\", the club was founded as Grimsby Pelham in 1878, changed its name to Grimsby Town a year later and moved to its current stadium, Blundell Park, in 1898. Are we justified in saying that \"the mariner will rise up\"? Yes, no, or maybe? Maybe\n###\nBarry Redden (born July 21, 1960) is a former American football running back who played for the Los Angeles Rams, the San Diego Chargers, and the Cleveland Browns of the National Football League (NFL). He spent much of his career playing in the shadow of Pro Football Hall of Fame running back Eric Dickerson. Are we justified in saying that \"Barry Redden is a very funny man\"? Yes, no, or maybe? 
Maybe\n###\nThe Icelandic national under-18 basketball team is the representative for Iceland in international Under-18 age basketball competitions, and it is organized and run by the Icelandic Basketball Federation. The team represents Iceland at the FIBA Europe Under-18 Championship. It is coached by Fri\u00f0rik Ingi R\u00fanarsson. Are we justified in saying that \"Fri\u00f0rik Ingi R\u00fanarsson is a player on the basketball team.\"? Yes, no, or maybe?", "doc_id": 521, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16622, 40520, 13273, 20091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mary Eliza Mahoney (May 7, 1845 \u2013 January 4, 1926) was the first African American to study and work as a professionally trained nurse in the United States, graduating in 1879. Mahoney was one of the first African Americans to graduate from a nursing school, and she prospered in a predominantly white society. She also challenged discrimination against African Americans in nursing. Are we justified in saying that \"Mary Eliza Mahoney passed her classes.\"? Yes, no, or maybe? Yes\n###\nEugene Gearty is an American sound engineer. He was nominated for an Academy Award in the category Best Sound for the film \"Gangs of New York\". He has worked on over 80 films since 1983. At the 84th Academy Awards, Gearty won an Oscar for Best Sound Editing for his work on Martin Scorsese's \"Hugo\". He also won Emmy Award for Boardwalk Empire. Are we justified in saying that \"Eugene Gearty has worked on over 100 films since 1983\"? Yes, no, or maybe? No\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region. Are we justified in saying that \"ABC Western Victoria also owned a television studio.\"? Yes, no, or maybe? Maybe\n###\nErnest Asi Afiesimama is a Nigerian environmental and climate scientist who has worked for the Nigerian Meteorological Agency and was a consultant in environmental and climate affairs at Stern Integrated Projects. He currently works with the World Meteorological Organisation. Are we justified in saying that \"Hes a consultant in environmental and climate affairs \"? Yes, no, or maybe? Yes\n###\nHumphrey Mieno Ochieng (born 25 December 1989 in Nairobi) is a Kenyan footballer who currently plays for Kenyan Premier League side Tusker and the Kenya national team as a midfielder. He previously played for A.F.C. Leopards Sofapaka and Kenya Commercial Bank in the Kenyan Premier League, as well as Tunisian side Club Africain and Tanzanian club Azam. Are we justified in saying that \"Humphrey Mieno Ochieng was born on Christmas Day\"? 
Yes, no, or maybe?", "doc_id": 758, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5874, 33853, 1395, 20924], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise). Are we justified in saying that \"Josiane starred in as well as created The Magic Roundabout.\"? Yes, no, or maybe? Maybe\n###\nALGOL 68 (short for ALGOrithmic Language 1968) is an imperative computer programming language that was conceived as a successor to the ALGOL 60 programming language, designed with the goal of a much wider scope of application and more rigorously defined syntax and semantics. Are we justified in saying that \"ALGOL 68 is a language for computer programming.\"? Yes, no, or maybe? Yes\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\". Are we justified in saying that \"b.j. thomas song hooked on a feeling was a success in 1968\"? Yes, no, or maybe? Maybe\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy. Are we justified in saying that \"toronto fc is predicted to beat the LA galaxy\"? Yes, no, or maybe? Maybe\n###\nMarvin Karlton Rainwater (July 2, 1925 \u2013 September 17, 2013) was an American country and rockabilly singer and songwriter who had several hits during the late 1950s, including \"Gonna Find Me a Bluebird\" and \"Whole Lotta Woman\", a UK no.1 record. He was known for wearing Native American-themed outfits on stage and was 25 percent Cherokee. Are we justified in saying that \"Marvin Karlton Rainwater was a very badsinger\"? Yes, no, or maybe?", "doc_id": 728, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11826, 44887, 3326, 32547], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Speedway Field was the original name for the airfield that was to evolve into Minneapolis-St. 
Paul International Airport, the twelfth busiest airport in the United States; it was also the largest hub for Northwest Airlines and the third largest hub for Delta Air Lines, Northwest's successor. Are we justified in saying that \"The former Speedway Field is the twelfth busiest airport in the U.S.\"? Yes, no, or maybe? Yes\n###\nAnti-D\u00fchring (German: \"Herrn Eugen D\u00fchrings Umw\u00e4lzung der Wissenschaft\" , \"Herr Eugen D\u00fchring's Revolution in Science\") is a book by Friedrich Engels, first published in German in 1878. It had previously been serialised in a periodical. There were two further German editions in Engels' lifetime. \"Anti-D\u00fchring\" was first published in English translation in 1907. Are we justified in saying that \"Anti-D\u00fchring starts with C.\"? Yes, no, or maybe? No\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song. Are we justified in saying that \"\"I'm Not the One's\" lead vocalist has a first name that starts with the letter R.\"? Yes, no, or maybe? Yes\n###\nBouck's Island is a farm near Breakabeen, New York within the town of Fulton, Schoharie County, New York near Fultonham, New York. Bouck's Island was the home of former New York governor William C. Bouck. Congressman Joseph Bouck was born on Bouck's Island and Wisconsin Congressman Gabriel Bouck once lived there. Are we justified in saying that \" Joseph Bouck was born in Schoharie County\"? Yes, no, or maybe? Yes\n###\n\"Chasing Colors\" is a song recorded by electronic DJs Marshmello and Ookay featuring the vocals of American singer Noah Cyrus. It was written by Marshmello, Ookay, Skyler Stonestreet and Chase Duddy and released on 24 February 2017 via Marshmello's label Joytime Collective. Are we justified in saying that \"DJ Ookay released a track on a Friday in February 2017\"? Yes, no, or maybe?", "doc_id": 327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23868, 34861, 3431, 7530], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Grindhouse Releasing is a Hollywood-based independent cult film distribution company led by film editor Bob Murawski and co-founded by Sage Stallone. Grindhouse digitally remasters, restores, and produces bonus materials and video documentaries for cult film DVDs and Blu-rays which it distributes on the CAV label. Are we justified in saying that \"Grindhouse Releasing plans to release bonus scenes to star wars\"? Yes, no, or maybe? Maybe\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport. 
Are we justified in saying that \"the airline operates scheduled services to 41 destinations in 23 countries across all of Africa, Asia and Europe\"? Yes, no, or maybe? Maybe\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird. Are we justified in saying that \"The North African ostrich or red-necked ostrich can also be seen in European countries.\"? Yes, no, or maybe? Maybe\n###\nRobots is a 2005 American computer-animated adventure comedy film produced by Blue Sky Studios for 20th Century Fox. It was directed by Chris Wedge and produced by Jerry Davis, William Joyce, and John C. Donkin. It features the voices of Ewan McGregor, Halle Berry, Greg Kinnear, Mel Brooks, Amanda Bynes, Drew Carey, and Robin Williams. Are we justified in saying that \"Robots was the last film produced by Blue Sky Studios.\"? Yes, no, or maybe? Maybe\n###\nCarl Frederik Tietgen (19 March 1829 \u2013 19 October 1901) was a Danish financier and industrialist. The founder of numerous prominent Danish companies, many of which are still in operation today, he played an important role in the industrialisation of Denmark. Notably also forming conglomerates, several of Tietgen's companies attained a monopoly-like status, which cemented their durability. Are we justified in saying that \"Carl Frederik Tietgen was a popular figure in the industrialization of denmark \"? Yes, no, or maybe?", "doc_id": 783, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30771, 20839, 17612, 24464], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Selma Diamond (August 6, 1920 \u2013 May 13, 1985) was an American comedic actress and radio and television writer, known for her high-range, raspy voice, and her portrayal of Selma Hacker on the first two seasons of the NBC television comedy series \"Night Court\". Are we justified in saying that \"Selma Diamond was married several times\"? Yes, no, or maybe? Maybe\n###\nSpring Breakdown is a comedy film starring Amy Poehler, Parker Posey, and Rachel Dratch. Three years after principal photography, and after the film's owner, Warner Independent Pictures, was shut down by its parent company, it was released direct-to-video in 2009. Are we justified in saying that \"Spring Breakdown made a lot of money\"? Yes, no, or maybe? Maybe\n###\nThe Ravenswood City School District is a public school district headquartered in East Palo Alto, California, US. The district, in the San Francisco Bay Area, serves the communities of East Palo Alto and eastern Menlo Park. Students from this school district who continue on with public schooling matriculate to the Sequoia Union High School District. In 2008-09 it served over 4,500 students. Are we justified in saying that \"The Ravenswood City School District served less than 4500 students in the 2008 school year\"? Yes, no, or maybe? 
No\n###\nNew American Writing is a once-a-year American literary magazine emphasizing contemporary American poetry, including a range of innovative contemporary writing. The magazine is published in association with San Francisco State University. \"New American Writing\" is published by OINK! Press, a nonprofit organization. The magazine appears in early June each year. First published in 1986. Are we justified in saying that \"Students of San Francisco State University have contemporary poetry published in New American Writing.\"? Yes, no, or maybe? Maybe\n###\nTasmanian Devils is a 2013 television film directed by Zach Lipovsky and starring Danica McKellar and Apolo Ohno. The movie was first released onto the Syfy channel on January 19, 2013 and centers around a group of friends that get attacked by extremely large tasmanian devils. \"Radio Times\" rated the film poorly, giving it two out of 5 stars. Are we justified in saying that \"The Tasmanian devils is a film about a group of Tasmanian devils that get attacked by a large group of friends.\"? Yes, no, or maybe?", "doc_id": 918, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34289, 20757, 2527, 30842], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"A lot of people liked her\"? Yes, no, or maybe? Maybe\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old. Are we justified in saying that \"Urschel played football professionally into his second decade.\"? Yes, no, or maybe? No\n###\nKim Hyang-gi (born August 9, 2000) is a South Korean actress. Kim began her career as a child actress, and has starred in films and television series such as \"Wedding Dress\" (2010), \"The Queen's Classroom\" (2013), \"Thread of Lies\" (2014) and \"Snowy Road\" (2017). Are we justified in saying that \"The film Snowy Road was well received\"? Yes, no, or maybe? Maybe\n###\nDjibouti, officially the Republic of Djibouti, is a country located in the Horn of Africa. It is bordered by Eritrea in the north, Ethiopia in the west and south, and Somalia in the southeast. The remainder of the border is formed by the Red Sea and the Gulf of Aden at the east. Are we justified in saying that \"It is bordered by a sea named after a color\"? Yes, no, or maybe? 
Yes\n###\nThe 1976 European Cup Winners' Cup Final was a football match between West Ham United of England and Anderlecht of Belgium. The final was held at Heysel Stadium in Brussels on 5 May 1976. It was the final match of the 1975\u201376 European Cup Winners' Cup tournament and the 16th European Cup Winners' Cup Final. Are we justified in saying that \"West Ham United played against another British team in the 1976 European Cup Winners' Cup Final in Brussels.\"? Yes, no, or maybe?", "doc_id": 559, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9344, 16003, 16571, 12178], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A Day with Wilbur Robinson is a 1990 children's picture book (slightly expanded for a 2006 reissue) written and illustrated by William Joyce. A film adaptation called \"Meet the Robinsons\" was released by Walt Disney Pictures in 2007 in the United States. Are we justified in saying that \"Meet the Robinsons was 200 minutes long.\"? Yes, no, or maybe? Maybe\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey. Are we justified in saying that \"Ballymena United Football Club will compete in the next World Cup \"? Yes, no, or maybe? Maybe\n###\nLeft Hand Spring was a well-known watering stop on the old Chisholm Trail in present-day Blaine County, Oklahoma. The spring was named for \"Left Hand\", an Arapaho chief. Jesse Chisholm died there in 1868 and is buried nearby. His grave is marked with a granite historical marker. Are we justified in saying that \"Left Hand Spring isn't located in Canada.\"? Yes, no, or maybe? Yes\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\". Are we justified in saying that \"Aniket Vishwasrao was paid more than his female counterparts\"? Yes, no, or maybe? Maybe\n###\nCatherine Breillat (] ; born 13 July 1948) is a French filmmaker, novelist and Professor of Auteur Cinema at the European Graduate School. She has often courted controversy with her films' frank treatment of sexual themes. For example, her 1976 debut film, \"A Real Young Girl\", was not released in theaters until 2000. Are we justified in saying that \"Catherine Breillat was born in 19488\"? Yes, no, or maybe?", "doc_id": 621, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35857, 9031, 23218, 3633], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Nathan MacKinnon plays in the position of forward.\"? Yes, no, or maybe? Yes\n###\nResorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming. Are we justified in saying that \"The hotel will expand to a 250-room hotel.\"? Yes, no, or maybe? Maybe\n###\nJuan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974. Are we justified in saying that \"Juan Domingo Per\u00f3n met Clinton.\"? Yes, no, or maybe? Maybe\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire. Are we justified in saying that \"Peter John Reynolds worked for free at Butser Ancient Farm\"? Yes, no, or maybe? Maybe\n###\nState Highway\u00a0128 (SH-128) is a 2.198 mi state highway in the U.S. state of Idaho, serving the city of Lewiston in Nez Perce County. The highway travels east along the Clearwater River within Lewiston from Washington State Route\u00a0128 (SR\u00a0128) to U.S. Route\u00a012 (US-12). Are we justified in saying that \"Highway 128 traverses several states\"? Yes, no, or maybe?", "doc_id": 420, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1962, 17697, 18722, 32281], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "SuperpinkyMandy is the debut studio album of British singer Beth Orton. Largely in the style of electronica, and produced closely with then boyfriend William Orbit, it was a limited Japan-only release, with about 5000 copies pressed. As such, it is very much sought after. Orton largely passes over the release when interviewed, citing 1996's \"Trailer Park\" as her first release. Are we justified in saying that \"SuperpinkyMandy is a rare album\"? 
Yes, no, or maybe? Yes\n###\nCavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas. Are we justified in saying that \"The parade involves a commemoration.\"? Yes, no, or maybe? Yes\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Ritchie Wenaweser and Simone Steiner do not love working together\"? Yes, no, or maybe? Maybe\n###\nReal Madrid Club de F\u00fatbol C, commonly known as Real Madrid C, was a Spanish association football team that played in the Tercera Divisi\u00f3n \u2013 Group 7. It was Real Madrid's second reserve team. They played their home games at La Ciudad del Real Madrid in Valdebebas outside the city of Madrid. At the end of the 2014\u201315 Tercera Division, Real Madrid C was disbanded. Are we justified in saying that \"Real Madrid C was disbanded on May 2015\"? Yes, no, or maybe? Maybe\n###\n\"Aven Romale\" (Come in Gypsies), is a song by the Czech group Gipsy.cz that was the Czech entry at the 2009 Eurovision Song Contest held in Moscow, Russia. It scored zero points at the Eurovision Song Contest semi-final, thereby failing to qualify for the final. Are we justified in saying that \"\"Aven Romale\" was actually born in Poland\"? Yes, no, or maybe?", "doc_id": 291, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24382, 42536, 8761, 32111], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani. Are we justified in saying that \" Dariush Mehrjui is also known for his work in the field of Physics\"? Yes, no, or maybe? Maybe\n###\nWest Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records. Are we justified in saying that \"Suge Knight and Dr. Dre were rivals\"? Yes, no, or maybe? 
Maybe\n###\nJunun is a 2015 album by the Israeli composer Shye Ben Tzur, the English composer and Radiohead guitarist Jonny Greenwood, and the Indian ensemble the Rajasthan Express. It was produced by Greenwood and recorded, mixed, and engineered by Radiohead producer Nigel Godrich. Are we justified in saying that \"The album was nominated for a Grammy.\"? Yes, no, or maybe? Maybe\n###\nPrincess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement. Are we justified in saying that \"Princess Caroline was 1 day old when she died\"? Yes, no, or maybe? No\n###\nThe Volkswagen Citi Golf was a car produced by Volkswagen in South Africa from 1984 until 21 August 2009. It was a face-lifted version of the original Volkswagen Golf Mk1 hatchback, which ceased production in Germany in 1983. The car was produced only with right-hand drive. Are we justified in saying that \"The Citi Golf was occasionally produced with left-hand drive.\"? Yes, no, or maybe?", "doc_id": 859, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39278, 1244, 3656, 24100], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Chingford branch line is a railway line between Clapton Junction (just west of Clapton station) and Chingford station. Services currently operate between Liverpool Street station and Chingford. The branch is currently part of the Lea Valley Lines network. Are we justified in saying that \" Liverpool Street station has issues with the homeless.\"? Yes, no, or maybe? Maybe\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire. Are we justified in saying that \"Peter John Reynolds was born in Scotland.\"? Yes, no, or maybe? No\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration. Are we justified in saying that \"The High Bridge Branch never was near a body of water.\"? Yes, no, or maybe? No\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808 Are we justified in saying that \"The manor house was struck by lightning several times\"? 
Yes, no, or maybe? Maybe\n###\n\"Break the World\" is the lead single by alternative rock band Nine Lashes from their third album, \"From Water to War\". It was released on October 29, 2013 by Tooth & Nail Records. The song was the No. 1 \"Billboard\" Christian Rock song on January 25, 2014 chart. Are we justified in saying that \"Nine Lashes did not debut with \"Break the World\"\"? Yes, no, or maybe?", "doc_id": 583, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9748, 33563, 18412, 25293], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award. Are we justified in saying that \"Simon & Schuster approached David McCullough to ask him to write the book The Path Between the Seas.\"? Yes, no, or maybe? Maybe\n###\nThe Nutty Professor is a 1963 American science fiction-romantic comedy film produced, directed, co-written (with Bill Richmond) and starring Jerry Lewis. The score was composed by Walter Scharf. The film is a parody of Robert Louis Stevenson's \"Dr. Jekyll and Mr. Hyde\". Are we justified in saying that \"It was a made up story\"? Yes, no, or maybe? Maybe\n###\n[id] is the third studio album by deathcore band Veil of Maya. It was released through Sumerian Records on April 6, 2010. They worked with producer Michael Keene of death metal band The Faceless on this album. Keene previously worked with the band, producing their previous album \"The Common Man's Collapse\". It is the band's only album to feature bassist Matthew C. Pantelis. Are we justified in saying that \"Michael Keene had not previously worked with the band\"? Yes, no, or maybe? No\n###\nKulte is a clothing label from Marseille. It was created in 1998 and in 2013 it owns more than 10 shops mainly in France (its first foreign shop opened in Athens in 2011). The brand collaborated with several artists (MGMT, Na\u00efve New Beaters) and music related organizations (including the music festivals, Marsatac and Transmusicales, and record labels, Because Music and Kitsun\u00e9). Are we justified in saying that \"Kulte's first shop outside France was in Greece.\"? Yes, no, or maybe? Yes\n###\nBrennan Hesser (born 1980) is an American television actress, best known for co-starring in Tori Spelling's VH1 sitcom, \"So NoTORIous\". She also starred in Fox's drama, \"Jonny Zero\". She also guest starred in an episode of the CBS television show, \"The Guardian\". As a youngster, she attended the prestigious Interlochen Arts Camp in Northern Michigan. Are we justified in saying that \"Tori Spelling had a sitcom on VH1 called \"So.\"\"? 
Yes, no, or maybe?", "doc_id": 897, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43626, 28930, 29202, 12981], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Albert Ball, VC, DSO & Two Bars, MC (14 August 1896 \u2013 7 May 1917) was an English fighter pilot during the First World War. At the time of his death he was the United Kingdom's leading flying ace, with 44 victories, and remained its fourth-highest scorer behind Edward Mannock, James McCudden, and George McElroy. Are we justified in saying that \"Albert Ball did not partake in the second World War.\"? Yes, no, or maybe? Yes\n###\nDie Antwoord (] , Afrikaans for \"The Answer\") is a South African hip hop group formed in Cape Town in 2008. It comprises rappers Ninja and Yolandi Visser and producer God (formerly DJ Hi-Tek). Their image revolves around the South African counterculture movement known as zef and has incorporated work by other artists associated with the movement, such as photographer Roger Ballen. Are we justified in saying that \"Die Antwoord translates to the answer\"? Yes, no, or maybe? Yes\n###\nNeelix is a character in the science fiction television series \"\", played by actor Ethan Phillips since the series' inception. Neelix is an alien native to the distant far side of the galaxy, who has joined the crew of the United Federation of Planets starship USS \"Voyager\" as cook after its being captured by a mysterious shock wave to the Delta Quadrant. Are we justified in saying that \"Voyager did not want Neelix to join the crew.\"? Yes, no, or maybe? Maybe\n###\nThe 2015 J&T Banka Prague Open was a professional tennis tournaments played on outdoor clay courts. It was the 6th edition of the tournament which was an International tournament on the 2015 WTA Tour. It took place at the Sparta Prague Tennis Club in Prague, Czech Republic, from 27 April to 2 May 2015. This was the event's first edition as a WTA International tournament. Are we justified in saying that \"The first WTA International Tournament started in May of 2015.\"? Yes, no, or maybe? No\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title. Are we justified in saying that \"The 2010 ASB Classic was for men and women\"? Yes, no, or maybe?", "doc_id": 383, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22146, 15255, 5801, 18371], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Final Blow is a compilation album by Fred Wesley and the Horny Horns. The album first released on the P-Vine record label in 1994, then on the Sequel label in the United Kingdom, and the AEM label in the United States. The album is made up of unreleased tracks recorded during band's heyday in the late 1970s. Are we justified in saying that \"The album was released in the century before the current century\"? Yes, no, or maybe? Yes\n###\nUna questione privata is a 1993 Italian film directed by Alberto Negrin with a screenplay based on the WWII partisan novel of the same name by Beppe Fenoglio (1963) adapted by Raffaele La Capria. The film stars the young British actor Rupert Graves as Milton, C\u00e9line Beauvallet, and Claudio Mazzenga. Are we justified in saying that \"Una questione privata was based on a WWI novel that had the same name.\"? Yes, no, or maybe? No\n###\nH\u00e9rcules de Alicante Club de F\u00fatbol, S.A.D. (] ) is a Spanish football team based in Alicante, in the autonomous community of Valencian Community. Founded in 1922, it currently plays in Segunda Divisi\u00f3n B \u2013 Group 3 and plays its home games at the 30,000-capacity Estadio Jos\u00e9 Rico P\u00e9rez. Are we justified in saying that \"Estadio Jos\u00e9 Rico P\u00e9rez has always been a 30,000 capacity stadium. \"? Yes, no, or maybe? Maybe\n###\nBlack Panther (\"H\u0113i B\u00e0o\" \u9ed1\u8c79 ) was a seminal Chinese rock band founded in 1987. It was originally fronted by one of China's alternative music pioneers Dou Wei. The band reunited and released a new album in 2013. The band's best known songs include \"Don't break my heart\", \"Shameful\" \u300a\u65e0\u5730\u81ea\u5bb9\u300b, \"Spirit of Light\" \u300a\u5149\u8292\u4e4b\u795e\u300b\uff0c \"No Right, No Wrong\" \u300a\u65e0\u662f\u65e0\u975e\u300b\uff0c and \"Our generation\" \u300a\u6211\u4eec\u8fd9\u4e00\u4ee3\u300b Are we justified in saying that \"Black Panther was founded in the 20th century \"? Yes, no, or maybe? Yes\n###\nNew Hampshire Route 78 (abbreviated NH 78) is a 3.456 mi secondary state highway in Cheshire County in the southern part of the U.S. state of New Hampshire. A northward extension of Massachusetts Route 78, NH 78 runs entirely within the town of Winchester from the state border to downtown, where it ends at New Hampshire Route 10 and New Hampshire Route 119. Are we justified in saying that \"NH 78 is less than five miles long\"? Yes, no, or maybe?", "doc_id": 329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34686, 26159, 7457, 19399], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "New Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. 
\"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"Campbell aged over the course of the book.\"? Yes, no, or maybe? Yes\n###\n\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\". Are we justified in saying that \"The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1980\"? Yes, no, or maybe? No\n###\nMission: Impossible III \u2013 Music from the Original Motion Picture Soundtrack is a soundtrack album for the 2006 film \"\", composed by Michael Giacchino. Unlike the previous two films in the series, there was no album released containing the film's contemporary music. Are we justified in saying that \"Mission: Impossible III did not have enough money to have an album release\"? Yes, no, or maybe? Maybe\n###\nFrancis Gary Powers (August 17, 1929 \u2013 August 1, 1977) \u2013 often referred to as simply Gary Powers \u2013 was an American pilot whose Central Intelligence Agency (CIA) U-2 spy plane was shot down while flying a reconnaissance mission in Soviet Union airspace, causing the 1960 U-2 incident. Are we justified in saying that \"Francis Gary Powers went by G. Powers.\"? Yes, no, or maybe? No\n###\nThe first season of \"Charmed\", an American supernatural drama television series created by Constance M. Burge, premiered on October 7, 1998 on The WB. Airing on Wednesdays at 9:00 pm, the season consisted of 22 episodes and concluded its airing on May 26, 1999. Paramount Home Entertainment released the complete first season in a six-disc box set on February 1, 2005. Are we justified in saying that \"The airing of the first season of \"Charmed\" lasted 8 months \"? Yes, no, or maybe?", "doc_id": 334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36219, 20776, 30211, 38195], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson. Are we justified in saying that \"Mckay has never ridden a skateboard.\"? Yes, no, or maybe? No\n###\nAlex Rider is a title character and the protagonist of the popular \"Alex Rider\" novel series by British author Anthony Horowitz. 
He has also been featured in three short stories written by Horowitz based in the same canon as the series; \"\", \"\" and \"\". Are we justified in saying that \"Alex Rider is the main character of the \"Alex Rider\" series. \"? Yes, no, or maybe? Yes\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life. Are we justified in saying that \"Health For All has been used by dan.\"? Yes, no, or maybe? Maybe\n###\nA Daughter of the Wolf is a 1919 American silent drama film directed by Irvin Willat and written by Marion Fairfax and Hugh Pendexter. The film stars Lila Lee, Elliott Dexter, Clarence Geldart, Raymond Hatton, Richard Wayne, and Minnie Devereaux. The film was released on June 22, 1919, by Paramount Pictures. Are we justified in saying that \"There were speaking lines in A Daughter of the Wolf.\"? Yes, no, or maybe? No\n###\nCorrina, Corrina is a 1994 American feature film set in 1959 about a widower (Ray Liotta) who hires a housekeeper/nanny (Whoopi Goldberg) to care for his daughter (Tina Majorino). It was written and directed by Jessie Nelson, in her feature film directing debut. It was the final film in which Don Ameche starred; he died shortly after filming was completed. Are we justified in saying that \"Corrina, Corrina was released in an even-numbered year.\"? Yes, no, or maybe?", "doc_id": 782, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25131, 358, 36074, 540], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961. Are we justified in saying that \"The United States suffered minimally after relations with Cuba were severed.\"? Yes, no, or maybe? Maybe\n###\nSpy Corps is a spy film for Christian families that was written and directed by J David Baker. It stars Sarah Beth Hill as a fearful high school teenager, and Adam Hale as a secret member of the Reserve Spy Training Corps, a training program for high school students who want to pursue a career as a spy. Are we justified in saying that \"Reserve Spy Training Corps is a training program for Christian high school students.\"? Yes, no, or maybe? Maybe\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015. Are we justified in saying that \"Sebo lives in Venice, Los Angeles.\"? Yes, no, or maybe? 
Yes\n###\nThe Mini Hatch, stylized as MINI hatch or MINI Hardtop in the US, also known as Mini Cooper or Mini One or simply the Mini, is a three-door hatchback first introduced in late 2000, with a second generation launched in 2006 and a third generation model launched in 2014. A convertible version was introduced in 2004, with the second generation following in 2008. Are we justified in saying that \"The second generation Mini was release twice.\"? Yes, no, or maybe? Yes\n###\nWriting Degree Zero (French: \"Le degr\u00e9 z\u00e9ro de l'\u00e9criture\" ) is a book of literary criticism by Roland Barthes. First published in 1953, it was Barthes' first full-length book and was intended, as Barthes writes in the introduction, as \"no more than an Introduction to what a History of Writing might be.\" Are we justified in saying that \"Le degr\u00e9 z\u00e9ro de l'\u00e9criture is french for writing degree zero.\"? Yes, no, or maybe?", "doc_id": 538, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24189, 45060, 18488, 9249], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Petasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Petasites have another nickname of Kinderbars.\"? Yes, no, or maybe? Maybe\n###\nSpring Breakdown is a comedy film starring Amy Poehler, Parker Posey, and Rachel Dratch. Three years after principal photography, and after the film's owner, Warner Independent Pictures, was shut down by its parent company, it was released direct-to-video in 2009. Are we justified in saying that \" Spring Breakdown is a comedy film starring Amy Poehler, Parker Posey, and Rachel Dratch released direct-to-video before two thousand nine.\"? Yes, no, or maybe? No\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"LYU offers degrees in over 5 major disciplines\"? Yes, no, or maybe? Yes\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. Are we justified in saying that \"Keystone is an unincorporated community and with 59 people on their census in central Keith County, Nebraska, United States.\"? Yes, no, or maybe? 
Maybe\n###\nBrandon Hughes (born September 25, 1980), better known by his stage name 6 Tre G is an American hip hop recording artist, record producer, and CEO from Fayette, Alabama. He is also the founder and CEO of Mazerati Records. 6 Tre G has released many studio albums Don Mazerati, Boss Muzik, El Trapo and many more. Are we justified in saying that \"All of 6 Tre G's albums were released after 1980.\"? Yes, no, or maybe?", "doc_id": 49, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27353, 8780, 16041, 22634], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park. Are we justified in saying that \"Newnes railway line is in London.\"? Yes, no, or maybe? No\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. Are we justified in saying that \"Keystone is incorporated.\"? Yes, no, or maybe? No\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"The cultural evolution of Pakistan is portrayed in the book End of the Past.\"? Yes, no, or maybe? Yes\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"Allen S. Weiner was a co-director\"? Yes, no, or maybe? Yes\n###\nSulejman Vokshi (1815 - 1890) was an Albanian military commander and leader of the League of Prizren. A member of the central committee of the league as head of the finances commission, Vokshi also was an important leader of the organization's military branch and an officer of its military staff. Are we justified in saying that \"Sulejman lived three quarters of a century before dying \"? Yes, no, or maybe?", "doc_id": 901, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14787, 33021, 7575, 45132], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Versailles is a television series, set during the construction of the Palace of Versailles during the reign of Louis XIV, that premiered on 16 November 2015 on Canal+ in France and on Super Channel in Canada, in May 2016 on BBC2 in Britain, and on 1 October 2016 on Ovation in the U.S. Are we justified in saying that \"Versailles was filmed in Canada.\"? Yes, no, or maybe? Maybe\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. He is currently the quarterback coach at Garinger High School in Charlotte, NC. Are we justified in saying that \"Tory Woodbury will become the next coach of the New England Patriots \"? Yes, no, or maybe? Maybe\n###\nThe Portezuelo Formation is a geologic formation outcropping in the Mendoza, R\u00edo Negro and Neuqu\u00e9n provinces of Argentina. It is the fourth-oldest formation in the Neuqu\u00e9n Group and the older of the two formations in the R\u00edo Neuqu\u00e9n Subgroup. Formerly, that subgroup was treated as a formation, and the Portezuelo Formation was known as the Portezuelo Member. Are we justified in saying that \"Portezuelo Formation was not discovered by Argentinians.\"? Yes, no, or maybe? Maybe\n###\nThe Rhodesian ridgeback is a dog breed developed in South Africa. Its European forebears can be traced to the early pioneers of the Cape Colony of southern Africa, who crossed their dogs with the semi-domesticated, ridged hunting dogs of the Khoikhoi. Are we justified in saying that \"European Ridgebacks were developed in South Africa\"? Yes, no, or maybe? No\n###\nMiriam Auhea Kalani Kui Kawakiu o Kek\u0101uluohi Keali\u02bbiuhiwaihanau o Kalani Makahonua Ahilapalapa Kai Wikapu o Kaleilei a Kalakua also known as Ka\u02bb ahumanu III (July 27, 1794 \u2013 June 7, 1845), was Kuhina Nui of the Kingdom of Hawaii, a queen consort of both King Kamehameha I and Kamehameha II, and mother of another king. Are we justified in saying that \"Miriam Auhea Kalani Kui Kawakiu o Kek\u0101uluohi Keali\u02bbiuhiwaihanau o Kalani Makahonua Ahilapalapa Kai Wikapu o Kaleilei a Kalakua's name was likely more than a dozen syllables\"? Yes, no, or maybe?", "doc_id": 703, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37596, 32474, 12695, 33886], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Circus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. 
The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Circus Palestine wan and Israeli Film Award in 2005\"? Yes, no, or maybe? Maybe\n###\nThe American Canadian Tour (ACT) is a late model stock car racing series based in the northeastern United States, and Quebec, Canada. The American Canadian Tour has two different late model divisions: the ACT Late Model Tour, founded in 1986 conducts races in New England and New York, and the La S\u00e9rie ACT (formerly known as S\u00e9rie ACT Castrol Tour), founded in 2005 runs in Quebec, Canada. Are we justified in saying that \"The ACT has events in New York City.\"? Yes, no, or maybe? Maybe\n###\nVladislav Adolfovitch Rusanov (Russian: \u0412\u043b\u0430\u0434\u0438\u0441\u043b\u0430\u0432 \u0410\u0434\u043e\u043b\u044c\u0444\u043e\u0432\u0438\u0447 \u0420\u0443\u0441\u0430\u043d\u043e\u0432 ) is a fantasy writer, candidate of technical sciences (1999). Writes in Russian language. Also is known for translations of fantasy and romantic poetry into Russian. Formerly a Ukrainian citizen he now identifies with the Donetsk People's Republic. Are we justified in saying that \"Vladislav Adolfovitch Rusanov began writing fantasy stories while in high school\"? Yes, no, or maybe? Maybe\n###\nThe Coca-Cola Bottling Company of Cape Cod is a former bottler of Coca-Cola, Dr Pepper and Canada Dry soft drinks located in Sandwich, Massachusetts, United States. The company was bought out in 2000 by the Coca-Cola Bottling Company of Northern New England. Are we justified in saying that \"The Coca-Cola Bottling Company of Cape Cod was bought out before 2000 by the Coca-Cola Bottling Company of Northern New England.\"? Yes, no, or maybe? No\n###\nJaeden Wesley Lieberher (born January 4, 2003) is an American actor. He is known for starring as Bill Denbrough in the horror film \"It\" (2017), and for his leading roles in the films \"St. Vincent\", as Oliver Bronstein, \"Midnight Special\", as Alton Meyer, \"The Confirmation\", as Anthony, \"The Book of Henry\", as Henry Carpenter. Are we justified in saying that \"Henry Carpenter was a character in Midnight Special\"? Yes, no, or maybe?", "doc_id": 416, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2697, 16080, 19448, 33972], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harry Spencer Davis (born 24 September 1991) is an English professional footballer, who plays as a defender for Scottish Championship side St Mirren. Davis previously played with Crewe Alexandra. Early in his career, he was loaned by Crewe to Nantwich Town, Stafford Rangers and Curzon Ashton. Are we justified in saying that \"Harry Spencer Davis has been loaned to at least three other teams by Crewe Alexandra.\"? Yes, no, or maybe? Yes\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. 
It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. Gene Mayer won the singles title. Are we justified in saying that \"The 1982 Bavarian Tennis Championships was held in Berlin, Germany\"? Yes, no, or maybe? No\n###\nHearts of Stone is the fifth studio album by American rock band Stoneground, released in 1978 on Warner Bros. Produced by Bob Gaudio, it marked Stoneground's return to a major label, having released their previous album, \"Flat Out\" (1976), on their own label. \"Prove It\" was released as the first single from \"Hearts of Stone\". Are we justified in saying that \"Prove It was released as the first single from Hearts of Stone.\"? Yes, no, or maybe? Yes\n###\nThe Program in Creative Writing, more commonly known as the Iowa Writers' Workshop, at the University of Iowa in Iowa City, Iowa, is a much-celebrated graduate-level creative writing program in the United States. Writer Lan Samantha Chang is its director. Graduates earn a Master of Fine Arts (MFA) degree in Creative Writing. Are we justified in saying that \"The Program in Creative Writing is not located at Iowa State University.\"? Yes, no, or maybe? Yes\n###\nTanya McQueen is an American reality television personality and interior designer on TV's . She made her debut on \"Extreme Makeover\" in an October 2005 episode titled, \"The Teas Family\". On August 2, 2011, McQueen and fellow Extreme Makeover personality Tracy Hutson debuted the show \"Picker Sisters\" on Lifetime. Are we justified in saying that \"Tanya McQueen had no friends\"? Yes, no, or maybe?", "doc_id": 969, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19378, 35138, 44061, 40031], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Louis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels. Are we justified in saying that \"Louis Glenn Marson played for the Cleveland Indians.\"? Yes, no, or maybe? Yes\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017. Are we justified in saying that \"ATO records is the best label for their type of music\"? Yes, no, or maybe? Maybe\n###\nIn tabletop role-playing games, the character race represents the people to which a player character (PC) or a non-player character (NPC) belongs. \"People\" is to be taken in the broader sense, and may encompass ethnic groups, species, nationality or social groups. Are we justified in saying that \"People is discontinued due to its criticism\"? Yes, no, or maybe? 
Maybe\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall. Are we justified in saying that \"The fortress also function as a dining hall.\"? Yes, no, or maybe? Maybe\n###\nOlga Bay larch or Olgan larch (\"Larix olgensis\"), a species of larch, is named after Olga Bay in the Sea of Japan. The common name in Chinese is \u9ec4\u82b1\u843d\u53f6\u677e (pinyin: huang hua luo ye song). This species occurs in Central Sikhote-Alin, and rarely occurs in North Korea, and Jilin and eastern Heilongjiang provinces of China, between 500 and 1100 metres in elevation. Are we justified in saying that \"Olga Bay larch is the same thing as huang hua luo ye song\"? Yes, no, or maybe?", "doc_id": 252, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4710, 23847, 43369, 10039], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lemoyne is an unincorporated community and census-designated place in northern Keith County, Nebraska, United States. It lies along Nebraska Highway 92 on the northern shore of Lake C.W. McConaughy, north of the city of Ogallala, the county seat of Keith County. Its elevation is 3,333\u00a0feet (1,016\u00a0m). Although Lemoyne is unincorporated, it has a post office, with the ZIP code of 69146. Are we justified in saying that \"Not all communities in Keith County are unincorporated. \"? Yes, no, or maybe? Maybe\n###\nFS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani. Are we justified in saying that \"FS Kozani has lost most of it's games.\"? Yes, no, or maybe? Maybe\n###\nPillars of Eternity: The White March is a two-part expansion pack for the 2015 role-playing video game \"Pillars of Eternity\", developed by Obsidian Entertainment and published by Paradox Interactive. The first part was released on August 25, 2015, while the second was released on February 16, 2016. Are we justified in saying that \"In Pillars of Eternity players pretend to be someone they're not.\"? Yes, no, or maybe? Yes\n###\nG\u00f6tz Freiherr von Houwald (May 13, 1913 \u2013 August 16, 2001) was a German diplomat, historian and ethnographer. He was born in Posen and died in Bonn. His full name was Maximilian Otto Gustav Albrecht Hubert Wilhelm G\u00f6tz-Dieter Freiherr von Houwald.G\u00f6tz-Dieter von Houwald's parents were Albrecht Freiherr von Houwald und Helene Gr\u00e4fin von Carmer. 
Are we justified in saying that \"Gotz Freiherr von Houwald died on 7/16/2001\"? Yes, no, or maybe? No\n###\nNick Davis is a visual effects supervisor who has worked in visual effects since the early 1990s. He was nominated at the 81st Academy Awards for \"The Dark Knight\". He was nominated in the category of Best Visual Effects, he shared his nomination with Chris Corbould, Paul Franklin and Tim Webber. Are we justified in saying that \"Nick Davis began his career in visual effects during the final decade of the 20th century\"? Yes, no, or maybe?", "doc_id": 202, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2909, 13371, 20586, 7124], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amara Karan (born 1984) is a Sri Lankan-English actress who made her film d\u00e9but as the love interest in Wes Anderson's \"The Darjeeling Limited\". The film premi\u00e8red at the 2007 Venice Film Festival. Karan's second film role was as schoolgirl Peaches in the 2007 film \"St Trinian's\". Are we justified in saying that \"Karan played many schoolgirls in films\"? Yes, no, or maybe? Maybe\n###\nFan and Mortar Geysers are two geysers in the Upper Geyser Basin in Yellowstone National Park. For the past several decades, they have erupted in concert with one another and are generally talked about together. The records detailing these geysers' known eruptive history shows that they have been infrequent and irregular performers. Are we justified in saying that \"Yellowstone National Park is the largest national park\"? Yes, no, or maybe? Maybe\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry. Are we justified in saying that \"Chess Records was working with Chuck Berry 63 years ago.\"? Yes, no, or maybe? Yes\n###\nThe Real Howard Spitz is a 1998 family comedy film directed by Vadim Jean, produced by Paul Brooks and written by Jurgen Wolff. Starring Kelsey Grammer, Amanda Donohoe and Genevieve Tessier, it is a Canadian and U.K co-production. A failed detective writer, Howard Spitz has hit rock bottom until an 8-year-old girl helps him write children's books. Are we justified in saying that \"Paul Brooks produced a film starring Kelsey Grammer about a detective who recruits the help of a young girl to write children's books\"? Yes, no, or maybe? Yes\n###\nThe Kur\u0161ininkai (Curonians; German: \"Kuren\" ; Lithuanian: \"kur\u0161ininkai, kur\u0161iai\" ; Latvian: \"kursenieki, kur\u0161i\" ; Polish: \"kuronowie pruscy\" ) are a nearly extinct Baltic ethnic group living along the Curonian Spit. \"Kur\u0161ininkai\" refers only to inhabitants of Lithuania and former East Prussia that speak a dialect of Latvian. Are we justified in saying that \"The Curonians were linguistically influenced by the Latvians.\"? 
Yes, no, or maybe?", "doc_id": 680, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20051, 30929, 15405, 38418], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th. Are we justified in saying that \"Irina Cherniaeva was born in Moscow.\"? Yes, no, or maybe? No\n###\nStoked (stylized as \"St\u014dked\") is a Canadian animated series produced by Fresh TV that premiered on Teletoon on June 25, 2009 and ended on January 26, 2013. It formerly aired on Teletoon in Canada and ABC3 in Australia, and on Cartoon Network in the United States. The series is from the same creators as \"6teen\" and the \"Total Drama\" series. Are we justified in saying that \"Stoked aired for less than exactly 4 years. \"? Yes, no, or maybe? Yes\n###\nRyman Auditorium (formerly Grand Ole Opry House and Union Gospel Tabernacle) is a 2,362-seat live performance venue, located at 116 5th Avenue North, in Nashville, Tennessee and is best known as the home of the \"Grand Ole Opry\" from 1943 to 1974. It is owned and operated by Ryman Hospitality Properties, Inc. Are we justified in saying that \"Ryman Auditorium is the largest in Nashville, Tennessee.\"? Yes, no, or maybe? Maybe\n###\nLausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic. Are we justified in saying that \"Lausche is not shaped like a cone.\"? Yes, no, or maybe? No\n###\n\"Drivin' Around Song\" is a song recorded by American country rap singer Colt Ford and country music singer Jason Aldean. It is the third single from his fourth studio album, \"Declaration of Independence\". The song was written by Chris Tompkins and Craig Wiseman. Are we justified in saying that \"Colt Ford was born in nineteen hundred fifty seven.\"? Yes, no, or maybe?", "doc_id": 902, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9018, 33751, 33736, 32233], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. 
that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Stillwater Cove Regional Park is situated by water.\"? Yes, no, or maybe? Yes\n###\nThe Cit\u00e9 du Cin\u00e9ma is a film studio complex supported by the film director and producer Luc Besson, located in Saint-Denis, north of Paris. The studio complex is intended to be a competitor of Cinecitt\u00e0 in Rome, Pinewood in London and Babelsberg in Berlin. It was inaugurated on 21 September 2012. Are we justified in saying that \"The Cite du Cinema is in France.\"? Yes, no, or maybe? Yes\n###\n\"You'll Be Back\" is the seventh song from Act 1 of the musical \"Hamilton\", based on the life of Alexander Hamilton, which premiered on Broadway in 2015. Lin-Manuel Miranda wrote both the music and lyrics to the song. It is sung by Jonathan Groff in the show's original cast recording. Are we justified in saying that \"Jonathan Groff wrote the music and lyrics to the song.\"? Yes, no, or maybe? No\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts. Are we justified in saying that \"Dave Lee Murphy released a song that made it to the top 10 on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996.\"? Yes, no, or maybe? Yes\n###\nCoptosapelteae is a tribe incertae sedis of flowering plants in the Rubiaceae family and contains about 55 species in 2 genera. Its representatives are found in tropical and subtropical Asia. This tribe has not been placed within as subfamily of Rubiaceae, but is sister to the rest of the family. Are we justified in saying that \"I am the tribe of Coptosapelteae \"? Yes, no, or maybe?", "doc_id": 813, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17689, 24953, 13048, 1706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas. Are we justified in saying that \"The marchers usually wear bowler hats.\"? Yes, no, or maybe? Yes\n###\nCarol Hernandez is an American journalist from Miami Florida. She won a 1996 Goldsmith Prize for Investigative Reporting. She won the 1996 Pulitzer Prize for National Reporting. 
She currently resides in Long Island with her husband, and three children, (the oldest being the best and most funny and creative). Are we justified in saying that \"Carol Hernandez has the same last name as her husband.\"? Yes, no, or maybe? Maybe\n###\nWireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues. Are we justified in saying that \"Ethereal ran into legal issues with its trademark so it was renamed.\"? Yes, no, or maybe? Yes\n###\nThe Death and Life of John F. Donovan is an upcoming Canadian drama film, co-written, co-produced and directed by Xavier Dolan in his English-language debut. It stars Kit Harington, Natalie Portman, Jessica Chastain, Susan Sarandon, Kathy Bates, Jacob Tremblay, Ben Schnetzer, Thandie Newton, Amara Karan, Chris Zylka, Jared Keeso, Emily Hampshire and Michael Gambon. Are we justified in saying that \"The Death and Life of John F. Donovan is Kit Harington's English-language debut\"? Yes, no, or maybe? Maybe\n###\nCorrina, Corrina is a 1994 American feature film set in 1959 about a widower (Ray Liotta) who hires a housekeeper/nanny (Whoopi Goldberg) to care for his daughter (Tina Majorino). It was written and directed by Jessie Nelson, in her feature film directing debut. It was the final film in which Don Ameche starred; he died shortly after filming was completed. Are we justified in saying that \"Corrina, Corrina was based on a happy event.\"? Yes, no, or maybe?", "doc_id": 591, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34093, 17050, 27903, 37500], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Lyra\" is a song written, produced, and performed by British recording artist Kate Bush, from the 2007 soundtrack album \"The Golden Compass\" from the film of the same name. It is used in the closing credits of the film. Bush was commissioned to write the song, with the request that it make reference to the lead character, Lyra Belacqua. Are we justified in saying that \"Kate bush writes songs in the early 2000's\"? Yes, no, or maybe? Yes\n###\nThe Alfa Romeo Brera and the Alfa Romeo Spider (Type 939) are two sports cars manufactured by Alfa Romeo respectively between 2005-2010 and 2006-2010. The Brera is a 2+2 coup\u00e9, while the Spider is its roadster version. Both models were built by Pininfarina. Are we justified in saying that \"Pininfarina has only ever built one sports car model for Alfa Romeo \"? Yes, no, or maybe? No\n###\nShabbona Township is one of nineteen townships in DeKalb County, Illinois, USA. As of the 2010 census, its population was 1,453 and it contained 603 housing units. The township contains the Chief Shabbona Forest Preserve and Shabbona Lake State Park. Are we justified in saying that \"In 2010, there was at least 600 housing units. \"? Yes, no, or maybe? 
Yes\n###\nWanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001. Are we justified in saying that \"Wanker Records is a fair record label \"? Yes, no, or maybe? Maybe\n###\nNational Highway 26 (NH 26), (previously National Highway 43), is a National Highway in India, that connects Raipur in Chhattisgarh and passes through Odisha to connect with Natavalasa in Vizianagaram district of Andhra Pradesh. It connects National Highway 5 and National Highway 6 and transverses the Eastern Ghats. Are we justified in saying that \"National Highway 5 and 6 traverse the Eastern Ghats.\"? Yes, no, or maybe?", "doc_id": 508, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24736, 19573, 12506, 5442], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Two Men And A Truck is a franchised moving company, headquartered in Lansing, Michigan, with franchises in 41 U.S. states, as well as the United Kingdom, Canada, and Ireland. The company is the largest franchised moving company in the United States with more than 410 locations. Are we justified in saying that \"British customers use Two Men And A Truck.\"? Yes, no, or maybe? Yes\n###\nThe National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy. Are we justified in saying that \"The National Rehabilitation Hospital is a very bad hospital\"? Yes, no, or maybe? Maybe\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing. Are we justified in saying that \"Ingham County is Michigan's most sparsely populated county\"? Yes, no, or maybe? Maybe\n###\nBronwen (] ) is a Welsh feminine given name. It is closely associated with the similar name \"Branwen\", which appears in medieval Welsh literature. Used in Wales since the 19th century, it was introduced to the English-speaking public at large by a character in the Richard Llewellyn novel \"How Green Was My Valley\" (1939). Are we justified in saying that \"The name has seen a decline in use since the 70's.\"? Yes, no, or maybe? Maybe\n###\nMark Donovan (born 12 October 1968) is a Welsh character actor best known for his roles in productions such as \"Shaun of the Dead\", \"Black Books\", \"In Bruges\", and \"Murder Investigation Team\". He also played a brief scene of Hamlet in an episode of the David Renwick comedy-drama, \"Love Soup\". 
His stage roles include Gozark in \"Singin' in the Rain\" and Inspector Clay in \"Plan 9 from Outer Space\". Are we justified in saying that \"Mark Donovan was born on an even day\"? Yes, no, or maybe?", "doc_id": 473, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10463, 25801, 26795, 27039], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Live from the Gaiety is a live album by The Dubliners. It was recorded during the Irish leg of their tour celebrating forty years on the road. The double album was recorded at the Gaiety Theatre in Dublin in June 2002. All surviving members took part. A companion double DVD of the concert in its entirety was also released. Are we justified in saying that \"It was recorded during the tour celebrating four decades on the road.\"? Yes, no, or maybe? Yes\n###\nMurder Rock (Italian: Murderock - uccide a passo di danza; also known as Murder-Rock: Dancing Death, Slashdance and The Demon Is Loose!) is a 1984 Italian giallo film starring Olga Karlatos and Ray Lovelock, and written and directed by Lucio Fulci. Fulci recalled the producer forced him to turn the film into a musical with the music of Keith Emerson due to the success of \"Flashdance\". Are we justified in saying that \"Murder Rock was directed by Donald Trump\"? Yes, no, or maybe? No\n###\nThe 1980 British Grand Prix (formally the XXXIII Marlboro British Grand Prix) was a Formula One motor race held at Brands Hatch on 13 July 1980. It was the eighth round of the 1980 Formula One season. The race was held over 76 laps of the 4.207-km (2.614-mile) circuit for a total race distance of 319.73 km (198.67 miles). Are we justified in saying that \"The first British Grand Prix was seen all over the world.\"? Yes, no, or maybe? Maybe\n###\nThe Hamas-Jund Ansar Allah clash was a battle, fought between the police forces of the Islamist group Hamas controlling Gaza, and the radical Islamist group Jund Ansar Allah. The fighting began on 14 August 2009 and concluded the next day. In total, 24 people were killed in the fighting, including six Hamas police officers and an 11-year-old girl, and a further 150 were wounded. Are we justified in saying that \"The battle was fought with incendiaries.\"? Yes, no, or maybe? Maybe\n###\nThe 1941 Cabo San Lucas hurricane is considered one of the worst tropical cyclones on record to affect Cabo San Lucas. The hurricane was first reported on September\u00a08 off the coast of Mexico. It slowly moved northwestward while intensifying. After peaking in intensity, it entered the Gulf of California, and weakened rapidly. It dissipated on September\u00a013. Are we justified in saying that \"The 1941 Cabo San Lucas hurricane was not a weather formation that one would consider taking precautions with\"? Yes, no, or maybe?", "doc_id": 846, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23509, 25288, 23634, 29875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\". Are we justified in saying that \"Maps members are based in two bordering states.\"? Yes, no, or maybe? Yes\n###\nKulte is a clothing label from Marseille. It was created in 1998 and in 2013 it owns more than 10 shops mainly in France (its first foreign shop opened in Athens in 2011). The brand collaborated with several artists (MGMT, Na\u00efve New Beaters) and music related organizations (including the music festivals, Marsatac and Transmusicales, and record labels, Because Music and Kitsun\u00e9). Are we justified in saying that \"MGMT and Marsatac are commercial partners of Kulte.\"? Yes, no, or maybe? Yes\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Tillya tepe contains a lot of ivory.\"? Yes, no, or maybe? Maybe\n###\nWilliam V. Bidwill Sr. (born July 31, 1931) is the principal owner and chairman of the board of the Arizona Cardinals of the National Football League (NFL). He was co-owner from 1962 for ten seasons with his brother Charles Jr. and has been sole owner since 1972. Are we justified in saying that \"The Arizona Cardinals existed in 1962.\"? Yes, no, or maybe? Yes\n###\nWinnie the Pooh and a Day for Eeyore is a 1983 Disney Winnie the Pooh animated featurette, based on two chapters from the books \"Winnie-the-Pooh\" and \"The House at Pooh Corner\", originally released theatrically on March 25, 1983, with the 1983 re-issue of \"The Sword in the Stone\". It is the fourth and final of Disney's original theatrical featurettes adapted from the Pooh books by A. A. Milne. Are we justified in saying that \"A. A. Milne was successful\"? Yes, no, or maybe?", "doc_id": 582, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2658, 19665, 31282, 22719], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cyrinda Foxe (born Kathleen Victoria Hetzekian; February 22, 1952 \u2013 September 7, 2002) was an American actress, model and publicist, best known for her role in \"Andy Warhol's Bad\" (1977). 
She was married to both David Johansen of the proto-punk band New York Dolls and Steven Tyler of the hard rock band Aerosmith. She is the mother of Mia Tyler. Are we justified in saying that \"Mia Tyler was in Andy Warhol's Bad\"? Yes, no, or maybe? No\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign. Are we justified in saying that \"The brigade did not fight in Asia.\"? Yes, no, or maybe? No\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons. Are we justified in saying that \"The Grand Prix des Fronti\u00e8res was a motor race held caused much stress to Jules Buisseret\"? Yes, no, or maybe? Maybe\n###\nGwinnett County Public Schools is a school district operating in Gwinnett County, Georgia, United States. GCPS is the largest school system in Georgia, with 139 schools and an estimated enrollment of 178,000 students for the 2016-2017 year. GCPS is estimated to be the 14th largest school district in the U.S. The district has its headquarters in an unincorporated area near Suwanee. Are we justified in saying that \"Gwinnett County Public Schools has teachers.\"? Yes, no, or maybe? Yes\n###\nCarl Filip Anton Forsberg (] ; born 13 August 1994) is a Swedish professional ice hockey player. He is an alternate captain for the Nashville Predators of the National Hockey League (NHL). Forsberg was selected by the Washington Capitals in the first round (11th overall) of the 2012 NHL Entry Draft. Are we justified in saying that \"Carl Filip Anton Forsberg was born more than 3 hours ago.\"? Yes, no, or maybe?", "doc_id": 205, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43807, 21591, 6428, 36886], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Player\" is a song recorded by American singer Tinashe. It features guest vocals by American singer Chris Brown. The song was released by RCA Records as the intended first single off her second album on October 2, 2015, but was later scrapped. \"Player\" was written by Tinashe, Myron Birdsong, Brown, its producers Lulou and Alexander Kronlund, and Chloe Angelides. Are we justified in saying that \"The song had six writers.\"? Yes, no, or maybe? Yes\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"People pick black-eyed susans\"? Yes, no, or maybe? 
Maybe\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg. Are we justified in saying that \"Trevor wanted to call it the Currano Diagram\"? Yes, no, or maybe? Maybe\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain. Are we justified in saying that \"The Santa Cova Funicular is a expensive railway\"? Yes, no, or maybe? Maybe\n###\nEuroprop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas. Are we justified in saying that \"Europrop International is known by his first aircraft. \"? Yes, no, or maybe?", "doc_id": 851, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36899, 774, 10264, 4488], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lee Scott Wolosky (born July 17, 1968) is the former U.S. Special Envoy for Guantanamo Closure. He served under the last three U.S. Presidents in significant national security positions, and was on leave as a Partner at Boies, Schiller & Flexner LLP. On July 14 2016, President Obama accorded Wolosky the personal rank of Ambassador. Are we justified in saying that \"Wolosky was in the Air Force.\"? Yes, no, or maybe? Maybe\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced. Are we justified in saying that \"Splice is a 2008 Canadian-\"? Yes, no, or maybe? No\n###\n\"Se Telefonando\" is a song performed by the Italian singer Mina, released in May 1966. The music was composed, orchestrated and conducted by Ennio Morricone to Italian lyrics by Di Chiara and Costanzo. (Reportedly Costanzo only contributed one word, in editing a previous version of a verse, to avoid censorship). The song was written for a radio broadcast, called \u201cAria condizionata\u201d. Are we justified in saying that \"Mina passed away a few years ago.\"? Yes, no, or maybe? Maybe\n###\nWalkin' is the debut mini-album by South Korean singer Suran. It was released on June 2, 2017, by Million Market and distribuited by LOEN Entertainment. 
It consists of five songs, including \"Wine\" featuring rapper Changmo, previously released as a digital single, and the title track \"1+1=0\" featuring singer Dean. Are we justified in saying that \"Wine was one of five featured songs on the album from Suran.\"? Yes, no, or maybe? Yes\n###\nShehzad Sheikh or Shahzad Sheikh is a Pakistani film and television actor and model, known for playing the lead role in the 2015 film \"Karachi Se Lahore\". He also starred in the series \"Annie Ki Ayegi Baraat\", \"Mi Raqsam\", and \"Mere Hamrahi\", and a TV film \"Main Kukkoo Aur woh\". He is the son of well-known actor Javed Sheikh. Are we justified in saying that \"Shehzad Sheikh and Javed Sheikh have both acted in the same film.\"? Yes, no, or maybe?", "doc_id": 320, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36374, 34403, 15581, 2210], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Child Whispers (published in 1922) is the first published work of the English children's author Enid Blyton, illustrated by her childhood friend and collaborator Phyllis Chase. It is a collection of 28 poems, and one of Blyton's most popular and best-known poetry books. Are we justified in saying that \"Child Whispers was published in 1927\"? Yes, no, or maybe? No\n###\nThe 1919 PGA Championship was the second PGA Championship, which is now considered one of golf's major championships. It was held September 16\u201320 at the Engineers Country Club in Roslyn Harbor, New York, east of New York City on Long Island in Nassau County. Are we justified in saying that \"Golf's major championship was held in Nassau county\"? Yes, no, or maybe? Yes\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation. Are we justified in saying that \"Paul Hausser played a huge role on the mass killing of jews.\"? Yes, no, or maybe? Maybe\n###\nDenis Villeneuve (] ; born October 3, 1967) is a French Canadian film director and writer. He is a four-time recipient of the Canadian Screen Award (formerly Genie Award) for Best Direction, for \"Maelstr\u00f6m\" in 2001, \"Polytechnique\" in 2010, \"Incendies\" in 2011, and \"Enemy\" in 2013. The first three films also won the Academy of Canadian Cinema and Television Award for Best Motion Picture. Are we justified in saying that \"Denis Villeneuve was born in Montreal.\"? Yes, no, or maybe? Maybe\n###\nA meat analogue, also called a meat alternative, meat substitute, mock meat, faux meat, imitation meat, or (where applicable) vegetarian meat or vegan meat, approximates certain aesthetic qualities (primarily texture, flavor and appearance) and/or chemical characteristics of specific types of meat. Many analogues are soy-based (see: tofu, tempeh) or gluten-based. Are we justified in saying that \"Meat analogues do not contain meat.\"? 
Yes, no, or maybe?", "doc_id": 528, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22069, 31219, 20396, 2562], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Penthouse is a 1933 American Pre-Code crime film starring Warner Baxter as a lawyer and Myrna Loy, as a call girl who helps him with a murder case. It was directed by W. S. Van Dyke and written by Frances Goodrich and Albert Hackett, based on a novel by Arthur Somers Roche. The film was later remade as the more sanitized \"Society Lawyer\" (1939), without the risqu\u00e9 pre-Code dialogue. Are we justified in saying that \"Penthouse is a complex film.\"? Yes, no, or maybe? Maybe\n###\nThe Cameroon Airlines Corporation, trading as Camair-Co, is an airline from Cameroon, serving as flag carrier of the country, a role which was previously filled by the now-defunct Cameroon Airlines. Camair-Co has its headquarters in the Immeuble La Rotonde in Douala, and operates out of Douala International Airport. Are we justified in saying that \"Camair-Co has a big headquarters.\"? Yes, no, or maybe? Maybe\n###\nLittle Fluffy Gigolo Pelu (Japanese: \u30d5\u30a1\u30f3\u30b7\u30fc\u30b8\u30b4\u30ed \u30da\u30eb , Hepburn: Fansh\u012b Jigoro Peru , a.k.a. \"Fancy Gigolo Pelu\") is a three \"tank\u014dbon\" manga series written and illustrated by Junko Mizuno and published by Enterbrain. The series has been licensed in North America and France where the first volume received mostly positive reviews. Are we justified in saying that \"The Series is available in at least 2 countries\"? Yes, no, or maybe? Yes\n###\nSugar & Spice is a 2001 American teen crime comedy film directed by Francine McDougall, and starring Marley Shelton, Marla Sokoloff, Mena Suvari, James Marsden, and Melissa George. The plot follows a group of high school cheerleaders who conspire and commit armed robbery when one of them becomes pregnant and desperate for income. Are we justified in saying that \"The desperation of a group of cheerleaders to support their pregnant friend, starring Marla Sokoloff, is what lead to the movie plot in Sugar & Spice\"? Yes, no, or maybe? Maybe\n###\nAsbury First United Methodist Church is located on East Avenue in Rochester, New York, United States. It traces its heritage to several Rochester congregations dating back to the 1820s. In its current form, it is the result of a 1934 merger of First Church and Asbury Methodist Episcopal Church. With a congregation of 2,300 people, it is the largest United Methodist church in the Rochester area. Are we justified in saying that \"Asbury Methodist Episcopal Church will always be the largest church in the Rochester area.\"? Yes, no, or maybe?", "doc_id": 437, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25983, 25129, 16092, 10317], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Ellens dritter Gesang \" (\"Ellens Gesang III \", D. 839, Op. 52, No. 6, 1825), in English: \"Ellen's Third Song\", was composed by Franz Schubert in 1825 as part of his Opus 52, a setting of seven songs from Walter Scott's popular epic poem \"The Lady of the Lake\", loosely translated into German. Are we justified in saying that \"Franz Schuber really liked The Lady of the Lake\"? Yes, no, or maybe? Maybe\n###\nAna B\u00e1rbara is the debut album by Mexican singer Ana B\u00e1rbara, released in 1994. She was nominated for a Premio Lo Nuestro Award in two Regional Mexican categories, including Best New Artist. She won her first Premio Furia Musical Award for Best New Artist. Are we justified in saying that \"Ana B\u00e1rbara sings in Spanish exclusively\"? Yes, no, or maybe? Maybe\n###\nCloverdale Depot is a bus station and future intermodal station in Cloverdale, California. It is served by Amtrak Thruway and Sonoma County Transit buses. Additional service to Sonoma County Airport station is provided by Sonoma County Transit under contract by Sonoma\u2013Marin Area Rail Transit. Are we justified in saying that \"Cloverdale is the least populated town in Sonoma County\"? Yes, no, or maybe? Maybe\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"The Shokaku class aircraft carriers were part of the reason the United States was brought into the Pacific War.\"? Yes, no, or maybe? Yes\n###\nJatin\u2013Lalit are a Bollywood film composer duo consisting of Jatin Pandit and his younger brother Lalit. They have written the scores for films such as \"Khiladi\", \"Jo Jeeta Wohi Sikandar\", \"\", \"Dilwale Dulhania Le Jayenge\", \"Yes Boss\", \"Jab Pyaar Kisise Hota Hai\", \"Kuch Kuch Hota Hai\", \"Mohabbatein\", \"Kabhi Khushi Kabhi Gham\" and \"Fanaa\" . Are we justified in saying that \"Jatin-Lalit write primarily for an Indian market\"? Yes, no, or maybe?", "doc_id": 318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42798, 1493, 33329, 44028], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Phoenix Police Department is the law enforcement agency responsible for the city of Phoenix, Arizona. Today , the Phoenix Police Department comprises more than 2,900 officers and more than 1,000 support personnel. The department serves a population of more than 1.6 million and patrol almost 516 sqmi of the sixth largest city in the United States. 
Are we justified in saying that \"The police department in Phoenix is named after the city its responsible for.\"? Yes, no, or maybe? Yes\n###\nTabitha Anastasia \"Tibby\" Tomko-Rollins is a fictional character in the 2001 novel \"The Sisterhood of the Traveling Pants\" and the 2005 film based upon it. She is a member of the titular club, along with her friends Lena Kaligaris, Bridget Vreeland and Carmen Lowell. She was portrayed by Amber Tamblyn in the film. Are we justified in saying that \"Lena Kaligaris, Bridget Vreeland, Carmen Lowell and Amber Tamblyn each portrayed the same character in the 2005 film based on the 2001 novel \"The Sisterhood of the Traveling Pants\", except Lena Kaligaris.\"? Yes, no, or maybe? No\n###\nSyracuse IMG Sports Network is the radio and television name for Syracuse University sports. The radio affiliates broadcast football, as well as men's and women's basketball and men's lacrosse games. Time Warner Cable Sports broadcasts the coaches' show and a weekly program titled \"Syracuse Sidelines\". Are we justified in saying that \"Syracuse IMG Sports Network can be listened to on the radio as well as watched on television\"? Yes, no, or maybe? Yes\n###\nVirginia's Eleventh Congressional District is a U.S. congressional district in the Commonwealth of Virginia. The district stretches from Herndon to Quantico, comprising most of Fairfax County, all of the city of Fairfax, and part of eastern Prince William County. The residents of the 11th district are represented by Democrat Gerry Connolly. Are we justified in saying that \"Virginia's Eleventh Congressional District is led by Gerry Connolly\"? Yes, no, or maybe? Yes\n###\nFlatbush Avenue is a major avenue in the New York City Borough of Brooklyn. It runs from the Manhattan Bridge south-southeastward to Jamaica Bay, where it joins the Marine Parkway\u2013Gil Hodges Memorial Bridge, which connects Brooklyn to the Rockaway Peninsula in Queens. The north end was extended to the Manhattan Bridge as \"Flatbush Avenue Extension.\" Are we justified in saying that \"The north end extension was going to be called \"Flatbush Avenue Extension,\" Pt. 2, but wasn't.\"? Yes, no, or maybe?", "doc_id": 159, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7991, 3064, 26785, 23722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added. Are we justified in saying that \"Stereolab created many albumns\"? Yes, no, or maybe? Maybe\n###\n\"Show Me Love\" is a song by German DJ and record producer Robin Schulz and British singer J.U.D.G.E. The song was released as a digital download in Germany on 13 November 2015 as the third single from his second studio album \"Sugar\" (2015). 
The song was written by Dennis Bierbrodt, J\u00fcrgen Dohr, Guido Kramer, Robin Schulz and Richard Judge. Are we justified in saying that \"The only song Dennis wrote was Show Me Love\"? Yes, no, or maybe? Maybe\n###\nThe 1980 British Grand Prix (formally the XXXIII Marlboro British Grand Prix) was a Formula One motor race held at Brands Hatch on 13 July 1980. It was the eighth round of the 1980 Formula One season. The race was held over 76 laps of the 4.207-km (2.614-mile) circuit for a total race distance of 319.73 km (198.67 miles). Are we justified in saying that \"The total race distance of 319.73 km was the longest race of the season.\"? Yes, no, or maybe? Maybe\n###\nThis is a list of Japanese idols; a type of celebrity in Japan. The word \"idol\" is almost always used to refer to a young woman, although there a significant number of male idols. The following list includes both female and male idols as well as both solo idols and idol groups. Are we justified in saying that \"Female idols earn more money than male idols in Japan.\"? Yes, no, or maybe? Maybe\n###\nD\u00fcrnstein is a small town on the Danube river in the Krems-Land district, in the Austrian state of Lower Austria. It is one of the most-visited tourist destinations in the Wachau region and also a well-known wine growing area. The municipality consists of the Katastralgemeinden \"D\u00fcrnstein, Oberloiben\" and \"Unterloiben\". Are we justified in saying that \"D\u00fcrnstein is well known for its wineries\"? Yes, no, or maybe?", "doc_id": 567, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31251, 37881, 43097, 1027], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "TOTO is a legalized form of lottery sold in Singapore, known by different names elsewhere. Singapore Pools is the only legal lottery operator in Singapore. It was established on 23 May 1968 to control widespread illegal gambling in Singapore during the 1960s. Are we justified in saying that \"TOTO is known by different names in other countries\"? Yes, no, or maybe? Yes\n###\nWilliston Municipal Airport (FAA LID: X60) is a city-owned, public-use airport located two\u00a0nautical miles (4\u00a0km) southwest of the central business district of Williston, a city in Levy County, Florida, United States. Commonly referred to as Williston Airport, it is located 23 mi southwest of Gainesville Regional Airport (GNV). Opened in 1974 for public use, it does not have a control tower. Are we justified in saying that \"Levy County, Florida doesn't have enough airports.\"? Yes, no, or maybe? Maybe\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region. Are we justified in saying that \"The frequency 594 kHz AM covers all of Victoria, Australia. \"? Yes, no, or maybe? Yes\n###\nBaya M. Harrison, Jr. 
(1912 in Tampa, Florida \u2013 1975) was a politician and an attorney in Florida. He served as Chairman of the Florida Board of Control from 1960\u20131964. Harrison greatly impacted the State University System of Florida and helped desegregate Florida colleges and universities. He served as President of the Florida Bar in 1957. Are we justified in saying that \"Baya M. Harrison, Jr. held a position during his lifetime on a Board.\"? Yes, no, or maybe? Yes\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport. Are we justified in saying that \"Gulf Air features only international flights.\"? Yes, no, or maybe?", "doc_id": 462, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10304, 11234, 42734, 2962], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arthur John Duckworth (born 19 January 1949) is a former Australian rules footballer who played for Fitzroy in the Victorian Football League (VFL), West Perth in the West Australian National Football League (WANFL), and Central District in the South Australian National Football League (SANFL). He is the older brother of former Essendon footballer Billy Duckworth. Are we justified in saying that \"Arthur John Duckworth is more than 60 years old\"? Yes, no, or maybe? Yes\n###\nYissachar Dov Rokeach (born 19 January 1948) is the fifth and present Rebbe of the Hasidic dynasty of Belz. He is the son of Rabbi Mordechai of Bilgoray (1902 \u2013 1949), the grandson of the third Belzer Rebbe, Rabbi Yissachar Dov Rokeach, and the nephew of the fourth Belzer Rebbe, Rabbi Aharon Rokeach, who raised him. He has led Belz since 1966. Are we justified in saying that \"Yissachar Dov Rokeach is 71 years old.\"? Yes, no, or maybe? Yes\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums). Are we justified in saying that \"The Asteroids Galaxy Tour could only perform live with a five-piece\"? Yes, no, or maybe? Yes\n###\nIn the middle of 1984 a Brazilian company called Prol\u00f3gica, which made its own versions of 8 bits US computers, brought to the Brazilian market a new equipment for its personal computer series called \"CP\" (shorten of Personal Computer in Portuguese). Are we justified in saying that \"Prologica must also make its own cell phones if it makes its own computers\"? Yes, no, or maybe? Maybe\n###\nThe 2007 Hertsmere Borough Council election took place on 3 May 2007 to elect members of Hertsmere Borough Council in Hertfordshire, England. 
One third of the council was up for election and the Conservative party stayed in overall control of the council. Are we justified in saying that \"The Hertsmere Borough Council held an election in 2007 in which the liberal members were in the minority.\"? Yes, no, or maybe?", "doc_id": 165, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32357, 23912, 20952, 14686], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Southpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company. Are we justified in saying that \"When it was released in 2015, the movie Southpaw was expected to be the top grossing movie in the box office.\"? Yes, no, or maybe? Maybe\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television. Are we justified in saying that \"Sebastian Philip Bierk was born before April 4, 1968.\"? Yes, no, or maybe? Yes\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison. Are we justified in saying that \"Alix Bancourt is a blogger who writes in french but it is translated to English on her online blog for people who can't speak french\"? Yes, no, or maybe? Yes\n###\nVersailles is a television series, set during the construction of the Palace of Versailles during the reign of Louis XIV, that premiered on 16 November 2015 on Canal+ in France and on Super Channel in Canada, in May 2016 on BBC2 in Britain, and on 1 October 2016 on Ovation in the U.S. Are we justified in saying that \"Versailles has multiple episodes.\"? Yes, no, or maybe? Yes\n###\nThe Alfa Romeo Brera and the Alfa Romeo Spider (Type 939) are two sports cars manufactured by Alfa Romeo respectively between 2005-2010 and 2006-2010. The Brera is a 2+2 coup\u00e9, while the Spider is its roadster version. Both models were built by Pininfarina. Are we justified in saying that \"The Alfa Romeo Brera and the Alfa Romeo Spider were manufactured for the same duration of time \"? Yes, no, or maybe?", "doc_id": 475, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7859, 13927, 7261, 38510], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amethyst: Princess of Gemworld is a comic book series published by DC Comics in the 1980s. The series tells the story of a teenage girl named Amy Winston who discovers that she is the orphaned princess of the magical Gemworld. Amy learns that an evil ruler called Dark Opal is out to destroy her and travels to Gemworld to overthrow him. Are we justified in saying that \"In the Amethyst: Princess of Gemworld comic book series, Amy Winston succeeds in defeating Dark Opal \"? Yes, no, or maybe? Maybe\n###\nThe Mannlicher\u2013Sch\u00f6nauer (sometimes Anglicized as \"Mannlicher Schoenauer,\" Hellenized as \u03a4\u03c5\u03c6\u03ad\u03ba\u03b9\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1 or \u038c\u03c0\u03bb\u03bf\u03bd \u039c\u03ac\u03bd\u03bb\u03b9\u03c7\u03b5\u03c1-\u03a3\u03b5\u03bd\u03ac\u03bf\u03c5\u03b5\u03c1) is a type of rotary-magazine bolt-action rifle produced by Steyr Mannlicher for the Greek Army in 1903 and later was also used in small numbers by the Austro-Hungarian armies. Post war use was for civilian use such as hunting and target practice. Are we justified in saying that \"The Mannlicher\u2013Sch\u00f6nauer killed the least amount of people.\"? Yes, no, or maybe? Maybe\n###\nCross Gene (Korean: \ud06c\ub85c\uc2a4\uc9c4; stylized as CROSS GENE) is a five-member boy group based in South Korea. Signed under Amuse Korea, the group consists of South Korean members: Shin, Sangmin, Yongseok and Seyoung and Japanese member Takuya. They released their debut mini-album \"Timeless: Begins\" in 2012, which peaked at #8 on the Gaon weekly album chart. Are we justified in saying that \"The group Cross gene has more than one member, but only one of the members is Japanese.\"? Yes, no, or maybe? Yes\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015. Are we justified in saying that \"The state was created in 2001.\"? Yes, no, or maybe? No\n###\nThe Lei \u00c1urea (] ; English: Golden Law ), adopted on May 13, 1888, was the law that abolished slavery in Brazil. It was signed by Isabel, Princess Imperial of Brazil (1846\u20131921), an opponent of slavery, who acted as regent to Emperor Dom Pedro II, who was in Europe. Are we justified in saying that \"The Lei Aurea law which was adopted on May 13, 1888 abolished slavery. It was opposed by Princess Imperial.\"? Yes, no, or maybe?", "doc_id": 191, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18337, 16473, 39983, 20426], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Alice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th). Are we justified in saying that \"Alice Sue Claeys enjoys skiing\"? Yes, no, or maybe? Maybe\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex. Are we justified in saying that \"Sangre de Grado is hard to find. \"? Yes, no, or maybe? Maybe\n###\nTotal Film is a UK-based film magazine published 13 times a year (published monthly and a summer issue is added every year since issue 91, 2004 which is published between July and August issue) by Future Publishing. The magazine was launched in 1997 and offers cinema, DVD and Blu-ray news, reviews and features. \"Total Film\" is available both in print and interactive iPad editions. Are we justified in saying that \"Total Film launched before the millennium happened. \"? Yes, no, or maybe? Yes\n###\nMaps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\". Are we justified in saying that \"All the songs of the album \"Maps & Companions\" were recorded on one weekend \"? Yes, no, or maybe? Maybe\n###\nAn Evening With Groucho is the title of a 1972 recording at New York City's Carnegie Hall of the last one-man show by American comedian Groucho Marx. Introduced by Dick Cavett, the show was released as a double album by A&M Records. Marx shared family stories and performed songs from Marx Brothers movies. Marvin Hamlisch accompanied Groucho on the piano. Are we justified in saying that \"An Evening with Groucho was recorded less than half a century ago\"? Yes, no, or maybe?", "doc_id": 51, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23112, 44834, 25118, 20123], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. 
In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League. Are we justified in saying that \"He won a cricket championship\"? Yes, no, or maybe? Maybe\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"Lanshan district is in Linyi province.\"? Yes, no, or maybe? No\n###\nAna B\u00e1rbara is the debut album by Mexican singer Ana B\u00e1rbara, released in 1994. She was nominated for a Premio Lo Nuestro Award in two Regional Mexican categories, including Best New Artist. She won her first Premio Furia Musical Award for Best New Artist. Are we justified in saying that \"Ana B\u00e1rbara was recorded in Mexico\"? Yes, no, or maybe? Maybe\n###\nVia Dante is an important and elegant pedestrian street in central Milan, Italy, connecting Piazzale Cordusio (Cordusio (Milan Metro)) and Largo Cairoli (Cairoli (Milan Metro)). It is very near to the city's Castello Sforzesco and is named after the Florentine poet Dante Alighieri. It is known for containing several theatres, shops, restaurants, caf\u00e9s, palaces and bars. Are we justified in saying that \"Via Dante is a theater in Milan.\"? Yes, no, or maybe? No\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"People named Susan typically have blue eyes.\"? Yes, no, or maybe?", "doc_id": 39, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36505, 17516, 18772, 38858], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "102 Squadron \"\"Panchos\"\" (\"Esquadra 102\") was an elementary flight training squadron of the Portuguese Air Force disbanded in 1992. Formed in 1962, the squadron administered air force training and performed at air shows throughout Portugal. Between 1963 and its disbandment in 1992, the squadron lost nine pilots. Are we justified in saying that \"102 Squadron was formed in 1962\"? Yes, no, or maybe? Yes\n###\nThe 1955 NCAA Skiing Championships were contested in Northfield, Vermont at the second annual NCAA-sanctioned ski tournament to determine the individual and team national champions of men's collegiate alpine, cross country skiing, and ski jumping in the United States. Are we justified in saying that \"The results of the 1955 NCAA Skiing Championships were contested.\"? Yes, no, or maybe? Maybe\n###\nZale Dalen is a Canadian film and television director. 
He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990). Are we justified in saying that \"Zale Dalen is a film director. He is not proud of his film the hounds of Notre Dame\"? Yes, no, or maybe? Maybe\n###\nStannis Baratheon is a fictional character in the \"A Song of Ice and Fire\" series of epic fantasy novels by American author George R. R. Martin, and its television adaptation \"Game of Thrones\". He is the second son of Steffon Baratheon, the lord of Storm's End, and his wife Lady Cassana Estermont, and brother to Robert and Renly. Are we justified in saying that \"Stannis Baratheon is in \"Game of Thrones\" and \"A Song of Ice and Fire\".\"? Yes, no, or maybe? Yes\n###\nAatank Hi Aatank (English:Terror everywhere) is a 1995 Indian crime film was written, edited and directed by Dilip Shankar. The movie is highly inspired by \"The Godfather\". It stars Rajinikanth, Aamir Khan, Juhi Chawla and Archana Joglekar in the lead. In 2000, the film was dubbed into Tamil as \"Aandavan\" with additional scenes reshot with Ponvannan and Vadivukkarasi. Are we justified in saying that \"Dilip Shankar was directly influenced by American culture.\"? Yes, no, or maybe?", "doc_id": 221, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7296, 18185, 3601, 29339], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds. Are we justified in saying that \"The Phu Quoc Ridgeback only exists in Vietnam \"? Yes, no, or maybe? Maybe\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign. Are we justified in saying that \"The 23rd Infantry Brigade was the only infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II\"? Yes, no, or maybe? Maybe\n###\nDon Sinclair Davis, PhD (August 4, 1942 \u2013 June 29, 2008) was an American character actor best-known for playing General Hammond in the television series \"Stargate SG-1\" (1997\u20132007), and earlier for playing Major Garland Briggs on the television series \"Twin Peaks\" (1990\u20131991). He was also a theater professor, painter, and United States Army captain. Are we justified in saying that \"Don Sinclair Davis, Phd was an United States Army captain.\"? Yes, no, or maybe? 
Yes\n###\nThe Cable Guy is a 1996 American comedy film directed by Ben Stiller, starring Jim Carrey and Matthew Broderick. It was released in the United States on June 14, 1996. The film co-stars Leslie Mann, Jack Black, George Segal, Diane Baker, Eric Roberts, Owen Wilson, Janeane Garofalo, David Cross, Andy Dick, Amy Stiller, and Bob Odenkirk. Are we justified in saying that \"The Cable Guy is a 1000 + 997 American comedy film directed by Ben Stiller\"? Yes, no, or maybe? No\n###\nFlorence MacLeod Harper was a Canadian journalist sent by U.S. newspaper \"Frank Leslie's Illustrated Newspaper\" as a staff reporter with an assignment to cover World War I on the Eastern front. She was an early female war correspondent and one of a handful of western journalists to leave a first-hand journalistic account of the early stages of the Russian revolution. Are we justified in saying that \"Harper's account of the early stages of the Russian revolution is the only account of that revolution we have. \"? Yes, no, or maybe?", "doc_id": 788, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17748, 16310, 14854, 24479], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "[id] is the third studio album by deathcore band Veil of Maya. It was released through Sumerian Records on April 6, 2010. They worked with producer Michael Keene of death metal band The Faceless on this album. Keene previously worked with the band, producing their previous album \"The Common Man's Collapse\". It is the band's only album to feature bassist Matthew C. Pantelis. Are we justified in saying that \"Pantelis is a musician.\"? Yes, no, or maybe? Yes\n###\nJonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films. Are we justified in saying that \"Jonathan Michael Lovitz is seen by millions.\"? Yes, no, or maybe? Maybe\n###\nLes Soir\u00e9es de Nazelles, FP 84, is a set of variations for piano written by the French composer Francis Poulenc. During the evenings, the composer used to sit at the piano and improvise \"portraits\" of his friends, all based on a given theme. The work was begun in 1930, and completed at Noizay on October 1, 1936. At the beginning of the score, it reads: Are we justified in saying that \"Poulenc began the work in January 1930.\"? Yes, no, or maybe? Maybe\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Viktor was the first person to discover Tillya tepe.\"? Yes, no, or maybe? 
Maybe\n###\nIrfan Khoosat (Urdu: \u0639\u0631\u0641\u0627\u0646 \u06a9\u06be\u0648\u0633\u0679\u200e ) is a Pakistani actor, producer and a well-known comedian. He is famous for his comic role as \"Hawaldar Karamdad\" in the TV series Andhera Ujala in which he portrayed simpleton and blabbermouth character of a low-ranked policeman. He is also known as stage comedian. He also won Nigar Award for his comic role in 1985 film \"Hum se hai zamana\". Are we justified in saying that \"The film Hum se hai zamana was released more than 3000 days ago.\"? Yes, no, or maybe?", "doc_id": 937, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41110, 25817, 9235, 4606], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Khan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht. Are we justified in saying that \"Khan Kluay was a three-dimensional animated movie from Thailand.\"? Yes, no, or maybe? Yes\n###\nThomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330. Are we justified in saying that \"Thomas Cooper was the best England international footballer. \"? Yes, no, or maybe? Maybe\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"The film was released in the summer \"? Yes, no, or maybe? Yes\n###\nMorley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944. Are we justified in saying that \"Morley College Choir was founded at Yale\"? Yes, no, or maybe? No\n###\n\"Break the World\" is the lead single by alternative rock band Nine Lashes from their third album, \"From Water to War\". It was released on October 29, 2013 by Tooth & Nail Records. The song was the No. 1 \"Billboard\" Christian Rock song on January 25, 2014 chart. Are we justified in saying that \"Nine Lashes only released two albums as a group.\"? 
Yes, no, or maybe?", "doc_id": 9, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41344, 44603, 29123, 15218], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maria Ho (born March 6, 1983 in Taipei, Taiwan) is a Taiwanese American poker player, television personality and host. She is known as one of the top ranked female poker players in the world; a 3-time Bluff Reader's Choice Awards nominee for Favorite Female Poker Player and a World Series of Poker record-breaker, and for competing on the 15th season of \"The Amazing Race\". Are we justified in saying that \"Maria Ho can read faces.\"? Yes, no, or maybe? Maybe\n###\nJade is a 1995 American erotic thriller film written by Joe Eszterhas, produced by Robert Evans, directed by William Friedkin and starring David Caruso, Linda Fiorentino, Chazz Palminteri, Richard Crenna and Michael Biehn. The original music score was composed by James Horner based on a song composed by Loreena McKennitt. The film was marketed with the tagline \"Some fantasies go too far.\" Are we justified in saying that \"The fantasy genre saw the production of Jade, a 1995 American film written by Joe Eszterhas, with a score marketed with the tagline \"too far\".\"? Yes, no, or maybe? No\n###\n\"It's the Little Things\" is a 1967 single by Sonny James. \"It's the Little Things\" was Sonny James' twenty-fifth release on the country chart, the song went to number one on the country chart for five weeks and spent a total of fourteen weeks on the charts. Are we justified in saying that \"Sonny James is a rap artist \"? Yes, no, or maybe? No\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent. Are we justified in saying that \"Doomsday Device is a good term.\"? Yes, no, or maybe? Maybe\n###\nThe Icelandic national under-18 basketball team is the representative for Iceland in international Under-18 age basketball competitions, and it is organized and run by the Icelandic Basketball Federation. The team represents Iceland at the FIBA Europe Under-18 Championship. It is coached by Fri\u00f0rik Ingi R\u00fanarsson. Are we justified in saying that \"The Icelandic national under-18 basketball team was unsuccessful\"? Yes, no, or maybe?", "doc_id": 215, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23563, 14890, 33218, 28495], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California. Are we justified in saying that \"Michelle Do has talked to Bush.\"? Yes, no, or maybe? Maybe\n###\nSaid bin Salim Al Shaksy (Arabic: \u0633\u0639\u064a\u062f \u0628\u0646 \u0633\u0627\u0644\u0645 \u0627\u0644\u0634\u0642\u0635\u064a) (born Zanzibar in 1934 - 2015) was the founder and chairman of The Shaksy Group. Al Shaksy has been a member and Managing Director of several Joint-Stock Companies, including Al Bank Al Ahli Al Omani SAOG, Oman Fisheries Co. SAOG and Oman Hotels Co. SAOG. Are we justified in saying that \"Said bin Salim Al Shaksy spoke arabic\"? Yes, no, or maybe? Yes\n###\nALGOL 68 (short for ALGOrithmic Language 1968) is an imperative computer programming language that was conceived as a successor to the ALGOL 60 programming language, designed with the goal of a much wider scope of application and more rigorously defined syntax and semantics. Are we justified in saying that \"The language was very hard to use\"? Yes, no, or maybe? Maybe\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers. Are we justified in saying that \"Global Crossing and Hurricane Electric merged.\"? Yes, no, or maybe? Maybe\n###\nFlorence MacLeod Harper was a Canadian journalist sent by U.S. newspaper \"Frank Leslie's Illustrated Newspaper\" as a staff reporter with an assignment to cover World War I on the Eastern front. She was an early female war correspondent and one of a handful of western journalists to leave a first-hand journalistic account of the early stages of the Russian revolution. Are we justified in saying that \"Harper left Russia before the Bolshevik Revolution.\"? Yes, no, or maybe?", "doc_id": 45, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4482, 20139, 2409, 37820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "West Town Mall is an upscale shopping mall located in Knoxville, Tennessee, United States. Opened in August 1972, this one-level mall is located in the western portion of Knoxville in the West Hills community. West Town Mall is located along Interstates 40/75 and Kingston Pike. The mall has over 1300000 sqft of Gross leasable area, making it the largest of any enclosed shopping mall in Tennessee. Are we justified in saying that \"West Town mall is located in Tennessee\"? Yes, no, or maybe? Yes\n###\nNo. 
27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville. Are we justified in saying that \"No. 27 Squadron RAAF is a Royal Australian Air Force reserve formed over 9 years ago\"? Yes, no, or maybe? Yes\n###\nThe Doberman Gang is a 1972 film about a talented animal trainer who uses a pack of Dobermans to commit a bank robbery. The six dogs were all named after famous bank robbers. Their names were Dillinger (John Dillinger), Bonnie (Bonnie Parker), Clyde (Clyde Barrow), Pretty Boy Floyd, Baby Face Nelson, and Ma Barker. Are we justified in saying that \"None of the dogs in The Doberman Gang had a name that included the last name of the bank robber they were named after.\"? Yes, no, or maybe? No\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries. Are we justified in saying that \"People really like nice things\"? Yes, no, or maybe? Maybe\n###\nCeres\u2013Negros Football Club, commonly referred to as Ceres\u2013Negros or just Ceres, is a Filipino football club based in the city of Bacolod, Negros Occidental that plays in the Philippines Football League. The club is a member of the Negros Occidental Football Association. It was previously known as the Ceres\u2013La Salle Football Club. Are we justified in saying that \"Ceres\u2013Negros Football Club had Trump.\"? Yes, no, or maybe?", "doc_id": 617, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3567, 14397, 9784, 40107], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Flamingo is the debut solo studio album by American singer-songwriter and The Killers lead singer Brandon Flowers, released on September 3, 2010 by Island Records. It was recorded at Battle Born Studios in Winchester, Nevada, and Henson Recording Studios in Hollywood, California. The album debuted at number one on the UK Albums Chart. Are we justified in saying that \"Flamingo was worked on in the states of Nevada and California before it dropped in September of 2010.\"? Yes, no, or maybe? Yes\n###\nLament is the seventh studio album by British new wave band Ultravox, released in the UK on 6 April 1984. It was the last album featuring original drummer Warren Cann until the band's reunion album \"Brilliant\" in 2012. The album peaked at #8 on the UK album chart and was certified Gold by the BPI in June 1984 for 100,000 copies sold. It also reached #25 in Germany and #115 in the United States. Are we justified in saying that \"Ultravox had many top 10 albums\"? Yes, no, or maybe? 
Maybe\n###\nA symphonic song cycle can either refer to a symphony composed of separate movements played consecutively or to a set of symphonic works linked by theme, common composer, or common conductor. A symphonic cycle should not be confused with the closely related song cycle. Are we justified in saying that \" Symphonic songs are very loud.\"? Yes, no, or maybe? Maybe\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006. Are we justified in saying that \"It was not starring Tim Van Patten.\"? Yes, no, or maybe? Maybe\n###\nTime of Your Life is an American television drama series starring Jennifer Love Hewitt that aired for one season on Fox. A spin-off of \"Party of Five\", the series followed Sarah Reeves Merrin as she moved to New York City to learn more about her biological parents. Co-stars included Jennifer Garner, Pauley Perrette and Gina Ravera. Are we justified in saying that \"Time of Your Life is an American television drama series starring jlo\"? Yes, no, or maybe?", "doc_id": 152, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34971, 4631, 36231, 12929], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams. Are we justified in saying that \"The Gaming Control Act was highly contested before passing into law.\"? Yes, no, or maybe? Maybe\n###\nThe third season of \"Next Great Baker\" aired from November 26, 2012 to February 11, 2013. Like the previous season, this season was set at the Carlo's Bake Shop facility at Lackawanna Center in Jersey City, New Jersey. Unlike the previous two seasons, the finale for this season took place outside of the greater New York City area \u2013 in this case, in Las Vegas, Nevada at The Venetian Las Vegas. Are we justified in saying that \"Producers said US was the best location for a finale\"? Yes, no, or maybe? Maybe\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Tinker Field no longer exists\"? Yes, no, or maybe? Yes\n###\nCaddyshack is a 1980 American comedy film directed by Harold Ramis and written by Brian Doyle-Murray, Ramis and Douglas Kenney. 
It stars Michael O'Keefe, Chevy Chase, Rodney Dangerfield, Ted Knight, and Bill Murray. Doyle-Murray also has a supporting role. The film was later dedicated to producer Douglas Kenney, who died shortly after the film's release. Are we justified in saying that \"Caddyshak a film that lost one of it's producers was dedicated to him. \"? Yes, no, or maybe? Yes\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points. Are we justified in saying that \"Lincoln, Nebraska's Memorial Stadium is located on campus at the University of Nebraska- Lincoln.\"? Yes, no, or maybe?", "doc_id": 83, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32058, 43861, 25647, 16950], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mentha diemenica is known by the common name of slender mint. The species is named after Van Diemen's Land, which is now called Tasmania. It is a mint species within the genus \"Mentha\", native not only to Tasmania but also to Queensland, New South Wales, Victoria, and South Australia. Are we justified in saying that \"Tasmania holds numerous different species of plants and animals, one specifically being the basil.\"? Yes, no, or maybe? Maybe\n###\nThe 198th Infantry Brigade, was first formed as part of the United States Army Reserve's 99th Division. It was active from 1967 through 1971 and has been active since 2007 as an Infantry Training Brigade as part of the US Army Infantry School at Fort Benning, Georgia. Are we justified in saying that \"The Brigade was inactive between 1972-2006.\"? Yes, no, or maybe? Yes\n###\nThe Never-Before-Released Masters is 1987 compilation album containing unreleased recordings recorded by Motown girl-group The Supremes from 1961 to 1969. It was the second CD release of unreleased recordings by The Supremes, the first being disc two of the 2 disc \"25th Anniversary\" compilation. Several other unreleased tracks appeared on earlier various artists compilations. Are we justified in saying that \"The Supremes was Kobe Bryant's favorite group to train to. \"? Yes, no, or maybe? Maybe\n###\nCroton lechleri is a species of flowering plant in the spurge family, Euphorbiaceae, that is native to northwestern South America. It is commonly known as Sangre de Grado (Peruvian Spanish) or Sangre de Drago (Ecuadorian Spanish), both of which translate to \"dragon's blood\". They refer to this tree's (and several related species') thick red latex. Are we justified in saying that \"Croton Lechleri is also known as dragon's breath. \"? Yes, no, or maybe? No\n###\nAlbert Ernest Clifford \"Cliff\" Young, OAM (8 February 19222 November 2003) was an Australian potato farmer and athlete from Beech Forest, Victoria, best known for his unexpected win of the inaugural Sydney to Melbourne Ultramarathon in 1983 at 61 years of age. 
Are we justified in saying that \"Albert Ernest Clifford \"Cliff\" Young died ten years after 2000.\"? Yes, no, or maybe?", "doc_id": 518, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7637, 2048, 8558, 18884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Good Night is a 2007 romantic comedy film written and directed by Jake Paltrow. The film stars his sister Gwyneth Paltrow, Pen\u00e9lope Cruz, Martin Freeman, Danny DeVito, Simon Pegg and others. The movie takes place in London and New York City, where a former pop star (Freeman) who now writes commercial jingles for a living experiences a mid-life crisis. Are we justified in saying that \"The Good Night was directed by the brother of Gwyneth Paltrow\"? Yes, no, or maybe? Yes\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart. Are we justified in saying that \"Migos also released a single in the year 2013.\"? Yes, no, or maybe? Maybe\n###\n\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982). Are we justified in saying that \"\"King of the Jungle\" was a popular song in japan\"? Yes, no, or maybe? Maybe\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"Ashcroft House is made with concrete\"? Yes, no, or maybe? No\n###\n\"May the Bird of Paradise Fly Up Your Nose\" is a 1965 novelty song performed by Little Jimmy Dickens. It was Dickens' most successful single on the U.S. country music chart. It spent two weeks at No. 1 that November, and stayed on the chart for a total of 18 weeks. On the overall \"Billboard\" Hot 100 the song peaked at No. 15. Are we justified in saying that \"\"May the Bird of Paradise Fly Up Your Nose\" was not on the charts for 6 months\"? Yes, no, or maybe?", "doc_id": 889, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2129, 5543, 31100, 32292], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Florence MacLeod Harper was a Canadian journalist sent by U.S. newspaper \"Frank Leslie's Illustrated Newspaper\" as a staff reporter with an assignment to cover World War I on the Eastern front. She was an early female war correspondent and one of a handful of western journalists to leave a first-hand journalistic account of the early stages of the Russian revolution. Are we justified in saying that \"Florence MacLeod Harper stayed with the same company for 25 years\"? Yes, no, or maybe? Maybe\n###\nThe Ferry County Carousel is an operational wooden carousel located three miles (5\u00a0km) east of the town of Republic, Washington on the Ferry County Fairgrounds. This carousel features 24 horses in two rows and is fitted with a jumping mechanism. The Ferry County Carousel is one of only seven classic wooden carousels in Washington state and possibly the oldest. Are we justified in saying that \"There are more metal carousels than wooden in Washington state.\"? Yes, no, or maybe? Maybe\n###\nThe Trexler Nature Preserve is an 1,108 acre county park owned and maintained by Lehigh County, Commonwealth of Pennsylvania. The preserve is situated in Lowhill Township and North Whitehall Township and the land that comprises the preserve was originally purchased between 1901 and 1911 by local industrialist General Harry Clay Trexler. Are we justified in saying that \"The Trexler Nature Preserve will be turned into a Walmart in 2050\"? Yes, no, or maybe? Maybe\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98. Are we justified in saying that \"Death Race is pie\"? Yes, no, or maybe? No\n###\nJohn (Johnnie) White (died 2007) was a high-ranking staff officer of the Official Irish Republican Army (Official IRA) in Derry, Northern Ireland and later Adjutant General of the Irish National Liberation Army (INLA). He was a key figure in Derry in the early years of the Troubles, and played a prominent role in the events surrounding the creation and defence of Free Derry. Are we justified in saying that \"John White is serving time for his involvment with the IRA.\"? Yes, no, or maybe?", "doc_id": 17, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3371, 29110, 22537, 2522], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In the middle of 1984 a Brazilian company called Prol\u00f3gica, which made its own versions of 8 bits US computers, brought to the Brazilian market a new equipment for its personal computer series called \"CP\" (shorten of Personal Computer in Portuguese). 
Are we justified in saying that \"Brazilian company Prol\u00f3gica is based out of Rio de Janeiro.\"? Yes, no, or maybe? Maybe\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Nathan MacKinnon plays the position of forward.\"? Yes, no, or maybe? Yes\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The police arrested the woman with the knife.\"? Yes, no, or maybe? No\n###\nThe Corridor (Lithuanian: Koridorius ) is a 1995 Lithuanian drama film directed by \u0160ar\u016bnas Bartas. It has a fragmentary narrative without dialogue and depicts several people in Vilnius. According to the director, the title symbolizes \"the atmosphere of a corridor between yesterday and today, containing many doors\". Are we justified in saying that \"The Corridor portrays the everyday people in Vilnius \"? Yes, no, or maybe? Maybe\n###\nRoc-A-Fella Records Presents Teairra Mar\u00ed is the debut album by recording artist Teairra Mar\u00ed. It was released on August 2, 2005, by Roc-A-Fella Records. The album debuted in the top five selling 69,000 copies in the first week, eventually selling 248,000 units. Are we justified in saying that \"The album sold 69,000 copies per week\"? Yes, no, or maybe?", "doc_id": 597, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21127, 11229, 9605, 38344], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011. Are we justified in saying that \"Hipmunk media attention was still quite low despite its increase after Google's announcement.\"? Yes, no, or maybe? Maybe\n###\nColdwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep. Are we justified in saying that \"Coldwater fish refers to species that prefer to live in waters cooler than 20*C when in the context of aquariums.\"? Yes, no, or maybe? 
Yes\n###\nColdwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep. Are we justified in saying that \"Tropical fish prefer cooler water temperatures, typically below 20 degrees Celsius.\"? Yes, no, or maybe? No\n###\nArt of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley. Are we justified in saying that \"Art of Dying is fronted by Hetherington.\"? Yes, no, or maybe? Yes\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008. Are we justified in saying that \"Peter Franco won his first Grammy is 2014.\"? Yes, no, or maybe?", "doc_id": 201, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25166, 22195, 3445, 2525], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts. Are we justified in saying that \"David Lee Murphy toured around the US\"? Yes, no, or maybe? Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"The war caused significant damage.\"? Yes, no, or maybe? Maybe\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C. 
Are we justified in saying that \"Luton Town Ladies Football Club was founded in 1998.\"? Yes, no, or maybe? No\n###\nManiac (stylized as MANIAC) is an American short slasher film, directed by Shia LaBeouf. It was released on October 31, 2011. The short film stars American rappers Scott \"Kid Cudi\" Mecudi and Chris \"Cage\" Palko, as French-speaking serial killers. Mescudi and Palko also co-wrote the film with LaBeouf. Are we justified in saying that \"57% of the patrons seeing the film Maniac in the movie theater prefer their popcorn with extra butter.\"? Yes, no, or maybe? Maybe\n###\n\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012. Are we justified in saying that \"Adrian Galvin was born in nineteen hundred eighty three.\"? Yes, no, or maybe?", "doc_id": 280, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28094, 8434, 19948, 7699], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. Are we justified in saying that \"At some point the book changed titles\"? Yes, no, or maybe? Yes\n###\nReckless is the third book in the The It Girl novels by the German American author Cecily von Ziegesar. The series is ghostwritten from the original idea by Ziegesar. The series, aimed toward young adults, is a spin-off from the bestselling \"Gossip Girl\" series. It was released in 2006 by Little, Brown. Are we justified in saying that \"Reckless was geared towards younger adult audiences\"? Yes, no, or maybe? Yes\n###\nBlack Dahlia is a 2006 United States production horror film inspired by the mysterious unsolved murder of the\"Black Dahlia\", Hollywood actress Elizabeth Short. Instead of dramatizing the infamous 1947 murder of Short and the ensuing investigation, writer-director Ulli Lommel follows a series of contemporary L.A.-area homicides patterned after the 1947 slaying. Are we justified in saying that \"Elizabeth Short was an actress.\"? Yes, no, or maybe? Yes\n###\nUni\u00f3n Deportiva Vall de Ux\u00f3 is a football team based in La Vall d'Uix\u00f3, in Castell\u00f3n province, autonomous community of Valencian Community, Spain. Founded in 1975, it plays in Regional Preferente \u2013 Group 1. Its stadium is \"Jos\u00e9 Mangri\u00f1\u00e1n\", which has a capacity of 4,000 seats. Are we justified in saying that \"Uni\u00f3n Deportiva Vall de Ux\u00f3 is a popular team that always fills the stadium when they play\"? Yes, no, or maybe? 
Maybe\n###\nForever Lost is the second studio album by Norwegian recording artist A-Lee, released on October 5, 2012 in Norway, on EE Records and Columbia/Sony Music Norway. A-Lee worked with producers Ground Rules, Martin K, Bernt Rune Stray, BPM, Thomas Eriksen, Slipmats and The Products. The original album track list features Aleksander With, Elisabeth Carew and Marcus Only. Are we justified in saying that \"Forever Lost was not seen in South America\"? Yes, no, or maybe?", "doc_id": 941, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15329, 9542, 21843, 44903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Castle Wolfenstein is a stealth-based action-adventure shooter video game developed by Muse Software for the Apple II. It was first released in 1981 and later ported to MS-DOS, the Atari 8-bit family, and the Commodore 64. \"Beyond Castle Wolfenstein\" is its sequel. Are we justified in saying that \" Castle Wolfenstein was developed for the Macintosh. \"? Yes, no, or maybe? No\n###\nHeck's Department Store, a chain of West Virginia based discount department stores, was founded by Boone County natives and businessmen Fred Haddad, Tom Ellis, and Lester Ellis and wholesale distributor Douglas Cook. The Heck's name was a combination of the names Haddad, Ellis and Cook. Haddad served as President, Lester Ellis was Vice-President, and Tom Ellis was Secretary-Treasurer. Are we justified in saying that \"Heck's department stores are a a chain of West Virginia based discount grocery stores.\"? Yes, no, or maybe? No\n###\nBoneyard Beach is a 1995 album by Raleigh, North Carolina band Dish, led by singer and pianist Dana Kletter, on Interscope Records. The album was produced by John Agnello at Ardent Studios in Memphis, Tennessee. Interscope's VP, Tom Whalley, told \"Billboard\" magazine that \"the high quality of songwriting in Dish and the sound of Dana's voice are two things that set this band apart.\" Are we justified in saying that \"Dish released an album in 1995\"? Yes, no, or maybe? Yes\n###\nThere have been four head coaches of the Houston Texans, a professional American football team based in Houston, Texas, United States. The Texans play in the South Division of the American Football Conference (AFC) in the National Football League (NFL). Are we justified in saying that \"The Houston Texans start with a C.\"? Yes, no, or maybe? No\n###\nDean Young (born 1955) is a contemporary American poet in the poetic lineage of John Ashbery, Frank O'Hara, and Kenneth Koch. Often cited as a second-generation New York School poet, Young also derives influence and inspiration from the work of Andr\u00e9 Breton, Paul \u00c9luard, and the other French Surrealist poets. Are we justified in saying that \"Dean Young was a rapist\"? 
Yes, no, or maybe?", "doc_id": 396, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [45369, 5856, 8613, 32096], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Heinrich Karl Ludwig Herault de Seigneur de Hautcharmoy (1689 in Wesel \u2013 11 May 1757 in Margaret monastery at Prague) was a Prussian Lieutenant-General, Knight of the Black Eagle and commander of Brieg. His family was originally from Kingdom of France, and his father served as subordinate to Friedrich von Schomberg, and was killed with him at the Battle of the Boyne. Are we justified in saying that \"He outlived his father.\"? Yes, no, or maybe? No\n###\nRafael Cede\u00f1o Hern\u00e1ndez is an imprisoned Mexican drug trafficker who was a high-level leader of La Familia Michoacana, a drug cartel based in the Mexican state of Michoac\u00e1n. He was the successor of Alberto Espinoza Barr\u00f3n, a drug trafficker who was arrested on 31 December 2008 by the Mexican authorities. Are we justified in saying that \"Alberto Espinoza Barr\u00f3n was arrested last year.\"? Yes, no, or maybe? No\n###\nThe Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds. Are we justified in saying that \"The Phu Quoc Ridgeback is the most popular breed of ridgeback dogs\"? Yes, no, or maybe? Maybe\n###\nRoy Denzil Hibbert (born December 11, 1986) is a Jamaican-American professional basketball player who last played for the Denver Nuggets of the National Basketball Association (NBA). He is a two-time NBA All-Star, and earned NBA All-Defensive Second Team honors in 2014. Are we justified in saying that \"Roy will continue to play on his current team until he retires.\"? Yes, no, or maybe? Maybe\n###\nThe Governor Nobre de Carvalho Bridge also known as the Macau-Taipa Bridge, is a dual-lane two-way bridge connecting Macau Peninsula near Casino Lisboa and the island of Taipa at the northern slope of Taipa Pequena (Small Taipa Hill) crossing the Ba\u00eda da Praia Grande. It is the first bridge in Macau, to connect the peninsula and Taipa. It is locally known as \"The Old Bridge\" (). Are we justified in saying that \"There is a bridge in Macau called \"The New Bridge.\"\"? Yes, no, or maybe?", "doc_id": 213, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35089, 20959, 41011, 42435], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sea Wall (French: Un barrage contre le Pacifique ) is a 2008 film by Cambodian director Rithy Panh in a French/Cambodian/Belgian co-production. The film opened on 7 January 2009 in France. It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. The novel had previously been adapted as \"This Angry Age\" by Ren\u00e9 Cl\u00e9ment in 1958. Are we justified in saying that \"It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. 50s movies leave something to be desired.\"? Yes, no, or maybe? Maybe\n###\nPaul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas. Are we justified in saying that \"Paul Albert Raymond Barlatier de Mas was born in october\"? Yes, no, or maybe? Yes\n###\nGame engine recreations are remade engine interpreters for video games that replace the original engine binary that came with the original game. A notable example of game engine recreation is ScummVM which successfully recreated the SCUMM engine of classical LucasArts' point and click adventures. For further examples, refer to the list of game engine recreations. Are we justified in saying that \"Game engine recreations are not video games.\"? Yes, no, or maybe? No\n###\nThe San Pablo Reservoir is an open cut terminal water storage reservoir owned and operated by the East Bay Municipal Utility District (EBMUD). It is located in the valley of San Pablo Creek, north of Orinda, California and south of El Sobrante and Richmond, east of the Berkeley Hills between San Pablo Ridge and Sobrante Ridge. Are we justified in saying that \"San Pablo Reservoir is underneath Richmond on a map.\"? Yes, no, or maybe? No\n###\nPrince Karl Alfred of Liechtenstein (Karl Alfred Maria Johannes Baptista Heinrich Aloys Georg Hartmann Ignatius; 16 August 1910 \u2013 17 November 1985) was a Liechtensteiner prince and brother of Franz Joseph II. He was the third child and second son of Prince Aloys of Liechtenstein and Archduchess Elisabeth Amalie of Austria. Are we justified in saying that \"Karl Alfred of Liechtenstein was a prince from Liechtenstein whose birth was trending all over social media from the moment he was born\"? Yes, no, or maybe?", "doc_id": 660, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10840, 42733, 41356, 34543], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Chatot (also Chacato or Chactoo) were a Native American tribe who lived in the upper Apalachicola River and Chipola River basins in what is now Florida. 
They spoke a Muskogean language, which may have been the same as that of the Pensacola people. Are we justified in saying that \"The Chacato tribe once lived in a river basin in Florida.\"? Yes, no, or maybe? Yes\n###\nThe Sydney/Melbourne Express was an overnight intercapital passenger train service that operated between the Australia's largest two cities, Sydney and Melbourne, between August 1986 and November 1993. Operated jointly by State Rail Authority and V/Line the name depended on the direction of travel, with the train nicknamed the 'Sex' or 'Mex'. Are we justified in saying that \"The rail line crossed the ocean.\"? Yes, no, or maybe? Maybe\n###\nMaps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\". Are we justified in saying that \"The album \"Companions\" was re-titled \"Maps & Companions.\"\"? Yes, no, or maybe? No\n###\n\"Pour Me\" is a debut song recorded by American country music group Trick Pony. It was released in October 2000 as the first single from their debut album \"Trick Pony\". The song was written by group members Keith Burns, Ira Dean and Heidi Newfield with Rory Waters Beighley and Sammy Harp Wedlock. Are we justified in saying that \"Trick Pony never had a hit song.\"? Yes, no, or maybe? Maybe\n###\nDovyalis is a genus of shrubs and small trees. Recent genetic evidence has shown the genus to belong to the family Salicaceae; formerly it was classified in the family Flacourtiaceae. The 15 species are native to Africa (Ethiopia south to South Africa) and southern Asia (India, Sri Lanka). Some are cultivated for their fruit. Are we justified in saying that \"There are 15 species of Dovyalis that are cultivated for their fruit.\"? Yes, no, or maybe?", "doc_id": 281, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44726, 28450, 25666, 28779], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Masquerade (Russian: \u041c\u0430\u0441\u043a\u0430\u0440\u0430\u0434 ) is a verse play written in 1835 by the Russian Romantic writer Mikhail Lermontov. The four-act play, set in 1830's St. Petersburg aristocratic society, highlights the rebellious spirit and noble mind of the protagonist, Yevgeny Arbenin. It is often compared with Shakespeare's \"Othello\" in its essential story line. Are we justified in saying that \"The play was written in 1830+8\"? Yes, no, or maybe? No\n###\nDemi Lovato: Live in Concert (also known as the Summer Tour 2009) was the debut headlining concert tour by American singer Demi Lovato, launched in support of her debut album \"Don't Forget\" (2008) and the second studio album \"Here We Go Again\" (2009). Are we justified in saying that \"Demi Lovato has only released one album.\"? Yes, no, or maybe? 
No\n###\nSilent Scream (originally known as The Retreat) is an independent, horror film directed by Matt Cantu and Lance Kawas and starring Scott Vickaryous, Melissa Schuman and Shanti Lowry. It premiered at the Chicago Horror Film Festival on October 28, 2005 and was released on DVD on December 5, 2006. Are we justified in saying that \"Silent Scream made for a good christmas present\"? Yes, no, or maybe? Maybe\n###\nMonique Brumby (born 16 September 1974, Devonport) is an Australian Indie pop/rock singer-songwriter, guitarist and producer. Her debut single, \"Fool for You\", peaked into the top 40 in the Australian Recording Industry Association (ARIA) ARIA Singles Charts, and provided an ARIA Award for 'Best New Talent' in 1996. Her single, \"Mary\", won an ARIA Award in 1997 for 'Best Female Artist'. Are we justified in saying that \"Monique was born in the 20th century. \"? Yes, no, or maybe? Yes\n###\nThere have been four head coaches of the Houston Texans, a professional American football team based in Houston, Texas, United States. The Texans play in the South Division of the American Football Conference (AFC) in the National Football League (NFL). Are we justified in saying that \"The Houston Texans have had less than sixty head coaches.\"? Yes, no, or maybe?", "doc_id": 469, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21225, 26710, 42803, 35395], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\". Are we justified in saying that \"The re-release occurred in the year after 2010.\"? Yes, no, or maybe? Yes\n###\nRobert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983. Are we justified in saying that \"Robb Report started out as Twentieth Century Confederates while it's founder Robert L. \"Rusty\" White was an undergraduate in the University of Mississippi in the 1960s before it was sold some yeas later.\"? Yes, no, or maybe? Yes\n###\n\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode. Are we justified in saying that \"Kristen Bell is not named Elle.\"? Yes, no, or maybe? 
No\n###\n\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals. Are we justified in saying that \"Mark's birthday was the same day their hit single was released.\"? Yes, no, or maybe? Yes\n###\nAlexander Vincent LoScialpo (born April 29, 1981) is an American actor. He is known for his role as Andy Barclay in the \"Child's Play\" franchise. He has appeared in \"Child's Play\" (1988), \"Child's Play 2\" (1990), \"Curse of Chucky\" (2013), and \"Cult of Chucky\" (2017). Are we justified in saying that \"Alexander Vincent LoScialpo starred in 5 Child's Play films\"? Yes, no, or maybe?", "doc_id": 241, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5099, 6530, 3892, 30846], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Inbetweeners\" is a BAFTA Award-winning British sitcom created by Damon Beesley and Iain Morris, and broadcast on E4. The series follows the lives of four sixth form students \u2013 Will McKenzie (Simon Bird), Simon Cooper (Joe Thomas), Jay Cartwright (James Buckley) and Neil Sutherland (Blake Harrison). The series is narrated by Will, who acts as the programme's lead character. Are we justified in saying that \"The Inbetweeners was released in 199\"? Yes, no, or maybe? Maybe\n###\nMatthew Wayne Darwin (born March 11, 1963 in Houston, Texas) is a former professional American football center in the National Football League (NFL) for the Philadelphia Eagles. He was drafted twice, first in the 1985 NFL Draft by the Dallas Cowboys and finally in the 1986 NFL Draft by the Eagles. He played college football at Texas A&M University. Are we justified in saying that \"Matthew Wayne Darwin was drafted by the Eagles after he was drafted by the Cowboys.\"? Yes, no, or maybe? Yes\n###\nThe Winter Hill air disaster occurred on 27 February 1958 when the Silver City Airways Bristol 170 Freighter \"G-AICS\", traveling from the Isle of Man to Manchester, England, crashed into Winter Hill (also known as Rivington Moor) several hundred yards away from the Independent Television Authority's Winter Hill transmitting station. Are we justified in saying that \"The Winter Hill air disaster was said to be the worst flight crash of its time.\"? Yes, no, or maybe? Maybe\n###\nDjibouti, officially the Republic of Djibouti, is a country located in the Horn of Africa. It is bordered by Eritrea in the north, Ethiopia in the west and south, and Somalia in the southeast. The remainder of the border is formed by the Red Sea and the Gulf of Aden at the east. Are we justified in saying that \"It is bordered by a sea with a name that starts with an R\"? Yes, no, or maybe? Yes\n###\nEast of West is a monthly comic book series published by Image Comics which debuted in March 2013. 
Created by writer Jonathan Hickman and artist Nick Dragotta, the book is a science fiction Western set in a dystopian version of the United States whose fate rests with the Four Horsemen of the Apocalypse. Are we justified in saying that \"East of West is a book that includes pictures\"? Yes, no, or maybe?", "doc_id": 287, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18931, 35985, 22198, 21349], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sun Also Rises is a one-act opera by Webster A. Young, based on Ernest Hemingway's \"The Sun Also Rises\". It is one of a pair of Hemingway works that Young adapted into operas. The opera's libretto is by the composer, and includes direct quotations from the novel. It premiered on May 7, 2000 at the Long Island Opera. Are we justified in saying that \"Since Webster A. Young's The Sun Also Rises is based on Ernest Hemingway's, there is a marked improvement in interludes, composition, libretto, and quotations.\"? Yes, no, or maybe? Maybe\n###\nThe Vienna State Opera (German: Wiener Staatsoper ) is an Austria opera house and opera company based in Vienna, Austria. It was originally called the Vienna Court Opera (Wiener Hofoper). In 1920, with the replacement of the Habsburg Monarchy by the First Austrian Republic, it was renamed the Vienna State Opera. The members of the Vienna Philharmonic are recruited from its orchestra. Are we justified in saying that \"The Vienna State Opera currently has 5 opera singers.\"? Yes, no, or maybe? Maybe\n###\nThe Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships. Are we justified in saying that \"The 98100 square foot center will be expanded.\"? Yes, no, or maybe? Maybe\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017. Are we justified in saying that \"Justin Tinucci appears as the devil in the upcoming film Devil's Whisper\"? Yes, no, or maybe? Maybe\n###\nThe Oakland Athletics' 1985 season involved the A's finishing 4th in the American League West with a record of 77 wins and 85 losses. While the Athletics' on-field performance continued to disappoint, the debut of slugger Jose Canseco gave fans a measure of hope. Are we justified in saying that \"Jose Canseco will play baseball next year.\"? 
Yes, no, or maybe?", "doc_id": 255, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20168, 27087, 10257, 36134], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\". Are we justified in saying that \"The 1980 album \"Monty Python's Contractual Obligation Album\" was released on January 19th.\"? Yes, no, or maybe? Maybe\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits. Are we justified in saying that \"Shades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple and contains rare edits and singles which are remastered along with album versions of their biggest and rarest hits.\"? Yes, no, or maybe? Maybe\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer. Are we justified in saying that \"Loui Jover knows what light aperture is\"? Yes, no, or maybe? Yes\n###\nLamarck Island is a rocky island 250 m long, lying 300 m east of Petrel Island and 300 m north-east of Rostand Island in the G\u00e9ologie Archipelago, off the Ad\u00e9lie Coast of Antarctica. It was charted in 1951 by the French Antarctic Expedition and named by them after Jean-Baptiste Lamarck, the French naturalist. Are we justified in saying that \"Jean-Baptiste Lamarck often said he was proud to have the island named after him.\"? Yes, no, or maybe? Maybe\n###\nBaby Mine is a 1928 silent film comedy produced and distributed by MGM. This film is a remake of the 1917 film \"Baby Mine\" both being based on Margaret Mayo's 1910 Broadway comedy \"Baby Mine\". This film stars Karl Dane, George K. Arthur and Charlotte Greenwood and is her third feature film, she having made two previous films in 1916 and 1918. Are we justified in saying that \"The number of words spoken in Baby Mine is equal to the number of pigs who could fly\"? Yes, no, or maybe?", "doc_id": 666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11010, 7715, 343, 38117], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vice Admiral Sir Timothy James Hamilton Laurence {'1': \", '2': \", '3': \", '4': \"} (born 1 March 1955) is a retired British naval officer and the second husband of Princess Anne, the only daughter of Queen Elizabeth II and Prince Philip. Laurence was Equerry to the Queen from 1986 to 1989. Are we justified in saying that \"Laurence is the first husband of Princess Anne\"? Yes, no, or maybe? No\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006. Are we justified in saying that \"Oliver Francis O'Grady was born in the 19th century.\"? Yes, no, or maybe? No\n###\nHidden City Entertainment was a game publisher founded in 2004 (as Hidden City Games, Inc.) by Jesper Myrfors and Paul Peterson to develop and market the chip-throwing game, \"Clout Fantasy.\" After Clout was developed the company recruited Peter Adkison as CEO. Are we justified in saying that \"The company that developed Clout Fantasy never had a CEO.\"? Yes, no, or maybe? No\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Leonard and Mae Murray had three children.\"? Yes, no, or maybe? Maybe\n###\nConnacht Rugby (Irish: \"Rugba\u00ed Connachta\" ) is one of the four professional provincial rugby teams from the island of Ireland. Connacht competes in the Pro14 and the European Rugby Challenge Cup. The team represents the IRFU Connacht Branch, which is one of four primary branches of the IRFU, and is responsible for rugby union throughout the geographical Irish province of Connacht. Are we justified in saying that \"The team plays in the Republic of Ireland.\"? Yes, no, or maybe?", "doc_id": 524, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12343, 45422, 5616, 44926], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ghost Notes is the fifth full-length studio album by American rock band Veruca Salt, released on July 10, 2015, through El Camino Records. Produced by Brad Wood, who also produced the band's debut \"American Thighs\", it is the first to feature the band's original lineup since their second album, \"Eight Arms to Hold You\" (1997). Are we justified in saying that \"Ghost Notes was released in the winter of 2015.\"? Yes, no, or maybe? 
No\n###\nBeyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John LaZar, Michael Blodgett and David Gurian. The film was directed by Russ Meyer and co-written by Meyer and Roger Ebert. Are we justified in saying that \"Beyond the Valley of the Dolls is a 1970 American satirical musical melodrama film starring Dolly Read, Cynthia Myers, Marcia McBroom, Phyllis Davis, John Blodgett, Michael LaZar and David Gurian.\"? Yes, no, or maybe? No\n###\n\"Girl in a Country Song\" is the debut single by American country music duo Maddie & Tae, co-written with Aaron Scherz and released in July 2014. The song is an answer to the \"bro-country\" subgenre in contemporary country music, specifically in how women are portrayed by men, with lyrics containing references to a variety of popular recent country songs. Are we justified in saying that \"Maddie & Tae are country music performers.\"? Yes, no, or maybe? Yes\n###\nThe Office is a British mockumentary sitcom, first broadcast in the United Kingdom on BBC Two on 9 July 2001. Created, written and directed by Ricky Gervais and Stephen Merchant, the programme is about the day-to-day lives of office employees in the Slough branch of the fictitious Wernham Hogg Paper Company. Gervais also stars in the series, playing the central character, David Brent. Are we justified in saying that \"Brent is not a fictitious character.\"? Yes, no, or maybe? No\n###\nSvensk Hyllningsfest (] , \"Swedish Honoring Festival\") is a biennial celebration held in Lindsborg, Kansas, in October of odd-numbered years since 1941 to celebrate the town's Swedish heritage. The festival includes Swedish dancing, foods including \"lutfisk\", cooking demonstrations, arts and crafts, entertainment by local artists and musicians, a parade, and a sm\u00f6rg\u00e5sbord. Are we justified in saying that \"Svensk Hyllningsfest started the year after 1939.\"? Yes, no, or maybe?", "doc_id": 590, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14926, 7409, 22245, 14746], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A startup company (startup or start-up) is an entrepreneurial venture which is typically a newly emerged, fast-growing business that aims to meet a marketplace need by developing a viable business model around an innovative product, service, process or a platform. A startup is usually a company designed to effectively develop and validate a scalable business model. Are we justified in saying that \"A startup company has not been around for a while.\"? Yes, no, or maybe? Yes\n###\nGloria Stavers (October 3, 1927 \u2013 April 1, 1983) was the editor in chief of \"16 Magazine\". Her personality gave this teen celebrity magazine its stamp for many years. Stavers is credited with being one of the first women rock and roll journalists, but male editors, detractors and those who scoffed at teen or celebrity magazines sometimes called her \"Mother Superior of the Inferior\". 
Are we justified in saying that \"There was no other female rock and roll journalists before Gloria Stavers\"? Yes, no, or maybe? Yes\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style. Are we justified in saying that \"Those that wrestle with AAA use high flying.\"? Yes, no, or maybe? Yes\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band. Are we justified in saying that \"The album was one of the best they ever produced.\"? Yes, no, or maybe? Maybe\n###\nMarion Anna Fischer (born July 18, 1986 in East Berlin) is a German actress and singer. Since 2003, she appeared in over 30 film and television roles in appearance. She is most recognised to international audiences as the innocent vampire \"Nora\" in Dennis Gansel's drama film \"We Are The Night\" Are we justified in saying that \"Marion Anna Fischer first appeared in films at the age of twelve\"? Yes, no, or maybe?", "doc_id": 792, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21410, 1145, 33547, 16498], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Unternehmen Herbstnebel (\"Operation Autumn Mist\") was an offensive planned by German Field Marshal Walter Model and his Army Group B operational staff in late 1944 during World War II. It envisaged a German Army attack targeting the Allied forces in eastern Belgium and Luxembourg, east of the Meuse River. Are we justified in saying that \"Walter Model was the only Field Marshal who planned Operation Autumn Mist.\"? Yes, no, or maybe? Maybe\n###\nThe Sandlot is a 1993 American coming-of-age baseball film co-written and directed by David M. Evans, which tells the story of a group of young baseball players during the summer of 1962. It stars Tom Guiry, Mike Vitar, Karen Allen, Denis Leary and James Earl Jones. The filming locations were in Glendale, Midvale, Salt Lake City, and Ogden, Utah. Are we justified in saying that \" The filming locations were in Glendale, Seattle, Salt Lake City, and Ogden, Utah\"? Yes, no, or maybe? No\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines is a non-profit news organization. \"? Yes, no, or maybe? Yes\n###\nNo. 
27 Squadron RAAF is a Royal Australian Air Force (RAAF) reserve and ground support squadron located at RAAF Base Townsville, Queensland. The squadron was formed on 1 July 1981 to recruit and train RAAF reservists in the Townsville area and in July 2010 took on the additional role of providing support services to RAAF Base Townsville. Are we justified in saying that \"No. 27 Sqaudron RAAF is a well known squadron. \"? Yes, no, or maybe? Maybe\n###\nMystery is a 1990 novel by American author Peter Straub, and is the second installment in Straub's loosely connected \"Blue Rose Trilogy\". The novel falls into the genre of crime fiction, and was preceded by \"Koko\" and followed by \"The Throat\". The book was published by Dutton, won the 1993 Bram Stoker Award and was a 1994 WFA nominee Are we justified in saying that \"Peter Straub was an american author that wrote the novel Mystery just one year after his birth in 1990\"? Yes, no, or maybe?", "doc_id": 892, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42323, 22447, 44786, 35055], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stoked (stylized as \"St\u014dked\") is a Canadian animated series produced by Fresh TV that premiered on Teletoon on June 25, 2009 and ended on January 26, 2013. It formerly aired on Teletoon in Canada and ABC3 in Australia, and on Cartoon Network in the United States. The series is from the same creators as \"6teen\" and the \"Total Drama\" series. Are we justified in saying that \"Stoked was released in English\"? Yes, no, or maybe? Maybe\n###\nKinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California. Are we justified in saying that \"Santa Teresa is a fictional town\"? Yes, no, or maybe? Yes\n###\nJ\u00fcrgen Melzer (born 22 May 1981 in Vienna) is an Austrian tennis player. He reached a career-high singles ranking of world No. 8 in April 2011, and a doubles ranking of world No. 6 in September 2010. He is a left-handed tennis player, but is right-handed in everyday life. He has a younger brother, Gerald Melzer, with whom he has played doubles in several tournaments. Are we justified in saying that \"J\u00fcrgen Melzer starts with an A.\"? Yes, no, or maybe? No\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017. Are we justified in saying that \"The Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, but they aren't the game's developer. \"? Yes, no, or maybe? No\n###\nGenevieve LaCaze (born 4 August 1989) is an Australian athletics competitor who specialises in the 3000 metre steeplechase. 
She held an athletics scholarship at the University of Florida. She was selected to represent Australia at the 2012 Summer Olympics in London and Athletics at the 2016 Summer Olympics in Rio de Janeiro. LaCaze is of French, Italian and Spanish descent. Are we justified in saying that \"Genevieve LaCaze was born more than 1000 weeks ago.\"? Yes, no, or maybe?", "doc_id": 588, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15066, 22050, 19984, 40132], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "My Dinner with Herv\u00e9 is an upcoming American television drama film directed and written by Sacha Gervasi based on the later days of actor Herv\u00e9 Villechaize. The film stars Peter Dinklage as Villechaize, Jamie Dornan as a struggling journalist, and Andy Garc\u00eda as Ricardo Montalb\u00e1n, Villechaize\u2019s \"Fantasy Island\" co-star. Are we justified in saying that \"The film is about Villechaize's senior years.\"? Yes, no, or maybe? Yes\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris. Are we justified in saying that \"Yellow Ledbetter has acting.\"? Yes, no, or maybe? Yes\n###\nJames Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\". Are we justified in saying that \"James Wyatt won a literary award for \"Dungeon Master's Guide\".\"? Yes, no, or maybe? Maybe\n###\nJacob (Jake) Ruppert Jr. (August 5, 1867\u00a0\u2013 January 13, 1939) was an American brewer, businessman, National Guard colonel and United States Congressman who served for four terms representing New York from 1899 to 1907. He also owned the New York Yankees of Major League Baseball from 1915 until his death in 1939. Are we justified in saying that \"Jake Ruppert crafted beverages made of barley and malt.\"? Yes, no, or maybe? Maybe\n###\nRed Earth, White Lies: Native Americans and the Myth of Scientific Fact is a book by Native American author Vine Deloria, originally published in 1995. The book's central theme is to criticize the scientific consensus which has, in his words, created \"a largely fictional scenario describing prehistoric North America\". Are we justified in saying that \"Red Earth, White Lies: Native Americans and the Myth of Scientific Fact has an A.\"? 
Yes, no, or maybe?", "doc_id": 50, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4514, 37880, 45073, 19820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology. Are we justified in saying that \"Alexadrov was an american citizen\"? Yes, no, or maybe? No\n###\nKdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity. Are we justified in saying that \"The company does not exist.\"? Yes, no, or maybe? No\n###\nThe New Pornographers is a Canadian indie rock band formed in 1997 in Vancouver, British Columbia. Presented as a musical collective of singer-songwriters and musicians from multiple projects, the band has released seven studio albums to critical acclaim for their use of multiple vocalists and elements of power pop incorporated into their music. Are we justified in saying that \"The New Pornographers are amateur pornographers\"? Yes, no, or maybe? No\n###\nWKKF \"(102.3 FM)\" - branded as Kiss 102.3 - is a Top 40 (CHR) station licensed to Ballston Spa, New York and serving the Capital District and Adirondacks. The station is owned by iHeartMedia and broadcasts at 102.3 FM at 4,100 watts ERP from a transmitter in Clifton Park, New York on a tower shared with WDCD-FM and WTMM-FM. Are we justified in saying that \"People in the Adirondacks can listen to Kiss\"? Yes, no, or maybe? Yes\n###\nJohnny Kidd (born Frederick Albert Heath, 23 December 1935 \u2013 7 October 1966) was an English singer and songwriter, best remembered as the lead vocalist for the rock and roll band Johnny Kidd & the Pirates. He was one of the few pre-Beatles British rockers to achieve worldwide fame, mainly for his 1960 hit, \"Shakin' All Over\". Are we justified in saying that \"Kidd died before he could see his song Shakin' All Over become a hit.\"? Yes, no, or maybe?", "doc_id": 154, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20097, 11211, 36689, 11642], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site. Are we justified in saying that \"The Cincinnati and Whitewater Canal Tunnel was constructed in 1832\"? Yes, no, or maybe? No\n###\nBovegno is a \"comune\" in the province of Brescia, in Lombardy. It borders the communes of Artogne, Berzo Inferiore, Bienno, Collio, Esine, Gianico, Irma, Marmentino and Pezzaze. It is pronounced B\u00f2vegno (\"B\u00f6egn\" in the local Eastern Lombard dialect). It is located in the valley named Val Trompia. Are we justified in saying that \"It is not located in a valley\"? Yes, no, or maybe? No\n###\nWalcha Shire is a local government area located in the New England region of New South Wales, Australia. The Shire is situated adjacent to the junction of the Oxley Highway and Thunderbolts Way and is 20 km east of the Main North railway line passing through Walcha Road. Are we justified in saying that \"The Shire is situated adjacent to the junction of the Oxley Highway and Thunderbolts Way and is 204 km east of the Main North railway line passing through Walcha Road.\"? Yes, no, or maybe? No\n###\nA Bad Girl in Harlem is the second studio album by Danish rock band New Politics, released on May 21, 2013 via RCA Records. The three members moved from Copenhagen to Brooklyn, where the material was recorded. Two singles were released, titled \"Harlem\" and \"Tonight You're Perfect\". Allmusic.com called the album 'hooky, infectious pop'. Are we justified in saying that \"New Politics released their first studio album on May 12, 2012.\"? Yes, no, or maybe? Maybe\n###\n\"Stagger Lee\", also known as \"Stagolee\" and other variants, is a popular American folk song about the murder of Billy Lyons by \"Stag\" Lee Shelton in St. Louis, Missouri at Christmas, 1895. The song was first published in 1911, and was first recorded in 1923 by Fred Waring's Pennsylvanians. A version by Lloyd Price reached number one on the \"Billboard\" Hot 100 in 1959. Are we justified in saying that \"The song Stagger Lee was first published over 100 years ago\"? Yes, no, or maybe?", "doc_id": 465, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8919, 4516, 11035, 7534], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Three Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. 
Are we justified in saying that \"The film was released last century.\"? Yes, no, or maybe? Yes\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Sushil Kumar Shinde was an amazing vice president of India. \"? Yes, no, or maybe? Maybe\n###\nLe roi malgr\u00e9 lui (\"King in Spite of Himself\" or \"The reluctant king\") is an op\u00e9ra-comique in three acts by Emmanuel Chabrier of 1887 with an original libretto by Emile de Najac and Paul Burani. The opera is revived occasionally, but has not yet found a place in repertory. Are we justified in saying that \"The chamber is from 1885\"? Yes, no, or maybe? No\n###\nVixen! is a 1968 satiric softcore sexploitation film directed by American motion picture director Russ Meyer. It was the first film to be given an X rating for its sex scenes, and was a breakthrough success for Meyer. The film was developed from a script by Meyer and Anthony James Ryan, and starred Erica Gavin. Are we justified in saying that \"Meyer professionally suffered for making a smut film.\"? Yes, no, or maybe? No\n###\nThe Wombats are an English rock band formed in Liverpool in 2003. The band is composed of lead vocalist and guitarist Matthew Murphy, drummer Dan Haggis, and bassist Tord \u00d8verland Knudsen, and has been since its inception. The band is signed to 14th Floor Records in the United Kingdom and Bright Antenna in the United States. The Wombats' albums have sold over 1 million copies worldwide. Are we justified in saying that \"The Wombats have 3 original band members.\"? Yes, no, or maybe?", "doc_id": 564, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17414, 25138, 19606, 35516], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ludwig Ruckdeschel (15 March 1907 \u2013 8 November 1986) was the Nazi \"Gauleiter\" of Bayreuth during final month of the \"Gau's\" existence before the collapse of Nazi Germany in 1945. Before this, from 1933 to 1941, he served as the deputy of Gauleiter Fritz W\u00e4chtler, whom he had executed on orders by Martin Bormann. From 1933 to 1945 he was also a member of the German Parliament, the Reichstag. Are we justified in saying that \"Ruckdeschel zodiac sign was Pisces.\"? Yes, no, or maybe? Yes\n###\nMads Wiel Nygaard's Endowment is an annually awarded literary prize from the publishing house Aschehoug. The prize is a recognition of superior literary work. The publisher's editorial management makes the award based on their collective judgement of merit. Applications are not accepted. Are we justified in saying that \"Aschehoug is a respected publishing house.\"? Yes, no, or maybe? Maybe\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. 
They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Petasites is a type of cat\"? Yes, no, or maybe? No\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway. Are we justified in saying that \"There were no inhabitants of Princess Ragnhild Coast when Capt. Riiser-Larsen and Capt. Larsen discovered it in 1931.\"? Yes, no, or maybe? Maybe\n###\nSilent Scream (originally known as The Retreat) is an independent, horror film directed by Matt Cantu and Lance Kawas and starring Scott Vickaryous, Melissa Schuman and Shanti Lowry. It premiered at the Chicago Horror Film Festival on October 28, 2005 and was released on DVD on December 5, 2006. Are we justified in saying that \"Silent Scream premiered at the Chicago Horror Film Festival more than 3000 days ago.\"? Yes, no, or maybe?", "doc_id": 195, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7822, 44234, 30374, 38820], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Oceanus ( ; Greek: \u1f68\u03ba\u03b5\u03b1\u03bd\u03cc\u03c2 \"\u014ckean\u00f3s\", ] ), also known as Ogenus (\"Ogenos\", \u03a9\u03b3\u03b7\u03bd\u03bf\u03c2) or Ogen (\u03a9\u03b3\u03b7\u03bd), was a divine figure in classical antiquity, believed by the ancient Greeks and Romans to be the divine personification of the sea, an enormous river encircling the world. Are we justified in saying that \"Greeks and Romans foresaw that in English we would change it to \"Ocean\".\"? Yes, no, or maybe? Maybe\n###\nCharles Dera is an American pornographic actor, dancer, and model. He has performed in hundreds of heterosexual pornographic movies and is also \u2018The Veteran\u2019 in the male strip troupe \u2018Men of the Strip\u2019. In 2016, Charles Dera played the role of Donald Trump and Cherie Deville played the role of Hillary Clinton in the parody American elections for Brazzers. Are we justified in saying that \"Charles Dera is anti-Trump\"? Yes, no, or maybe? Maybe\n###\nMadava Farms is an 800-acre certified organic maple syrup enterprise located primarily in Dover, New York. The farm is the maker of Crown Maple Syrup, and it is considered to be the largest maple syrup production facility in North America. Are we justified in saying that \"Dover, New York is in northern New York.\"? Yes, no, or maybe? Maybe\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. 
There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The Gator Bowl was a game between the Arkansas Razorbacks and the Georgia Tech Yellow Jackets and it was played in cold weather.\"? Yes, no, or maybe? Maybe\n###\nCharlotte Marie Pomeline Casiraghi (born 3 August 1986) is the second child of Caroline, Princess of Hanover, and Stefano Casiraghi, an Italian industrialist. She is ninth in line to the throne of Monaco. Her maternal grandparents were Rainier III, Prince of Monaco, and American actress Grace Kelly. She is named after her maternal great-grandmother, Princess Charlotte, Duchess of Valentinois. Are we justified in saying that \"Casiraghi was born in the eighth month.\"? Yes, no, or maybe?", "doc_id": 482, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31429, 21493, 37795, 18849], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The shooting of John Crawford III occurred on August 5, 2014. Crawford was a 22-year-old African-American man shot to death by Beavercreek police officer Sean Williams, in a Walmart store in Beavercreek, Ohio, near Dayton, while holding a toy BB gun. Are we justified in saying that \"The victim was not holding a real gun.\"? Yes, no, or maybe? Yes\n###\nA Common Land Unit (CLU) is the smallest unit of land that has a permanent, contiguous boundary, a common land cover and land management, a common owner and a common producer in agricultural land associated with USDA farm programs. CLU boundaries are delineated from relatively permanent features such as fence lines, roads, and/or waterways. Are we justified in saying that \"A Common Land Unit is only a show.\"? Yes, no, or maybe? No\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery. Are we justified in saying that \"One of the busiest taverns in London is The Anchor Bankside tavern.\"? Yes, no, or maybe? Maybe\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists. Are we justified in saying that \"Contra Conspiracy is a 1993 action film\"? Yes, no, or maybe? No\n###\nThe Hun River (\u6e3e\u6cb3, \"the muddy river\") is a river in Liaoning Province, China, and was formerly one of the largest tributaries of the Liao River. It was also formerly known as Shen River (\u700b\u6c34). Two of Liaoning's most important cities, the provincial capital Shenyang and the seventh largest city Fushun, are located on the Hun River. 
Are we justified in saying that \"The Hun River is in the southern hemisphere\"? Yes, no, or maybe?", "doc_id": 808, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31294, 17478, 10396, 15351], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "KnowledgeWare was a software company headquartered in Atlanta, Georgia co-founded by James Martin and run by Fran Tarkenton. It produced a Computer Aided Software Engineering (CASE) tool called IEW (Information Engineering Workbench). KnowledgeWare was sold to Sterling Software in 1994, which was in its turn acquired by Computer Associates. Are we justified in saying that \"KnowledgeWare was sold in 1995.\"? Yes, no, or maybe? No\n###\nThe Kilpatrick and Beatty text-messaging scandal was a political-sex scandal emerging from a whistle blower lawsuit involving former Detroit Police Chief Gary Brown, Detroit Mayor Kwame Kilpatrick and his former Chief of Staff and paramour Christine Beatty. Are we justified in saying that \"The Kilpatrick and Beatty text-messaging scandal involved police chief Gary Brown and Donald Trump\"? Yes, no, or maybe? No\n###\nLive from the Gaiety is a live album by The Dubliners. It was recorded during the Irish leg of their tour celebrating forty years on the road. The double album was recorded at the Gaiety Theatre in Dublin in June 2002. All surviving members took part. A companion double DVD of the concert in its entirety was also released. Are we justified in saying that \"The album was recorded during the British leg of the tour\"? Yes, no, or maybe? No\n###\nJurassic Park is a 1993 video game based on the film and novel of the same name. It was developed and published by Ocean Software and released for the Nintendo Entertainment System (NES). Ocean also released \"Jurassic Park\" on the handheld Game Boy console. The Game Boy version is a port of the NES version. Are we justified in saying that \"The Jurassic Park video game released for the Nintendo Entertainment System (NES) was widely popular. \"? Yes, no, or maybe? Maybe\n###\nThe Sea Hornet is a 1951 American adventure film directed by Joseph Kane and written by Gerald Drayson Adams. The film stars Rod Cameron, Adele Mara, Lorna Gray, Chill Wills, Jim Davis and Richard Jaeckel. The film was released on November 6, 1951, by Republic Pictures. Are we justified in saying that \"The Sea Hornet was released after Thanksgiving Day in 1951\"? Yes, no, or maybe?", "doc_id": 333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11026, 4936, 28, 39383], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kiss of the Spider Woman is a musical with music by John Kander and Fred Ebb, with the book by Terrence McNally. It is based on the Manuel Puig novel \"El Beso de la Mujer Ara\u00f1a\". The musical had runs in the West End (1992) and Broadway (1993) and won the 1993 Tony Award for Best Musical. Are we justified in saying that \"The music from Kiss of the Spider Woman was written over the course of 6 months.\"? Yes, no, or maybe? Maybe\n###\nThe 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017. Are we justified in saying that \"The ATP Challenger Tour was cancelled in 2019\"? Yes, no, or maybe? Maybe\n###\nCherry Tomato () is a 2008 South Korean film starring Shin Goo and Kim Hyang-gi. The family drama, a directorial debut by Jung Young-bae, depicts the poverty-stricken life of an old man and his granddaughter that evokes a strong sense of sympathy and helplessness. It was screened at the Busan Children\u2019s Film Festival in 2008. Are we justified in saying that \"Jung Young-bae directed one film in 2008\"? Yes, no, or maybe? Maybe\n###\nSuntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles. Are we justified in saying that \"Sadhu Kokila has won many awards.\"? Yes, no, or maybe? Maybe\n###\nThe Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue. Are we justified in saying that \"The Sky Bar at the Drake Hotel in Toronto has street-level access.\"? Yes, no, or maybe?", "doc_id": 306, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20650, 37623, 5078, 15819], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arizona Business Magazine, based out of Phoenix, Arizona, is the state\u2019s leading monthly Business magazine. Published by AZ Big Media, the magazine covers a wide range of topics focusing on the Arizona business scene, and is aimed at high-level corporate executives and business owners. Are we justified in saying that \"People in Australia read this magazine.\"? Yes, no, or maybe? 
Maybe\n###\nFC Saturn-1991 Saint Petersburg (Russian: \u0424\u041a \u00ab\u0421\u0430\u0442\u0443\u0440\u043d\u20111991\u00bb \u0421\u0430\u043d\u043a\u0442\u2011\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433 ) was a Russian football team from Saint Petersburg. It played professionally from 1992 to 1995, including 3 seasons (1993\u20131995) in the second-highest Russian First Division. In 1996 it merged with FC Lokomotiv Saint Petersburg. Before 1995 it was called FC Smena-Saturn Saint Petersburg. Are we justified in saying that \"They were formed over 7 years ago\"? Yes, no, or maybe? Yes\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr. Are we justified in saying that \"There was at least one comedy film in 1947.\"? Yes, no, or maybe? Yes\n###\nAdwoa Aboah (born 18 May 1992) is a British fashion model and feminist activist, of Ghanaian origin In March 2017, she appeared on the cover of American Vogue with Liu Wen, Ashley Graham, Kendall Jenner, Gigi Hadid, Imaan Hammam, and Vittoria Ceretti. She has also been on the cover of Vogue Italia and i-D. Are we justified in saying that \"i-D magazine is an American magazine. \"? Yes, no, or maybe? Maybe\n###\nRylstone was a former electoral district of the Legislative Assembly in the Australian state of New South Wales, created in 1894 from part of Mudgee and named after and including Rylstone. It was abolished in 1904, with the downsizing of the Legislative Assembly after Federation. Are we justified in saying that \"Legislative Assembly was abolished 8 years after the founding\"? Yes, no, or maybe?", "doc_id": 857, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31556, 40668, 99, 12898], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Newlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005. Are we justified in saying that \"Newlyweds was a show about a boyband member and his wife.\"? Yes, no, or maybe? Yes\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology. Are we justified in saying that \"Pavel's brother wrote the three hundred papers attributed to Pavel.\"? Yes, no, or maybe? Maybe\n###\nGreed is the third studio album by American experimental rock band Swans. 
It was released in 1986, through record label K.422. \"Greed\" marks the slow turning point for Swans away from the harsh, brutal noise rock of prior releases, and is also the first Swans album to contain contributions from Jarboe. Are we justified in saying that \"Greed was released before 1985.\"? Yes, no, or maybe? No\n###\nTerry Butler is an American bassist who currently performs with the death metal bands Obituary and Massacre. He was also a member of Six Feet Under and Death. He was credited on the Death album \"Spiritual Healing\", and band leader Chuck Schuldiner stated that on the latter Death album \"Terry contributed to the songwriting as well\". Are we justified in saying that \"Terry Butler loves dogs\"? Yes, no, or maybe? Maybe\n###\nBronwen (] ) is a Welsh feminine given name. It is closely associated with the similar name \"Branwen\", which appears in medieval Welsh literature. Used in Wales since the 19th century, it was introduced to the English-speaking public at large by a character in the Richard Llewellyn novel \"How Green Was My Valley\" (1939). Are we justified in saying that \"Bronwen was a named based on a novel\"? Yes, no, or maybe?", "doc_id": 342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8825, 23792, 40962, 44612], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Texas Monthly v. Bullock 489 U.S. 1 (1989) was a case brought before the US Supreme Court in November 1988. The case (initiated by the publishers of \"Texas Monthly\", a well-known general-interest magazine in Texas) was to test the legality of a Texas statute that exempted religious publications from paying state sales tax. Are we justified in saying that \"Sales tax policy changed as a result of this action.\"? Yes, no, or maybe? Maybe\n###\nLeonard Edgcombe (died 1696) was a ship's captain with the Hudson's Bay Company who made a number of voyages into Hudson Bay and James Bay on behalf of the company. He had Henry Baley as a chief mate for a time prior to 1692 and this mariner became an important link with the area for the Hudson's Bay Company. Are we justified in saying that \"Leonard Edgcombe served in at least five jobs over the course of his life.\"? Yes, no, or maybe? Maybe\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent. Are we justified in saying that \"The electric chair position, in professional wrestling, involves flying on that prone opponent.\"? Yes, no, or maybe? Yes\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. 
With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th. Are we justified in saying that \"Vasili Vyacheslavovich Blagov starts with an A.\"? Yes, no, or maybe? No\n###\nJonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films. Are we justified in saying that \"saturday night live was a show from 1985 to 1990\"? Yes, no, or maybe?", "doc_id": 74, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26396, 45089, 6826, 38680], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The John Coltrane Home is a house in the Dix Hills neighborhood of Huntington, Suffolk County, New York, where saxophonist John Coltrane lived from 1964 until his death in 1967. It was in this home that he composed his landmark work, \"A Love Supreme\". Are we justified in saying that \"John Coltrane was a saxophonist.\"? Yes, no, or maybe? Yes\n###\nGeorge William Lyttelton, 4th Baron Lyttelton, {'1': \", '2': \", '3': \", '4': \"} (31 March 1817 \u2013 19 April 1876) was a British aristocrat and Conservative politician from the Lyttelton family. He was chairman of the Canterbury Association, which encouraged British settlers to move to New Zealand. Are we justified in saying that \"George William Lyttleton lived to be 72 years old.\"? Yes, no, or maybe? No\n###\nThe Leader of the Opposition of Singapore is usually the leader of the second largest political party represented in the Parliament of Singapore. During the 1955 Legislative Assembly election, the late Lee Kuan Yew was the \"de facto\" Leader of the Opposition, as the People's Action Party was then the second largest political party represented in the Legislative Assembly. Are we justified in saying that \"The Leader of the Opposition of Singapore is a unpopular political party\"? Yes, no, or maybe? Maybe\n###\nLudovic (Ludo) Coeck (25 September 1955 \u2013 9 October 1985) was a Flemish-Belgian footballer who played as left winger or central midfielder. His clubs included Berchem Sport, Anderlecht, Internazionale and Ascoli Calcio. He was capped for the Belgian national team 46 times. Are we justified in saying that \"Ludovic Coeck spent the majority of his playing time as a left winger.\"? Yes, no, or maybe? Maybe\n###\nBosch is an American police procedural web television series produced by Amazon Studios and Fabrik Entertainment. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show was developed for Amazon by Eric Overmyer and the first season takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"Bosch is Mexican.\"? 
Yes, no, or maybe?", "doc_id": 471, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20852, 37077, 22013, 4847], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel. Are we justified in saying that \"kiss and tell was an objectively bad song\"? Yes, no, or maybe? Maybe\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit. Are we justified in saying that \"Sandstone can be found near the top of a mountain in the Spring Mountain range.\"? Yes, no, or maybe? Yes\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Ralph D. Malone was tall.\"? Yes, no, or maybe? Maybe\n###\nHawthorne is a census-designated place (CDP) in Mineral County, Nevada, United States. At the 2010 census, the population was 3,269, a decrease since the 2000 census, when it was 3,311. It is the county seat of Mineral County. The nearby Hawthorne Army Depot is the primary economic base of the town. Are we justified in saying that \"Hawthorne Army Depot which is the primary economic base of the town,is located far away from the town.\"? Yes, no, or maybe? No\n###\nTim Witherspoon (born December 27, 1957) is an American former professional boxer who competed from 1979 to 2003. He is a two-time world heavyweight champion, having held the WBC title in 1984, and the WBA title in 1986. Upon winning his second world title, Witherspoon joined Floyd Patterson and Muhammad Ali as the only boxers to win multiple world heavyweight championships. Are we justified in saying that \"Witherspoon was boxing while Jimmy Carter lived in the white house.\"? Yes, no, or maybe?", "doc_id": 993, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4455, 1555, 4540, 19710], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "True as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"True as a Turtle has a different author or the screenplay and the original novel.\"? Yes, no, or maybe? No\n###\nIdris Sultan (born January 1993) is a Tanzanian Actor and comedian, actor and radio host who won the Big Brother Africa-Hotshots in 2014. He hosts the biggest comedy news show called SIO HABARI, he also hosts a radio show called MWB(Mji wa burudani) on ChoiceFm Tanzania. Are we justified in saying that \"Idris Sultan is a Tanzanian actor, comedian, and radio host who hosts a music radio show called MWB on ChoiceFM Tanzania.\"? Yes, no, or maybe? Maybe\n###\nReturn to Paradise is a live album by Styx, released in 1997. It features songs from their successful reunion tour with Tommy Shaw, but without John Panozzo, who died in July 1996. It includes three new studio tracks, including \"Dear John\", which Shaw wrote as a tribute to Panozzo. Are we justified in saying that \"Styx both gained and lost a member between 1996 and 1997. \"? Yes, no, or maybe? Yes\n###\nRecently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct. Are we justified in saying that \"Eighty mammal species have become extinct.\"? Yes, no, or maybe? Yes\n###\nLathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson. Are we justified in saying that \"Lathan McKay is fluent in English.\"? Yes, no, or maybe?", "doc_id": 28, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21140, 11234, 11318, 28000], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Little Fluffy Gigolo Pelu (Japanese: \u30d5\u30a1\u30f3\u30b7\u30fc\u30b8\u30b4\u30ed \u30da\u30eb , Hepburn: Fansh\u012b Jigoro Peru , a.k.a. \"Fancy Gigolo Pelu\") is a three \"tank\u014dbon\" manga series written and illustrated by Junko Mizuno and published by Enterbrain. The series has been licensed in North America and France where the first volume received mostly positive reviews. Are we justified in saying that \"Junko made a series of comics that got published\"? Yes, no, or maybe? 
Yes\n###\nYissachar Dov Rokeach (born 19 January 1948) is the fifth and present Rebbe of the Hasidic dynasty of Belz. He is the son of Rabbi Mordechai of Bilgoray (1902 \u2013 1949), the grandson of the third Belzer Rebbe, Rabbi Yissachar Dov Rokeach, and the nephew of the fourth Belzer Rebbe, Rabbi Aharon Rokeach, who raised him. He has led Belz since 1966. Are we justified in saying that \"Yissachar Dov Rokeach is 71 years old.\"? Yes, no, or maybe? Yes\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy. Are we justified in saying that \"Daniel Murphy was born February 24, 1996.\"? Yes, no, or maybe? Maybe\n###\nKew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge. Are we justified in saying that \"kew bridge railway station is in hounslow\"? Yes, no, or maybe? Yes\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. He is currently the quarterback coach at Garinger High School in Charlotte, NC. Are we justified in saying that \"Tory Woodbury was born nine days after Independence Day.\"? Yes, no, or maybe?", "doc_id": 787, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37496, 27613, 38868, 35031], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wanker Records is a Marburg, Hessen-based independent record label, focused on punk rock, which was started by Nick Wanker (founder of the \"\"Pot- Porn- Electro- Punk\"\" Band Frank Fortuna, guitarist for the punk rock bands Fuzzbeer, Fu\u00dfpils, the Heartbreakers and One Way Down) in 2001. Are we justified in saying that \"Wanker Records is a bad record label \"? Yes, no, or maybe? Maybe\n###\nThe Benetton B188 was a Formula One racing car designed by Rory Byrne and raced by Benetton team in the 1988 Formula One season and in the first half of the 1989 Formula One season. Dating back to when the team started as Toleman in , the B188 was the first car produced by the team not to be powered by a turbocharged engine. Are we justified in saying that \"Rory Byrne only designed race cars\"? Yes, no, or maybe? Maybe\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. 
Are we justified in saying that \"the narration in the book New Day used Jamacian venacular \"? Yes, no, or maybe? Yes\n###\nAlbert Ernest Clifford \"Cliff\" Young, OAM (8 February 19222 November 2003) was an Australian potato farmer and athlete from Beech Forest, Victoria, best known for his unexpected win of the inaugural Sydney to Melbourne Ultramarathon in 1983 at 61 years of age. Are we justified in saying that \"Potato farming is more important than sports\"? Yes, no, or maybe? Maybe\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. Are we justified in saying that \"Both parents and children love Amy Timberlake's books.\"? Yes, no, or maybe?", "doc_id": 138, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40737, 31328, 20305, 29984], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robin Weaver is an English actress well known for playing the role of recurring character Pamela Cooper in the E4 sitcom \"The Inbetweeners\" and its feature-length films, \"The Inbetweeners Movie\" and \"The Inbetweeners 2\". She has also appeared in several TV commercials. Are we justified in saying that \"Robin Weaver acts with an english accent\"? Yes, no, or maybe? Maybe\n###\nSulakshana is an Indian actress born on August 1 ,1965 who has performed in Tamil, Telugu, Kannada and Malayalam films at the age of two and half in the movie Kaaviya Thalaivi as child Krishna in the name of Dolly . After that she acted in Thulabharam as child artist in Tamil,Telugu,Malayalam and Hindi (all version) in the name of Rajani . Are we justified in saying that \"She hates acting a lot\"? Yes, no, or maybe? Maybe\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style. Are we justified in saying that \"Real Fuerza A\u00e9rea is the top ranked mexican wrestling team.\"? Yes, no, or maybe? Maybe\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo. Are we justified in saying that \"Song'z U Can't Find did not include an artist named E040.\"? Yes, no, or maybe? No\n###\nBen Marshall (born 8 June 1990) is a retired rugby union player from Ireland. He primarily played as a lock or in the back row. 
Marshall played for Irish provincial sides Leinster and Connacht in the Pro12, but was forced to retire in 2017 due to a concussion injury. Are we justified in saying that \"Marshall retired from the game while in his 20s.\"? Yes, no, or maybe?", "doc_id": 101, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11651, 25973, 32066, 37154], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dickinson Theodore Roosevelt Regional Airport (IATA: DIK,\u00a0ICAO: KDIK,\u00a0FAA LID: DIK) , formerly known as Dickinson Municipal Airport, is a public use airport located five\u00a0nautical miles (6\u00a0mi, 9\u00a0km) south of the central business district of Dickinson, in Stark County, North Dakota, United States. It is owned by the Dickinson Airport Authority. Are we justified in saying that \"Dickinson Municipal Airport is located in South Dakota.\"? Yes, no, or maybe? No\n###\n\"Ellens dritter Gesang \" (\"Ellens Gesang III \", D. 839, Op. 52, No. 6, 1825), in English: \"Ellen's Third Song\", was composed by Franz Schubert in 1825 as part of his Opus 52, a setting of seven songs from Walter Scott's popular epic poem \"The Lady of the Lake\", loosely translated into German. Are we justified in saying that \"Franz Schuber was bored ofThe Lady of the Lake\"? Yes, no, or maybe? Maybe\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km . Are we justified in saying that \"Jiaozhou Bay Bridge is located in a quiet part of China\"? Yes, no, or maybe? Maybe\n###\nDance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster. Are we justified in saying that \"Dance India Dance is similar to Dance Bangla Dance.\"? Yes, no, or maybe? Yes\n###\nReturn to Paradise is a live album by Styx, released in 1997. It features songs from their successful reunion tour with Tommy Shaw, but without John Panozzo, who died in July 1996. It includes three new studio tracks, including \"Dear John\", which Shaw wrote as a tribute to Panozzo. Are we justified in saying that \"Return to Paradise was released before the death of John Panozzo.\"? Yes, no, or maybe?", "doc_id": 824, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11603, 43388, 30402, 1373], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Eglinton Castle estate was situated at Irvine, on the outskirts of Kilwinning, North Ayrshire, Scotland (map reference NS 3227 42200) in the former district of Cunninghame. Eglinton Castle, was once home to the Montgomeries, Earls of Eglinton and chiefs of the Clan Montgomery. Eglinton Country Park now occupies part of the site. Are we justified in saying that \"Scotland contains many castles.\"? Yes, no, or maybe? Maybe\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions. Are we justified in saying that \"Project Gasbuggy was carried out more than 9000 days ago.\"? Yes, no, or maybe? Yes\n###\nStuart is a city in, and the seat of, Martin County, Florida, United States. Located on Florida's Treasure Coast, Stuart is the largest of four incorporated municipalities in Martin County. The population was 15,593 in the 2010 census. It is part of the Port St. Lucie, Florida Metropolitan Statistical Area. Are we justified in saying that \"It has a higher population now\"? Yes, no, or maybe? Maybe\n###\nSherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983. Are we justified in saying that \"Sherwood Stewart was born after 1940.\"? Yes, no, or maybe? Yes\n###\nKhan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht. Are we justified in saying that \"\"Khan Kluay\" features a main character that doesn't live on land.\"? Yes, no, or maybe?", "doc_id": 98, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25066, 29532, 8886, 364], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"Nathan MacKinnon wanted to become a businessman rather than play hockey professionally \"? Yes, no, or maybe? 
Maybe\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street. Are we justified in saying that \"Wellingore has been visited by george.\"? Yes, no, or maybe? Maybe\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"Atlanta was always the headquarters of Home Depot.\"? Yes, no, or maybe? Maybe\n###\nNo Devotion are a Welsh/American alternative rock band formed in 2014. They are composed of American vocalist Geoff Rickly (of the band Thursday) from New Jersey, and former band members of the Welsh band Lostprophets. The band formed in the wake of Lostprophets' dissolution in 2013. Are we justified in saying that \"Geoff Rickley was born in New Jersey\"? Yes, no, or maybe? Maybe\n###\nErik Jacobsen (born May 19, 1940) is an American record producer, song publisher and artist manager. He is best known for his work in the 1960s with Tim Hardin, The Lovin' Spoonful, The Charlatans, Sopwith Camel, and later with Norman Greenbaum and Chris Isaak. Though semi-retired, Jacobsen continues to manage many of his published songs and masters for various uses. Are we justified in saying that \"Erik Jacobsen worked in the 1950s.\"? Yes, no, or maybe?", "doc_id": 670, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42835, 8928, 111, 11280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brian Wardle (born October 9, 1979) is an American college basketball coach and the current men's basketball coach at Bradley University. He was an assistant at Marquette from 2003\u20132005 and UW-Green Bay from 2005\u20132010. After the 2009-2010 season, Wardle was named head coach at UW-Green Bay. Upon his hiring, Wardle became the youngest head coach in NCAA Division I basketball. Are we justified in saying that \"Brian Wardle has only coached in NCAA Division I basketball.\"? Yes, no, or maybe? No\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"Three Little Sisters was made to make people laugh.\"? Yes, no, or maybe? Yes\n###\nEarlly Mac is an American rapper from Detroit, Michigan, who is best known for his collaborations with American rapper Big Sean. In 2010 he released his debut mixtape, \"Party Up!\". His debut EP, \"God Knows\", was released in January 2015, by Foolay Ent., LLC. 
The EP included the single \"Do It Again\" featuring Big Sean, which peaked at number 6 on the \"Billboard\" Twitter Emerging Artists chart. Are we justified in saying that \"the EP was his best work\"? Yes, no, or maybe? Maybe\n###\nJulia Goldani Telles (born March 18, 1995) is an American actress and ballet dancer. She is best known for her supporting role as Whitney Solloway on the Showtime original series \"The Affair\" and as Sasha Torres on the short-lived ABC Family series \"Bunheads\". Are we justified in saying that \"Julia Goldani Telles was born in Michigan on March 18, 1995.\"? Yes, no, or maybe? Maybe\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents. Are we justified in saying that \"Kapp Heights is in the Northern part of the United States.\"? Yes, no, or maybe?", "doc_id": 607, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27091, 30316, 24524, 9250], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Yahoo Serious (born 27 July 1953), born Greg Gomez Pead (name-change by deed poll in 1980), is an Australian film actor, director, and score composer. He is best known for his 1988 comedy film \"Young Einstein\". He also created \"Reckless Kelly\" in 1993 and \"Mr. Accident\" in 2000. Serious writes, directs, produces, stars in, and has composed the scores for his movies. Are we justified in saying that \"Yahoo Serious is a man with too much time on his hands\"? Yes, no, or maybe? Maybe\n###\nThe Interview is a 1998 Australian thriller film from writer-director Craig Monahan, and is the first of two films directed by Monahan. Almost the entire film takes place in a police interrogation room, with some short flashback sequences, and the cast consists primarily of three key actors\u2014Hugo Weaving, Tony Martin, and Aaron Jeffery. Are we justified in saying that \"There are 4 actors in The Interview.\"? Yes, no, or maybe? No\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Due to its large rhubarb-like leaves during the growing season it is mistaken for Rhubarb plant\"? Yes, no, or maybe? Maybe\n###\nInterstate 29 (I-29) is an Interstate Highway in the Midwestern United States. I-29 runs from Kansas City, Missouri, at a junction with Interstate 35 and Interstate 70, to the Canada\u2013US border near Pembina, North Dakota, where it connects with Manitoba Highway 75. Are we justified in saying that \"Interstate 29 is under construction at this time.\"? Yes, no, or maybe? Maybe\n###\nThe Local Government (Northern Ireland) Act 1972 (1972 c. 
9) was an Act of the Parliament of Northern Ireland that constituted district councils to administer the twenty-six local government districts created by the Local Government (Boundaries) Act (Northern Ireland) 1971, and abolished the existing local authorities in Northern Ireland. Are we justified in saying that \"Local authority was removed in favor of localized governance \"? Yes, no, or maybe?", "doc_id": 372, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24561, 36924, 34387, 30931], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\". Are we justified in saying that \"Mixtapes is an Ohio/Detroit based pop punk band.\"? Yes, no, or maybe? Yes\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label. Are we justified in saying that \"Bad Company has played with Led Zeppelin.\"? Yes, no, or maybe? Maybe\n###\nIn poker the term Triple Crown is used for winning a poker title on the three major poker tours: The World Series of Poker (WSOP), World Poker Tour (WPT) and up to 2016 the European Poker Tour (EPT). Since the EPT has been discontinued and rebranded as the PokerStars Championship, those wins are considered part of the crown. Are we justified in saying that \"EPT has not been discontinued \"? Yes, no, or maybe? No\n###\nThe Tenpin Bowling Association of Wales (Welsh: \"Cymdeithas Bowlio Deg Cymru\" ) (TBAW) is the national governing body for tenpin bowling in Wales. It is a member of the F\u00e9d\u00e9ration Internationale des Quilleurs (English: International Bowling Federation ) and the European Tenpin Bowling Federation \u2013 the \"European Zone\" of the World Tenpin Bowling Association. Are we justified in saying that \"International bowling federation when abbreviated is IBF\"? Yes, no, or maybe? Yes\n###\nHolly Weber (born September 20, 1984) is an American glamour model and actress. As a model, she has appeared in \"Maxim\", \"FHM\", \"Muscle & Fitness\", \"Glamour\", and as no. 66 on AskMen's Top 99 Most Desirable Women of 2009. She has made uncredited appearances in a number of movies and TV series. Are we justified in saying that \"Holly Weber was born more than 1000 weeks ago.\"? Yes, no, or maybe?", "doc_id": 343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25862, 36130, 18706, 33119], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character. Are we justified in saying that \"Jeffrey Reiner went on to direct Superman instead.\"? Yes, no, or maybe? Maybe\n###\nLamarck Island is a rocky island 250 m long, lying 300 m east of Petrel Island and 300 m north-east of Rostand Island in the G\u00e9ologie Archipelago, off the Ad\u00e9lie Coast of Antarctica. It was charted in 1951 by the French Antarctic Expedition and named by them after Jean-Baptiste Lamarck, the French naturalist. Are we justified in saying that \"Jean-Baptiste Lamarck was proud to have the island named after him.\"? Yes, no, or maybe? Maybe\n###\nThe Lonely Londoners is a 1956 novel by Tamil Trinidadian author Samuel Selvon. Its publication marked the first literary work focusing on poor, working-class blacks in the beat writer tradition following the enactment of the British Nationality Act 1948. Are we justified in saying that \"A Tamil Trinidadian has written a book before\"? Yes, no, or maybe? Yes\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"The lead singer of Foals always plays guitar.\"? Yes, no, or maybe? Maybe\n###\nThe Hyundai Genesis Coup\u00e9 is a rear-wheel drive sports coupe from Hyundai Motor Company, released on October 13, 2008 for the Korean market. It is Hyundai's first rear-wheel drive sports coupe, and shares its basic platform with the Hyundai Genesis luxury sedan. Are we justified in saying that \"The Hyundai Genesis Coup\u00e9 was released on October 13, 2008 world wide\"? Yes, no, or maybe?", "doc_id": 629, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26214, 31016, 23948, 12626], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster. Are we justified in saying that \"Mithun Chakraborty really likes being the Grandmaster\"? Yes, no, or maybe? Maybe\n###\nCharles Dera is an American pornographic actor, dancer, and model. 
He has performed in hundreds of heterosexual pornographic movies and is also \u2018The Veteran\u2019 in the male strip troupe \u2018Men of the Strip\u2019. In 2016, Charles Dera played the role of Donald Trump and Cherie Deville played the role of Hillary Clinton in the parody American elections for Brazzers. Are we justified in saying that \"Charles Dera has never had a gay experience\"? Yes, no, or maybe? Maybe\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"The Cleveland Browns were Superbowl champions while Malone played for them.\"? Yes, no, or maybe? Maybe\n###\nThe Perfect Gift is a 2009 spinoff of the 2005 Christian drama movie \"The Perfect Stranger\", and its first sequel, \"Another Perfect Stranger\". It stars Christina Fougnie, Amy Hess, Matt Wallace, and Jefferson Moore once again as Jesus Christ. It was filmed almost entirely in Kentucky, where the first two movies in the series were not. Are we justified in saying that \"The Perfect Stranger was released in 2005.\"? Yes, no, or maybe? Yes\n###\nThe following details notable events from the year 2005 in Northern Ireland. Northern Ireland is a part of the United Kingdom in the north-east of the island of Ireland. It is variously described as a country, province or region of the UK, amongst other terms. Northern Ireland shares a border with the Republic of Ireland to the south and west. Are we justified in saying that \"Northern Ireland has a border with the United Kingdom.\"? Yes, no, or maybe?", "doc_id": 514, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9605, 41025, 11875, 3614], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Coldwater fish, in the context of aquariums, refers to fish species that prefer cooler water temperatures than tropical fish, typically below 20 \u00b0C . Some examples are koi and goldfish. These species tend to grow more slowly and live longer than fish that live in warmer waters, and are generally felt to be easier to keep. Are we justified in saying that \"Tropical fish prefer cooler water temperatures, typically below 20 degrees Celsius.\"? Yes, no, or maybe? No\n###\nLe roi malgr\u00e9 lui (\"King in Spite of Himself\" or \"The reluctant king\") is an op\u00e9ra-comique in three acts by Emmanuel Chabrier of 1887 with an original libretto by Emile de Najac and Paul Burani. The opera is revived occasionally, but has not yet found a place in repertory. Are we justified in saying that \"The opera is in French.\"? Yes, no, or maybe? Maybe\n###\nThe Mnet Asian Music Award for Best Collaboration is an award presented annually by CJ E&M Pictures (Mnet). 
It was first awarded at the 12th Mnet Asian Music Awards ceremony held in 2010; singers Ga-in & Jo Kwon won the award for their song \"We Fell in Love\", and it is given in honor for the artists with the most artistic achievement in collaboration performances in the music industry. Are we justified in saying that \"The Mnet Asian Music Award is a disliked show in South korea\"? Yes, no, or maybe? Maybe\n###\n\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"\"Bosch\" features events that take place in California.\"? Yes, no, or maybe? Yes\n###\nThe \"Charleston\"-class amphibious cargo ships were a class of amphibious cargo ships in service with the United States Navy. These ships served in Amphibious Readiness Groups between 1968 and 1994. The ships were the last amphibious cargo ships built for the U.S. Navy, their role having been taken over by amphibious transport docks. Are we justified in saying that \"he \"Charleston\"-class was made more than 0 times\"? Yes, no, or maybe?", "doc_id": 668, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9065, 36950, 39105, 43839], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central. Are we justified in saying that \"The Daily Show is also shown in Canada. \"? Yes, no, or maybe? Maybe\n###\nThomas \"Tommy\" Lucchese (pronounced ] ; born Gaetano Lucchese, December 1, 1899 \u2013 July 13, 1967) was a Sicilian-born American gangster and founding member of the Mafia in the United States, an offshoot of the \"Cosa Nostra\" in Sicily. From 1951 until 1967, he was the boss of the Lucchese crime family, one of the Five Families that dominates organized crime in New York City. Are we justified in saying that \"Thomas \"Tommy\" Lucchese (pronounced ] ; born Gaetano Lucchese, December 2, 1899 \u2013 July 13, 1967) was a Sicilian-born American gangster and founding member of the Mafia in the United States, an offshoot of the \"Cosa Nostra\" in Sicily.\"? Yes, no, or maybe? No\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"The band added more group members which made them fall apart.\"? Yes, no, or maybe? 
Maybe\n###\nTight is the debut album by the American rock band Mindless Self Indulgence. The album was originally released on April 20, 1999 through Uppity Cracker Recording Group. After having been out of print for many years, the album was reissued as Tighter on April 26, 2011 through The End Records. The reissue features updated artwork and packaging, 12 previously unreleased tracks, and a bonus DVD. Are we justified in saying that \"The 1999 release features a dozen fewer songs than the 2011 release.\"? Yes, no, or maybe? Yes\n###\nThe United Nations Peacekeepers Medal (Irish: \"An Bonn Chosant\u00f3ir\u00ed Sioch\u00e1na na N\u00e1isi\u00fan Aontaithe\" ) is awarded to those members of the Irish Defence Forces or Chaplaincy Service who have served overseas on a United Nation Mission or United Nations Mandated Mission. Are we justified in saying that \"All members of the Irish Defence Forces have received the United Nations Peacekeepers Medal.\"? Yes, no, or maybe?", "doc_id": 720, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16650, 6174, 6087, 34002], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships. Are we justified in saying that \" Cashman Center only allows national events once per year.\"? Yes, no, or maybe? Maybe\n###\nThe Lei \u00c1urea (] ; English: Golden Law ), adopted on May 13, 1888, was the law that abolished slavery in Brazil. It was signed by Isabel, Princess Imperial of Brazil (1846\u20131921), an opponent of slavery, who acted as regent to Emperor Dom Pedro II, who was in Europe. Are we justified in saying that \"Isabel, Princess Imperial of Brazil did not like slavery after watching it in America\"? Yes, no, or maybe? Maybe\n###\nSuperior is a town in and the county seat of Mineral County, Montana, United States. The population was 812 at the 2010 census. Superior was named after its founders' hometown of Superior, Wisconsin in 1869. The post office was established in 1871 after Mineral County became the site of one of the largest gold strikes that helped settle the West. Are we justified in saying that \"The Post Office was important for Mineral County\"? Yes, no, or maybe? Maybe\n###\nCranborne Priory was a priory in Cranborne in Dorset, England. The priory church survives as Cranborne's parish church, the Church of St\u00a0Mary and St\u00a0Bartholomew, and is a Grade I listed building, with parts of the building dating back to the 12th century. Are we justified in saying that \"Cranborne is in Dorset England\"? Yes, no, or maybe? Yes\n###\nDopamine is the fifth studio album by American rock band Third Eye Blind, released on June 16, 2015. 
It is the band's first studio album since 2009's \"Ursa Major.\" The album's first single, \"Everything Is Easy,\" was released on May 8, 2015, along with a cover version of the Beyonc\u00e9 song \"Mine.\" The album debuted at No. 13 on the Billboard 200, selling over 21,000 copies in its first week. Are we justified in saying that \"The album was released in the century after the century of the 1900's\"? Yes, no, or maybe?", "doc_id": 526, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1718, 8189, 19139, 12503], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song. Are we justified in saying that \"The Cars have released at least four albums.\"? Yes, no, or maybe? Yes\n###\nThe Bajo de la Carpa Formation is a geologic formation that outcrops in Patagonia, in the provinces of R\u00edo Negro and Neuqu\u00e9n, Argentina. It is the first of two formations belonging to the R\u00edo Colorado Subgroup within the Neuqu\u00e9n Group. Formerly that subgroup was treated as a formation, and the Bajo de la Carpa Formation was known as the Bajo de la Carpa Member. Are we justified in saying that \"Bajo de la Carpa Formation is the second of two formations belonging to the Rio Colorado Subgroup.\"? Yes, no, or maybe? No\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\". Are we justified in saying that \"Crawling is a song by Linkin Park\"? Yes, no, or maybe? Yes\n###\nThis is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists. Are we justified in saying that \"Some of the editorial cartoonists in the list received more pay in their careers than others.\"? Yes, no, or maybe? Maybe\n###\nRye St Antony School is an independent Roman Catholic boarding and day school for girls aged 3 to 18 and boys up to age 8 in Headington, Oxford, England. It is commonly abbreviated and referred to by both pupils and staff as 'Rye'. Rye is unique as a girls\u2019 independent Catholic school founded by lay women rather than by a religious order. Are we justified in saying that \"You must be Roman Catholic to attend Rye St Antony School.\"? 
Yes, no, or maybe?", "doc_id": 872, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1678, 11329, 41015, 5025], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Have You Ever Met That Funny Reefer Man\", often known simply as \"The Reefer Man\", is a 1932 American jazz song composed by J. Russel Robinson, with lyrics by Andy Razaf. It was first recorded by Cab Calloway and his orchestra, with versions by others over the years, including by Harlan Lattimore, Murphy's Law and Big Bad Voodoo Daddy. Are we justified in saying that \"The song was released in 1931\"? Yes, no, or maybe? No\n###\nThe 2007 North Indian Ocean cyclone season was an event in the annual cycle of tropical cyclone formation. The North Indian Ocean cyclone season has no official bounds, but cyclones tend to form between April and December, with peaks in May and November. These dates conventionally delimit the period of each year when most tropical cyclones form in the northern Indian Ocean. Are we justified in saying that \"Cyclones exclusively form between april and december\"? Yes, no, or maybe? No\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles. Are we justified in saying that \"The movie has a cult following\"? Yes, no, or maybe? Yes\n###\nAnimation Domination was an animated programming block which originally aired from May 1, 2005, until September 21, 2014, on the Fox network. The block aired on Sunday evenings through the entirety of that night's primetime schedule (unless preempted, usually by sports telecasts). Are we justified in saying that \"Animation Domination aired in the evenings of the first day of the american week.\"? Yes, no, or maybe? Yes\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label. Are we justified in saying that \"Nightmare was created before Lance King reached 30 years of age.\"? Yes, no, or maybe?", "doc_id": 449, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34266, 15138, 4905, 31612], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"She was forced to do pageants as a kid\"? Yes, no, or maybe? Maybe\n###\nUSFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico. Are we justified in saying that \"The USFC \"Fish Hawk\" was in operation for almost half a century. \"? Yes, no, or maybe? Yes\n###\nXiaogan () is a prefecture-level city in east-central Hubei province, People's Republic of China, some 60 km northwest of the provincial capital of Wuhan. According to the 2010 census, its population totaled 4,814,542, of whom 908,266 lived in the built-up (\"or metro\") area of Xiaonan District. Are we justified in saying that \"Wuhan had a population of 4,814,543 in 2010.\"? Yes, no, or maybe? Maybe\n###\nBig Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism. Are we justified in saying that \"The population is now over 5 thousand\"? Yes, no, or maybe? Maybe\n###\n\"Stephen, Stephen\" is a song by American rock band, The Apples in Stereo. The song made its debut on December 20, 2006 on the Comedy Central program \"The Colbert Report\" where it was performed by Apples frontman, Robert Schneider during Episode number 193. Are we justified in saying that \"\"Stephen, Stephen\" was released the week before Christmas Day.\"? Yes, no, or maybe?", "doc_id": 592, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7229, 28531, 34435, 1021], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Spring Fine Art Exhibition of Leningrad artists (Leningrad, 1954) (Russian: \"\"\u0412\u0435\u0441\u0435\u043d\u043d\u044f\u044f \u0432\u044b\u0441\u0442\u0430\u0432\u043a\u0430 \u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0435\u043d\u0438\u0439 \u043b\u0435\u043d\u0438\u043d\u0433\u0440\u0430\u0434\u0441\u043a\u0438\u0445 \u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a\u043e\u0432 1954 \u0433\u043e\u0434\u0430\"\" ) become one of the largest Soviet Art Exhibition of 1954 and one of the first Art Exhibitions after Stalin death. The Exhibition took place in Leningrad Union of Soviet Artists Exhibition Halls on Bolshaya Morskaya st. 38. Are we justified in saying that \"Spring Fine Art Exhibition of Leningrad was the most popular art show in Russia\"? Yes, no, or maybe? Maybe\n###\nThe 2010 ASB Classic was a women's tennis tournament played on outdoor hard courts. It was the 25th edition of the ASB Classic, and was part of the WTA International tournaments of the 2010 WTA Tour. It took place at the ASB Tennis Centre in Auckland, New Zealand, from 4 January through 9 January 2010. Yanina Wickmayer won the singles title. Are we justified in saying that \"The tournament are played in outdoors hard courts.\"? Yes, no, or maybe? Yes\n###\nSusarion (Greek: \u03a3\u03bf\u03c5\u03c3\u03b1\u03c1\u03af\u03c9\u03bd) was an Archaic Greek comic poet, was a native of Tripodiscus in Megaris (see Megara) and is considered one of the originators of metrical comedy and, by others, he was considered the founder of Attic Comedy. Nothing of his work, however, survives except one iambic fragment (see below) and this is not from a comedy but instead seems to belong within the Iambus tradition. Are we justified in saying that \"The surviving works of Susarion are not of the sort from which he is renowned.\"? Yes, no, or maybe? Yes\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced. Are we justified in saying that \"Splice is a sci fi horror movie about genetic engineering but it's also about a married couple\"? Yes, no, or maybe? Maybe\n###\n\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012. Are we justified in saying that \"\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want! I Want!\".\"? 
Yes, no, or maybe?", "doc_id": 665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10969, 44247, 31920, 5668], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rock Star Supernova was a reality television-formed supergroup consisting of drummer Tommy Lee (M\u00f6tley Cr\u00fce), bassist Jason Newsted (ex-Metallica), guitarist Gilby Clarke (ex-Guns N' Roses) and singer Lukas Rossi. The band was formed during the second season of the Rock Star Reality TV series which was called . Are we justified in saying that \"Newsted used to be a part of Metallica\"? Yes, no, or maybe? Yes\n###\nCloverdale Depot is a bus station and future intermodal station in Cloverdale, California. It is served by Amtrak Thruway and Sonoma County Transit buses. Additional service to Sonoma County Airport station is provided by Sonoma County Transit under contract by Sonoma\u2013Marin Area Rail Transit. Are we justified in saying that \"Sonoma\u2013Marin Area Rail Transit and Amtrak Thruway collaborate.\"? Yes, no, or maybe? Maybe\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949. Are we justified in saying that \"CUHK was established before 1964\"? Yes, no, or maybe? Yes\n###\nShehzad Sheikh or Shahzad Sheikh is a Pakistani film and television actor and model, known for playing the lead role in the 2015 film \"Karachi Se Lahore\". He also starred in the series \"Annie Ki Ayegi Baraat\", \"Mi Raqsam\", and \"Mere Hamrahi\", and a TV film \"Main Kukkoo Aur woh\". He is the son of well-known actor Javed Sheikh. Are we justified in saying that \"Shehzad Sheikh was born in Karachi.\"? Yes, no, or maybe? Maybe\n###\nThe 1998 Idaho Vandals football team represented the University of Idaho in the 1998 NCAA Division I-A football season. The Vandals, led by fourth-year head coach Chris Tormey, were members of the Big West Conference and played their home games at the Kibbie Dome, an indoor facility on campus in Moscow, Idaho. Are we justified in saying that \"The 1998 Idaho Vandals football team played games in 10 states.\"? Yes, no, or maybe?", "doc_id": 284, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20270, 19198, 19666, 12484], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aubrey Morgan O'Day (born February 11, 1984) is an American singer-songwriter, member of the duo Dumblonde, reality television personality, and a former member of the girl group Danity Kane. Following discord among Danity Kane and with her mentor at the time, P. Diddy, O'Day was fired from the group in 2008, but reunited with them in 2013 before a second disbandment. Are we justified in saying that \"Aubrey Morgan O'Day is an Aquarius.\"? Yes, no, or maybe? Yes\n###\nRoc-A-Fella Records Presents Teairra Mar\u00ed is the debut album by recording artist Teairra Mar\u00ed. It was released on August 2, 2005, by Roc-A-Fella Records. The album debuted in the top five selling 69,000 copies in the first week, eventually selling 248,000 units. Are we justified in saying that \"Rock-a-Fella records gross revenue increased dramatically after the release of Teairri Mari's successful first album.\"? Yes, no, or maybe? Maybe\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"Greatest Hits Volume 1 was not released in 1966\"? Yes, no, or maybe? Yes\n###\nPeter John \"P. J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\". Are we justified in saying that \"Carlesimo was a businessman in his spare time\"? Yes, no, or maybe? Maybe\n###\nLurianic Kabbalah is a school of kabbalah named after the Jewish rabbi who developed it: Isaac Luria (1534\u20131572; also known as the \"ARI'zal\", \"Ha'ARI\" or \"Ha'ARI Hakadosh\"). Lurianic Kabbalah gave a seminal new account of Kabbalistic thought that its followers synthesised with, and read into, the earlier Kabbalah of the Zohar that had disseminated in Medieval circles. Are we justified in saying that \"Isaac Luria was a scholar.\"? Yes, no, or maybe?", "doc_id": 125, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37182, 8115, 26533, 23452], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "History of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. 
Are we justified in saying that \"Orson Pratt published the book in the 1850s.\"? Yes, no, or maybe? Yes\n###\nThe 1970 Swedish Open was a combined men's and women's tennis tournament played on outdoor clay courts held in B\u00e5stad, Sweden and was part of the Grand Prix circuit of the 1970 Tour. It was the 23rd edition of the tournament and was held from 2 July through 12 July 1970. Dick Crealy and Peaches Bartkowicz won the singles titles. Are we justified in saying that \"Dick Crealy won the woman's single title.\"? Yes, no, or maybe? No\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Nick Harper is a songwriter.\"? Yes, no, or maybe? Yes\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system. Are we justified in saying that \"ECSU also has a lot of frats\"? Yes, no, or maybe? Maybe\n###\nThe ostrich or common ostrich (\"Struthio camelus\") is either one or two species of large flightless birds native to Africa, the only living member(s) of the genus \"Struthio\", which is in the ratite family. In 2014, the Somali ostrich (\"Struthio molybdophanes\") was recognized as a distinct species. Are we justified in saying that \"The ostrich is a large bird that hates flying.\"? Yes, no, or maybe?", "doc_id": 504, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29406, 24327, 16041, 6976], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Intervilles was a French comedy game show first broadcast in 1962. The show was aired since July 17, 1962 on RTF, then on ORTF. After stopping for 20 years, it reappeared on July 10, 1985 on FR3, then from July 4, 1986 to September 6, 1999 on TF1. France 3 aired the show since July 5, 2004, then France 3 from June 23, 2006 to August 26, 2009. Are we justified in saying that \"intervilles was a french drama show on games\"? Yes, no, or maybe? No\n###\nLimnocharis flava (commonly known as yellow velvetleaf, sawah flower rush, sawah lettuce) is a species of aquatic flowering plant which is native to Mexico, Central America, South America, Cuba, Haiti and the Dominican Republic but widely naturalized in southern and southeastern Asia: India, Sri Lanka, Cambodia, Burma, Thailand, Vietnam, Indonesia, Malaysia and southern China (Guangdong, Yunnan). Are we justified in saying that \"Limnocharis flava is more popular in Asia then in Central and South America\"? Yes, no, or maybe? Maybe\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. 
Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"The cultural evolution of Pakistan is portrayed in the book End of the Past.\"? Yes, no, or maybe? Yes\n###\nDicksonia is a genus of tree ferns in the order Cyatheales. It is regarded as related to \"Cyathea\", but is considered more primitive, dating back at least to the Jurassic and Cretaceous periods. The fossil record includes stems, pinnules, and spores. Are we justified in saying that \"The Dicksonia genus includes a dozen different species within it\"? Yes, no, or maybe? Maybe\n###\nNannina de' Medici (14 February 1448 \u2013 14 May 1493), born Lucrezia de' Medici, was the second daughter of Piero di Cosimo de' Medici and Lucrezia Tornabuoni. She was thus the elder sister of Lorenzo de' Medici. She married Bernardo Rucellai. Her father's name was Piero, so she is sometimes known as Lucrezia di Piero de' Medici. Are we justified in saying that \"Nannina de' Medici did not have any brothers\"? Yes, no, or maybe?", "doc_id": 657, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25740, 3115, 15029, 35758], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\". Are we justified in saying that \"Dostluk Spor Kul\u00fcb\u00fc plans to celebrate its 50th anniversary in 2023 by adding a third color.\"? Yes, no, or maybe? Maybe\n###\nSherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983. Are we justified in saying that \"Stewart has been ranked in both singles and doubles in the ATP rankings. \"? Yes, no, or maybe? Yes\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress. Are we justified in saying that \"Tsewang Rigzin was re-elected to serve through August 2013.\"? Yes, no, or maybe? Yes\n###\n\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. 
The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough. Are we justified in saying that \"Merry Christmas, Charlie Manson! aired over 2 years ago\"? Yes, no, or maybe? Yes\n###\nThe 2012 SEC Women\u2019s Basketball Tournament took place at the Bridgestone Arena in Nashville, Tennessee from March 1-4, 2012. The Tennessee Lady Volunteers won the tournament and received the SEC\u2019s automatic bid to the 2012 NCAA Women\u2019s Basketball Tournament by defeating the LSU Lady Tigers 70-58 in the championship game. Are we justified in saying that \"The Lady Volunteers won the NCAA tournament only thirteen years after the penultimate year of the twentieth century.\"? Yes, no, or maybe?", "doc_id": 706, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23697, 2615, 20669, 33915], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"The final of the 1983 Prudential Cup was the most exciting game of the century.\"? Yes, no, or maybe? Maybe\n###\nJefferies LLC is an American global investment bank and institutional securities firm headquartered in New York. The firm provides clients with capital markets and financial advisory services, institutional brokerage, securities research, and asset management. This includes mergers and acquisitions, restructuring, and other financial advisory services. Are we justified in saying that \"Institutional Brokerage is a hard field to get into when developing a company. \"? Yes, no, or maybe? Maybe\n###\n\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel. Are we justified in saying that \"Bryan Ferry is a back up singer.\"? Yes, no, or maybe? No\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\". Are we justified in saying that \"Paint It Black was released by The Rolling Stones in 1966\"? Yes, no, or maybe? Yes\n###\n\"You & Me\" is a 1985 single by The Flirts, a vocal trio based in New York City. 
The single, taken from their album \"Blondes, Brunettes, and Redheads,\" was produced by Bobby Orlando, the creator/founder of the group, and shared co-writing credits with Clifton \"Jiggs\" Chase. The featured vocalists on this single were Tricia Wygal, Debra \"Debbie\" Gaynor, and Christina Criscione. Are we justified in saying that \"The Flirts's members live in New York. \"? Yes, no, or maybe?", "doc_id": 479, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8532, 24214, 5654, 43723], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh. Are we justified in saying that \"Although Stanley Anthony Woods has played for more than one professional football team, he played for the Washington Redskins the longest.\"? Yes, no, or maybe? Maybe\n###\nThe 2012 Toronto FC season was the sixth season in Toronto FC's existence. The club missed the playoffs for the sixth straight season, having never made a post-season appearance. Their season began on March 7 with the quarterfinal of the Champions League against the Los Angeles Galaxy. Are we justified in saying that \"The Toronto FC came close to getting into the playoffs but knocked out a few games to the qualifier.\"? Yes, no, or maybe? Maybe\n###\n\"Klaatu barada nikto\" is a phrase that originated in the 1951 science fiction film \"The Day the Earth Stood Still\". The humanoid alien protagonist of the film, Klaatu (Michael Rennie), instructs Helen Benson (Patricia Neal) that if any harm befalls him, she must say the phrase to the robot Gort (Lockard Martin). In response Gort relents from destroying the Earth and resurrects Klaatu from death. Are we justified in saying that \"Most of the actors from \"The Day the Earth Stood Still\" are still alive today.\"? Yes, no, or maybe? Maybe\n###\nA madrigal is a secular vocal music composition of the Renaissance and early Baroque eras. Traditionally, polyphonic madrigals are unaccompanied; the number of voices varies from two to eight, and most frequently from three to six. It is quite distinct from the Italian Trecento madrigal of the late 13th and 14th centuries, with which it shares only the name. Are we justified in saying that \"The number of voices can be 5\"? Yes, no, or maybe? Yes\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award. Are we justified in saying that \"The Panama Canal was completed before David McCullough was born.\"? 
Yes, no, or maybe?", "doc_id": 400, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2387, 4590, 9486, 11388], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Big Bad Voodoo Daddy is a contemporary swing revival band from Southern California. Their notable singles include \"Go Daddy-O\", \"You & Me & the Bottle Makes 3 Tonight (Baby)\", and \"Mr. Pinstripe Suit\". The band played at the Super Bowl XXXIII half-time show in 1999. Are we justified in saying that \"Big Voodoo Daddy played at superbowl\"? Yes, no, or maybe? Yes\n###\nThe Australia national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the South Africa national cricket team. The tournament was won by England. Australia were captained by Syd Gregory. Are we justified in saying that \"The Australia national cricket team won the 1912 Triangular Tournament\"? Yes, no, or maybe? No\n###\nMark Donovan (born 12 October 1968) is a Welsh character actor best known for his roles in productions such as \"Shaun of the Dead\", \"Black Books\", \"In Bruges\", and \"Murder Investigation Team\". He also played a brief scene of Hamlet in an episode of the David Renwick comedy-drama, \"Love Soup\". His stage roles include Gozark in \"Singin' in the Rain\" and Inspector Clay in \"Plan 9 from Outer Space\". Are we justified in saying that \"Mark Donovan is German character that starred in \"Shaun of the Dead\".\"? Yes, no, or maybe? No\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\". Are we justified in saying that \"Antonio Lewis is from North America.\"? Yes, no, or maybe? Yes\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr. Are we justified in saying that \"Things Happen at Night was released more than 1000 days ago.\"? Yes, no, or maybe?", "doc_id": 239, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8186, 2632, 44584, 18589], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2017 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. 
It was the third edition of the tournament and part of the 2017 ATP Challenger Tour. It took place in Happy Valley, Australia from 2 to 8 January 2017. Are we justified in saying that \"Happy Valley, Australia provides hard courts for tennis tournaments, which it regularly hosts. \"? Yes, no, or maybe? Maybe\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006. Are we justified in saying that \"\"Live Free or Die\" was the final episode of the six season of \"The Sopranos\".\"? Yes, no, or maybe? Maybe\n###\nRafael Cede\u00f1o Hern\u00e1ndez is an imprisoned Mexican drug trafficker who was a high-level leader of La Familia Michoacana, a drug cartel based in the Mexican state of Michoac\u00e1n. He was the successor of Alberto Espinoza Barr\u00f3n, a drug trafficker who was arrested on 31 December 2008 by the Mexican authorities. Are we justified in saying that \"Rafael Cede\u00f1o Hern\u00e1ndez was arrested by Mexican authorities\"? Yes, no, or maybe? Maybe\n###\nGary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978. Are we justified in saying that \"Gary Sutherland no longer plays professional baseball\"? Yes, no, or maybe? Yes\n###\n2 Cool 2 Be 4gotten is a 2016 Filipino coming-of-age drama film directed by Petersen Vargas in his feature-length directorial debut and written by Jason Paul Laxamana. The film stars Khalil Ramos, Ethan Salvador and Jameson Blake. It depicts the mysterious coming-of-age tale of Felix after he met half-American Snyder brothers, Magnus and Maxim. Are we justified in saying that \"the most well known actor in the film is natalie portman\"? Yes, no, or maybe?", "doc_id": 488, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37274, 10615, 32915, 19172], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Far from the Madding Crowd is a 2015 British romantic drama film directed by Thomas Vinterberg and starring Carey Mulligan, Matthias Schoenaerts, Michael Sheen, Tom Sturridge and Juno Temple. It is an adaptation of the 1874 novel of the same name by Thomas Hardy, the fourth time this novel has been filmed. Are we justified in saying that \"Far from the Madding Crowd is a thriller\"? Yes, no, or maybe? No\n###\nThe Battle of Rio de Janeiro was a battle in 1558 on the French town at Rio de Janeiro, called Henriville. The Portuguese, though in far smaller numbers, defeated the French and made them flee to the jungle. The French town was then burnt by Mem de S\u00e1, the Portuguese governor. Are we justified in saying that \"Hennville is where a fight took place between the French and Portuguese\"? Yes, no, or maybe? 
Yes\n###\nFabio Ochoa V\u00e1squez (born May 2, 1957) is a former leading member of the Medell\u00edn cocaine trafficking cartel, along with his older brothers Juan David and Jorge Luis. His role briefly made him a billionaire. After serving a brief prison term in Colombia, he was arrested and extradited to the US in 1999 and is serving a 30 year term in US federal prison. Are we justified in saying that \"Fabio is no longer a billionaire.\"? Yes, no, or maybe? Yes\n###\nHipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011. Are we justified in saying that \"Most consumer-oriented companies headquartered in San Francisco, California are co-founded by Goldstein and Huffman in 2010.\"? Yes, no, or maybe? Maybe\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain. Are we justified in saying that \"The cable car takes you up and down the mountain.\"? Yes, no, or maybe?", "doc_id": 193, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1633, 20658, 10234, 26104], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Paul Annacone and Christo van Rensburg were the defending champions. Annacone participated with John Fitzgerald, and lost in the quarterfinals to Scott Davis and David Pate, while Van Rensburg played with Kevin Curren, and lost in the semifinals to Grant Connell and Glenn Michibata.
Rick Leach and Jim Pugh defeated Connell and Michibata 3\u20136, 6\u20134, 6\u20132, in the final. Are we justified in saying that \"Leach was defeated by the current champions Annacone and Rensburg.\"? Yes, no, or maybe? No\n###\nJosef Matthias Hauer (March 19, 1883 \u2013 September 22, 1959) was an Austrian composer and music theorist. He is most famous for developing, independent of and a year or two before Arnold Schoenberg, a method for composing with all 12 notes of the chromatic scale. Hauer was also an important early theorist of twelve-tone music and composition. Are we justified in saying that \"Josef Matthias Hauer died in September 22, 1959 in Austria\"? Yes, no, or maybe? Maybe\n###\nThe second series of the British television sketch comedy premiered on BBC Two on 21 July 2005. This series included six episodes with the concluding episode broadcast on 25 August 2005. A Christmas Special followed the second series and was screened on BBC Two on 20 December 2005. Are we justified in saying that \"On December 20, 2005, a Christmas Special followed the second series.\"? Yes, no, or maybe? Yes\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars. Are we justified in saying that \"X X X X is brewed in Japan.\"? Yes, no, or maybe? No\n###\nLike the Roman: The Life of Enoch Powell is a 1998 book by the English writer Simon Heffer. It is a biography of the politician Enoch Powell. The title is taken from Powell's 1968 Rivers of Blood speech when Powell quoted Virgil's \"Aeneid\": \"As I look ahead, I am filled with foreboding; like the Roman, I seem to see the River Tiber foaming with much blood\". Are we justified in saying that \"Like the Roman: The Life of Enoch Powell is based on real events.\"? Yes, no, or maybe?", "doc_id": 593, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26015, 16230, 24609, 20659], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sanation (Polish: \"Sanacja\" , ] ) was a Polish political movement that was created in the interwar period, prior to J\u00f3zef Pi\u0142sudski's May 1926 \"Coup d'\u00c9tat\", and came to power in the wake of that coup. In 1928 its political activists would go on to form the Nonpartisan Bloc for Cooperation with the Government (\"\"BBWR\"\"). Are we justified in saying that \"When Sanation was created there was someone other than J\u00f3zef Pi\u0142sudski in power\"? Yes, no, or maybe? Yes\n###\nRoss Dawson (born 1962) is an Australian author, futurist, entrepreneur and former stockbroker. Best known for his 2002 book 'Living Networks', Dawson founded the futures think tank Future Exploration Network and consults on digital futures to various big organisations such as Ernst & Young, Macquarie Bank, Microsoft and News Corp. Are we justified in saying that \"Ross Dawson bought stocks.\"? Yes, no, or maybe? 
Yes\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif. Are we justified in saying that \"Ihsan Abdel Quddous is also of Lebanese origin.\"? Yes, no, or maybe? Maybe\n###\n\"Brown Eyed Handsome Man\" is a rock and roll song written and recorded by Chuck Berry, originally released by Chess Records in September 1956 as the B-side of \"Too Much Monkey Business.\" It was also included on Berry's 1957 debut album, \"After School Session\". The song title was also used as the title of a biography of Berry. Are we justified in saying that \"Obama was not the POTUS when Chuck Berry's debut album was released.\"? Yes, no, or maybe? Yes\n###\nPhacelia coerulea is a species of phacelia known by the common name skyblue phacelia. It is native to the California and the Southwestern United States and northern Mexico, where it grows in desert and plateau habitat types, such as scrub and woodland. Are we justified in saying that \"Phacelia cant grow during summer.\"? Yes, no, or maybe?", "doc_id": 69, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14218, 24716, 2671, 30494], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frederick Ferdinand of Anhalt-K\u00f6then (25 June 1769, Pless \u2013 23 August 1830, K\u00f6then) was a German prince, Ascanian ruler of the principality of Anhalt-Pless and, from 1818, of the duchy of Anhalt-K\u00f6then. He was the second son of Frederick Erdmann, Prince of Anhalt-Pless, and his wife, Louise Ferdinande, daughter of Henry Ernest, Count of Stolberg-Wernigerode. Are we justified in saying that \"Frederick Ferdinand of Anhalt-K\u00f6then was born the same day as his wife\"? Yes, no, or maybe? Maybe\n###\nKnightriders, also known as George A. Romero's Knightriders, is a 1981 American drama film written and directed by George A. Romero and starring Ed Harris, Gary Lahti, Tom Savini, Amy Ingersoll, Patricia Tallman and Ken Foree. It was filmed entirely on location in the Pittsburgh metro area, with major scenes in suburban Fawn Township and Natrona. Are we justified in saying that \"Knightriders only took 2 weeks to film\"? Yes, no, or maybe? Maybe\n###\nIdris Sultan (born January 1993) is a Tanzanian Actor and comedian, actor and radio host who won the Big Brother Africa-Hotshots in 2014. He hosts the biggest comedy news show called SIO HABARI, he also hosts a radio show called MWB(Mji wa burudani) on ChoiceFm Tanzania. Are we justified in saying that \"The radio show,MWB, will soon no longer be on the air for people to enjoy. \"? Yes, no, or maybe? Maybe\n###\nThe Chattenden and Upnor Railway (later known as the Lodge Hill and Upnor Railway) was a narrow gauge railway serving the military barracks and depot at Upnor and associated munitions and training depots. 
It was built in 1873 as a gauge railway, converted to narrow gauge around 1885, and continued in use until 1961. Are we justified in saying that \"The Chattenden and Upnor railway is still in use today.\"? Yes, no, or maybe? No\n###\nVictor H. Halligan (November 22, 1892 \u2013 March 10, 1973) was an American football player. He played for the University of Nebraska from 1912 to 1914 and was the first All-American football player to be selected from the Nebraska Cornhuskers football team. Are we justified in saying that \"Halligan played football until he was thirty.\"? Yes, no, or maybe?", "doc_id": 113, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21772, 20393, 14511, 37817], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Semonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781. Are we justified in saying that \"The base is located in Lesotho\"? Yes, no, or maybe? Yes\n###\nJohn Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old. Are we justified in saying that \"John Cameron Urschel played for penn state\"? Yes, no, or maybe? Yes\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \" Gabriele Muccino also had a small character role in The Pursuit of Happyness.\"? Yes, no, or maybe? Maybe\n###\nDiaspora studies is an academic field established in the late 20th century to study dispersed ethnic populations, which are often termed diaspora peoples. The usage of the term diaspora carries the connotation of forced resettlement, due to expulsion, coercion, slavery, racism, or war, especially nationalist conflicts. Are we justified in saying that \"Diaspora studies is a fascinating topic of discussion.\"? Yes, no, or maybe? Maybe\n###\nDaniel James Shellabarger (known as Daniel Suelo, or simply Suelo, and The Man Who Quit Money, born 1961) is an American simple living adherent who stopped using money in the autumn of 2000. He was born in Arvada, Colorado, a suburb of Denver, and currently lives part-time in a cave near Moab, Utah when he is not wandering the country. Are we justified in saying that \"Daniel James Shellabarger was born in a different state than the one he currently lives in\"? 
Yes, no, or maybe?", "doc_id": 624, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40897, 11714, 28455, 5576], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Les Soir\u00e9es de Nazelles, FP 84, is a set of variations for piano written by the French composer Francis Poulenc. During the evenings, the composer used to sit at the piano and improvise \"portraits\" of his friends, all based on a given theme. The work was begun in 1930, and completed at Noizay on October 1, 1936. At the beginning of the score, it reads: Are we justified in saying that \"The work was begun in 1936\"? Yes, no, or maybe? No\n###\nTake Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\". Are we justified in saying that \"The album Take Two had more than three hits.\"? Yes, no, or maybe? Maybe\n###\nSonnette is an unincorporated community in west central Powder River County, Montana, United States. The community is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest. It lies along local roads northwest of the town of Broadus, the county seat of Powder River County. Its elevation is 3,766\u00a0feet (1,148\u00a0m). Are we justified in saying that \"There is a community who live along local roads northwest of the town of Broadus in along local roads northwest of the town of Broadus. This Montana community, is not governed by a local municipal corporation and it is situated near the headwaters of Pumpkin Creek, just west of the Custer National Forest.\"? Yes, no, or maybe? Yes\n###\nVictor Ebubedike (born February 1, 1966), also known as Victor X Ebubedike and Victor Muhammad, is an English former American football player who played as a running back for London Ravens, from 1983-1990, then onto the NFL Europe's London Monarchs from 1991\u20131992 and 1995-1998. Are we justified in saying that \"Europe had a popular NFL team known as the London Ravens.\"? Yes, no, or maybe? Maybe\n###\nStainer & Bell Limited is a British publisher of classical sheet music and books, based in London. Stainer, founded in 1907, publish the works of a number of significant twentieth-century composers, including Charles Villiers Stanford, Gustav Holst, Ralph Vaughan Williams, and Herbert Howells. They also publish a number of earlier composers, including Henry VIII, William Byrd, and Henry Purcell. Are we justified in saying that \"Stainer & Bell Limited was founded after the War of 1812.\"? Yes, no, or maybe?", "doc_id": 423, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30217, 27821, 28524, 23469], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ahmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"Ahmad Kemal Idri was not the Indonesian National Revolution, but was the Yogyakarta itself after continued resistance\"? Yes, no, or maybe? No\n###\nEl\u00ednr\u00f3s L\u00edndal is an entrepreneur in Fashion design. She established ELLA fashion label in 2008, one of the first Slow Fashion brands in the world. Elinr\u00f3s was the brands creative director and CEO. ELLA launched] it\u00b4s first fashion line in April 2011. Are we justified in saying that \"Ella first fashion line was launched in April 26, 2011\"? Yes, no, or maybe? Maybe\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"There were no acoustic songs on the album \"Smithereens.\"\"? Yes, no, or maybe? No\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"Ashcroft was built in 1861.\"? Yes, no, or maybe? No\n###\nAm\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics. Are we justified in saying that \"Amelie Simone Mauresmo is forty years old at the time of this statement. \"? Yes, no, or maybe?", "doc_id": 41, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18084, 12124, 28219, 4652], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nina Ellen \u00d8deg\u00e5rd (born 2 November 1979 in Stavanger) is a Norwegian actress. She made her stage debut at Rogaland Teater in 2002 in a play by Brian Friel. Among her films are \"Play\" from 2003 and \"Alt for Egil\" from 2004. Her role as \"Josie\" in O'Neill's play \"M\u00e5ne for livets stebarn\" in 2005 earned her the Hedda Award for best stage performance. 
Are we justified in saying that \"Nina Ellen \u00d8deg\u00e5rd made her debut in the same century that she was born.\"? Yes, no, or maybe? No\n###\nThe Mission Viejo Vigilantes were a minor league baseball team located in Mission Viejo, California. The team played in the independent Western Baseball League, and was not affiliated with any Major League Baseball team. Their home stadium was Mission Viejo Stadium near Saddleback College. Are we justified in saying that \"The Vigilantes' home town and home stadium share the same name \"? Yes, no, or maybe? Yes\n###\nTurnagain, also called Buru Island, is an island of the \"Western Islands\" region of the Torres Strait Islands archipelago, located in the northern section of Torres Strait, Queensland, Australia. Turnagain is located within the Torres Strait Island Region Local government area. Are we justified in saying that \"Turnagain is located within the Region Local government area. \"? Yes, no, or maybe? Yes\n###\nSophie Charlene Akland Monk (born 14 December 1979) is an English-born Australian singer, songwriter, actress, model and radio personality. Monk was a member of the girl group Bardot and released a solo album called \"Calendar Girl\" (2003). She has appeared in films such as \"Date Movie\" (2006), \"Click\" (2006), and \"Spring Breakdown\" (2009). Are we justified in saying that \"Sophie was born in 1989.\"? Yes, no, or maybe? No\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan. Are we justified in saying that \"Mike Bossy the Scoring Machine is a pinball machine, it is a one of a kind machine which is pretty rare, since they only made the prototype\"? Yes, no, or maybe?", "doc_id": 679, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12042, 11563, 17877, 2156], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "This is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists. Are we justified in saying that \"An editorial cartoonist makes shapes.\"? Yes, no, or maybe? Yes\n###\nAlec Holowka is a Canadian indie game developer, co-founder of an independent game companies Bit Blot, and Infinite Ammo, where he works as lead programmer, musician and game designer. He collaborated with Derek Yu to create the award-winning game \"Aquaria\" and freeware game \"I'm O.K - A Murder Simulator\". He collaborated with Scott Benson to create \"Night in the Woods\". Are we justified in saying that \"Derek Yu collaborated with him to write Night in the Woods\"? Yes, no, or maybe? 
No\n###\nHistorical period drama is a film genre in which stories are based on historical events and famous persons. Some historical dramas attempt to accurately portray a historical event or biography, to the degree that the available historical research will allow. Other historical dramas are fictionalised tales that are based on an actual person and their deeds. Are we justified in saying that \"Historical period dramas are hard to make accurate\"? Yes, no, or maybe? Maybe\n###\nA Qualified Person Responsible for Pharmacovigilance, or QPPV, is an individual named by a pharmaceutical company as the main person responsible for ensuring that the company (the product's Marketing Authorisation Holder or MAH) meets its legal obligations for the monitoring of the safety of a medicinal product on the market. Are we justified in saying that \"Medications go through a four step process with the QPPV regulation.\"? Yes, no, or maybe? Maybe\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums). Are we justified in saying that \"Lars Iversen is a European composer and producer.\"? Yes, no, or maybe?", "doc_id": 845, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44318, 25028, 43614, 41835], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center. Are we justified in saying that \"Tango is a dance inspired by classical music\"? Yes, no, or maybe? Maybe\n###\nArt History is a 2011 American drama film directed by Joe Swanberg, written by Swanberg, Josephine Decker, and Kent Osborne. It stars Decker, Swanberg, Osborne, Adam Wingard, and Kris Swanberg as filmmakers whose lives are complicated by a graphic sex scene in an arthouse film. Are we justified in saying that \"There are two actors with the last name Swanberg in Art History.\"? Yes, no, or maybe? Yes\n###\nBride of Chucky is a 1998 American supernatural comedy slasher film, the fourth installment of the \"Child's Play\" franchise and sequel to 1991's \"Child's Play 3\". The film is written by Don Mancini and directed by Ronny Yu, and stars Jennifer Tilly (who plays and voices the title character Tiffany) and Brad Dourif (who voices Chucky), as well as John Ritter, Katherine Heigl and Nick Stabile. Are we justified in saying that \"Bride of Chucky was a sequel in the Chucky series.\"? Yes, no, or maybe? Yes\n###\n\"That's the Beat of a Heart\" is a song recorded by American country music duo The Warren Brothers featuring Sara Evans. It was released in March 2000 as the first single from their album \"King of Nothing\". 
It was also included on the soundtrack to the 2000 film \"Where the Heart Is\". The song was written by Tena Clark and Tim Heintz. Are we justified in saying that \"The Warren Brothers are not siblings\"? Yes, no, or maybe? Maybe\n###\nA political decoy is a person employed to impersonate a politician, to draw attention away from the real person or to take risks on that person's behalf. This can also apply to military figures, or civilians impersonated for political or espionage purposes. Are we justified in saying that \"People who work in government affairs might hire a political decoy.\"? Yes, no, or maybe?", "doc_id": 721, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33022, 31347, 17293, 21760], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Call\" is a song written by Gene MacLellan and performed by Anne Murray. The song reached #5 on the Canadian country chart, #6 on the U.S. Adult Contemporary chart, and #13 on the Canadian Adult Contemporary chart in 1976. The song appeared on her 1975 album, \"Together\". The song was produced by Tom Catalano. Murray recorded a different version on her 1970 album, \"Honey, Wheat and Laughter\". Are we justified in saying that \"The Call features music\"? Yes, no, or maybe? Yes\n###\nThe Linkou Power Plant () is a coal-fired power plant in Linkou District, New Taipei, Taiwan. With the previous total installed capacity of 600 MW, the power plant used to be the smallest coal-fired power plant in Taiwan. The power plant is currently undergoing retrofitting to increase its installed generation capacity to 2.4 GW. Are we justified in saying that \"The Linkou Power Plant will be more efficient after the retrofitting.\"? Yes, no, or maybe? Maybe\n###\nCharles Rashad Jamaal Brown (born April 10, 1987) is a former American football offensive tackle in the National Football League (NFL) for the New Orleans Saints, New York Giants and Dallas Cowboys. He played college football at USC, where he won the Morris Trophy, recognizing the best offensive and defensive linemen on the West Coast in 2009. Are we justified in saying that \"Charles Rashad Jamaal Brown was born in the 80s.\"? Yes, no, or maybe? Yes\n###\nHector and the Search for Happiness is a 2014 German-British-Canadian comedy-drama film directed by Peter Chelsom and co-written with Tinker Lindsay and Maria von Heland, based on Fran\u00e7ois Lelord's novel of the same name. The film stars Simon Pegg and Rosamund Pike. Are we justified in saying that \"Hector and the Search for Happiness has no story.\"? Yes, no, or maybe? No\n###\nYou Can Be Anyone This Time Around is an album by Timothy Leary released in 1970. The disc features three \"raps\" by Leary backed with psychedelic music. The purpose of the album was to raise funds for Leary's political candidacy for Governor of California. Are we justified in saying that \"Leary just released a new album.\"? 
Yes, no, or maybe?", "doc_id": 717, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3437, 21360, 19040, 20896], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Many science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list. Are we justified in saying that \"Many science fiction works have been set in the 21st century (years 2100 to 2100)\"? Yes, no, or maybe? No\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017. Are we justified in saying that \"Justin Tinucci was injured while filming the Devi's Whisper\"? Yes, no, or maybe? Maybe\n###\nPrincess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld (Coburg, 23 September 1781 \u2013 Elfenau, near Bern, Switzerland, 15 August 1860), also known as Grand Duchess Anna Feodorovna of Russia (Russian: \u0410\u043d\u043d\u0430 \u0424\u0451\u0434\u043e\u0440\u043e\u0432\u043d\u0430 ), was a German princess of the ducal house of Saxe-Coburg-Saalfeld (after 1826, the house of Saxe-Coburg-Gotha) who became the wife of Grand Duke Konstantin Pavlovich of Russia. Are we justified in saying that \"Princess Juliane Henriette Ulrike of Saxe-Coburg-Saalfeld only loved her husband Grand Duke Konstantin Pavlovich of Russia.\"? Yes, no, or maybe? Maybe\n###\nThe Global Food Security Act of 2016 (Pub.L. 114\u2013195 ), is a law introduced on March 24, 2015 in the 114th Congress by Representative Christopher Henry \"Chris\" Smith (New Jersey-R) and on May 7, 2015 by Senator Robert Patrick \"Bob\" Casey Jr. (Pennsylvania-D), and signed by President Barack Obama on July 20, 2016. Are we justified in saying that \"The Global Food Security Act of 2016 was later dismissed by President Trump.\"? Yes, no, or maybe? Maybe\n###\nThe Volkswagen Citi Golf was a car produced by Volkswagen in South Africa from 1984 until 21 August 2009. It was a face-lifted version of the original Volkswagen Golf Mk1 hatchback, which ceased production in Germany in 1983. The car was produced only with right-hand drive. Are we justified in saying that \"The Volkswagen Citi Golf was designed to drive on the left side of the road\"? Yes, no, or maybe?", "doc_id": 30, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18027, 15918, 37469, 39423], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gun Bow (1960 \u2013 December 1979) was an American Thoroughbred racehorse. He was one of America's leading older male racehorses in 1964 and 1965 and was later inducted into the Hall of Fame. Gun Bow was noted for his rivalry with five-time American Horse of the Year Kelso. Are we justified in saying that \"Gun Bow was noted for his rivalry with six-time American Horse of the Year Kelso.\"? Yes, no, or maybe? No\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Chris McKendry speaks multiple languages\"? Yes, no, or maybe? Maybe\n###\nMakri (Greek: \u039c\u03ac\u03ba\u03c1\u03b7) is a village and a municipal district of the city of Alexandroupoli, Evros regional unit, Greece. In 2011 its population was 924 for the village, and 1,919 for the municipal district. It is situated on the Aegean Sea coast, 12\u00a0km west of downtown Alexandroupoli. Makri has an exit on the Egnatia Odos motorway, that passes north of the village. Are we justified in saying that \"It is west of downtown Alexandroupoli.\"? Yes, no, or maybe? Yes\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"Sheree Victoria Murphy lives in an apartment. \"? Yes, no, or maybe? Maybe\n###\nThe following is a list of ongoing civil unrest or ongoing protests that are taking place around the world. This list is for the sole purpose of identifying present-day civil unrest and protests and the death toll and number of protesters associated with each event. Are we justified in saying that \"This is a bar graph.\"? Yes, no, or maybe?", "doc_id": 630, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28957, 8279, 30235, 36053], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The NME Awards 2017 were held in London, England, on 15 February 2017, at the Brixton Academy and was hosted by English comedian Huw Stephens. Beyonc\u00e9 led the nominations with five, followed by The 1975, Bastille, Christine And The Queens and Skepta with four nominations each. Are we justified in saying that \"The NME was held in London at night. \"? Yes, no, or maybe? 
Maybe\n###\nThe Communaut\u00e9 de communes des Trois Rivi\u00e8res (before January 2017: \"Communaut\u00e9 de communes du Pays des Trois Rivi\u00e8res\") is a federation of municipalities (\"communaut\u00e9 de communes\") in the Aisne \"d\u00e9partement\" and in the Hauts-de-France \"region\" of France. Are we justified in saying that \"Aisne is the smallest d\u00e9partement in France\"? Yes, no, or maybe? Maybe\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"There are currently no plans to shorten the The San Nicolao Tunnel\"? Yes, no, or maybe? Maybe\n###\nTory Woodbury (born July 12, 1978) is a former American football quarterback/wide receiver. He was originally signed as an undrafted free agent out of Winston-Salem State University by the New York Jets. He is currently the quarterback coach at Garinger High School in Charlotte, NC. Are we justified in saying that \"Tory Woodburn will die in Charlotte, NC.\"? Yes, no, or maybe? Maybe\n###\nPixote: a Lei do Mais Fraco (] , lit. \"Pixote (small child): The Law of the Weak\") is a 1980 Brazilian drama film directed by H\u00e9ctor Babenco. The screenplay was written by Babenco and Jorge Dur\u00e1n, based on the book \"A Inf\u00e2ncia dos Mortos\" (\"The Childhood of the Dead Ones\") by Jos\u00e9 Louzeiro. Are we justified in saying that \"Jos\u00e9 Louzeiro was inspired to write \"A Inf\u00e2ncia dos Mortos\" after watching \"Pixote: a Lei do Mais Fraco\".\"? Yes, no, or maybe?", "doc_id": 537, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36182, 43027, 17633, 29931], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Master of Revenge () is a 2016 South Korean television series starring Chun Jung-myung, Jo Jae-hyun, Jeong Yoo-mi, Lee Sang-yeob and Gong Seung-yeon. It aired on KBS2 from April 27, 2016 to June 30, 2016 on Wednesdays and Thursdays at 21:55 for 20 episodes. Are we justified in saying that \"The Master of Revenge airs on KBS2\"? Yes, no, or maybe? Yes\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label. Are we justified in saying that \"Bad Company was recorded at Headley Grange with Ronnie Lane's Mobile Studio in month after Halloween in the year that equals 2073 minus 100.\"? Yes, no, or maybe? Yes\n###\nLive at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. 
It has only been made available at live Van Morrison concerts and at the Van Morrison Official website. Are we justified in saying that \"The Austin City Limits Festival concert took place 6 years before 2012.\"? Yes, no, or maybe? Yes\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\". Are we justified in saying that \"Selvaraj died in January\"? Yes, no, or maybe? Yes\n###\nDennis Princewell Stehr (born 15 May 1984), better known by his stage name Mr Probz, is a Dutch singer, musician and actor. In 2013, he released the song \"Waves\", which was remixed in 2014 by Robin Schulz, becoming an international hit. He has released one album and featured in the film Bolletjes Blues. Are we justified in saying that \"By the time Mr Probz was 30 years old, he had an international hit.\"? Yes, no, or maybe?", "doc_id": 602, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35555, 4593, 28382, 26502], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Castaways Hotel and Casino, formerly the Showboat Hotel and Casino was a hotel and casino located at the north end of the Boulder Strip in Las Vegas, Nevada. The hotel consisted of a 19 story tower containing 445 rooms, a casino and an adjacent RV park. The Castaways hotel was demolished on January 11, 2006 to make way for an unknown project. Are we justified in saying that \"The project that replaces the The Castaways Hotel and Casino will be better\"? Yes, no, or maybe? Maybe\n###\nThe Fondation Prince Pierre was established by Prince Rainier III of Monaco in February 1966 to promote culture and the arts through the creation and the awarding of prizes. Prince Rainier III created the foundation in tribute to his father, Pierre de Polignac a great patron of the arts. Are we justified in saying that \"The Fondation Prince Pierre promotes culture and the arts.\"? Yes, no, or maybe? Yes\n###\nBrenda Fricker (born 17 February 1945) is an Irish actress of theatre, film and television. She has appeared in more than 30 films and television roles. In 1989, she became the first Irish actress to win an Oscar, earning the Academy Award for Best Supporting Actress for \"My Left Foot\". As of July 2014, she has tentatively retired from acting. Are we justified in saying that \"Fricker was the first female Irish actress to win an Oscar.\"? Yes, no, or maybe? Yes\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single. 
Are we justified in saying that \"\"Come Back in One Piece\" was a major urban contemporary hit.\"? Yes, no, or maybe? No\n###\nJohan Niemann (born 26 June 1977) is best known for being the current bass player for Evergrey and co-founding the band Mind's Eye, for membership of Swedish heavy metal band Therion and as a member of the Scandinavian theatrical metal band Evil Masquerade. He is also currently live guitarist for Tiamat. He is a brother of Kristian Niemann. Are we justified in saying that \"Johan Niemann's brother is a musician \"? Yes, no, or maybe?", "doc_id": 107, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19496, 37602, 15998, 22158], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"That's the Beat of a Heart\" is a song recorded by American country music duo The Warren Brothers featuring Sara Evans. It was released in March 2000 as the first single from their album \"King of Nothing\". It was also included on the soundtrack to the 2000 film \"Where the Heart Is\". The song was written by Tena Clark and Tim Heintz. Are we justified in saying that \"That's the Beat of a Heart has been in a movie.\"? Yes, no, or maybe? Yes\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions. Are we justified in saying that \"Project Gasbuggy was created by Obama.\"? Yes, no, or maybe? No\n###\nA Merry Friggin' Christmas is a 2014 American black comedy film directed by Tristram Shapeero and written by Phil Johnston. The film stars an ensemble cast featuring Joel McHale, Lauren Graham, Clark Duke, Oliver Platt, Wendi McLendon-Covey, Tim Heidecker, Candice Bergen and Robin Williams. The film was released on November 7, 2014, by Phase 4 Films. Are we justified in saying that \"A Merry Friggin' Christmas cast included Joel Mchale.\"? Yes, no, or maybe? Yes\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race. Are we justified in saying that \"He used to be a woman\"? Yes, no, or maybe? Maybe\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\". Are we justified in saying that \"They are a popuylar writing duo.\"? 
Yes, no, or maybe?", "doc_id": 104, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7566, 43144, 2143, 19486], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frederick William Chesson (1833\u20131888) was an English journalist and prominent anti-slavery campaigner. He was active in the London Aborigines' Protection Society and Emancipation Committee, and met Harriet Ann Jacobs when she was in England in 1858; and was a vocal supporter of the Union side during the American Civil War. Are we justified in saying that \"Frederick William Chesson never met Harriet Ann Jacobs\"? Yes, no, or maybe? No\n###\nWings over America is a triple live album by Wings, released in December 1976. The album was recorded during American leg of the band's acclaimed 1975\u201376 Wings Over the World tour. It peaked at number 8 on the UK Albums Chart and reached number 1 on the US \"Billboard\" Top LPs & Tape chart. Are we justified in saying that \"Wings over America was released over 3000 days ago.\"? Yes, no, or maybe? Yes\n###\nKate Saunders (born 4 May 1960 in London) is an English writer, actress and journalist. The daughter of the early public relations advocate Basil Saunders and his journalist wife Betty (n\u00e9e Smith), Saunders has worked for newspapers and magazines in the UK, including \"The Sunday Times\", \"Sunday Express\", \"Daily Telegraph\", \"She\" and \"Cosmopolitan\". Are we justified in saying that \"Kate Saunders travels in very affluent circles in the UK. \"? Yes, no, or maybe? Maybe\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp. Are we justified in saying that \"Sidney is a fun place to swim.\"? Yes, no, or maybe? Maybe\n###\nAjay Nagrath is an Indian television and movie actor and is the son of Bollywood actor Anil Nagrath. Currently, he plays the role of \"Pankaj\" in C.I.D. He has done many roles in many TV shows and even films, but there came a point in his life when he was unhappy that his weight had become his identity in the industry. He said \"I used to be a couch potato.\" Are we justified in saying that \"Ajay Nagrath was always unhappy with his weight.\"? Yes, no, or maybe?", "doc_id": 363, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1063, 32778, 24884, 24414], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"Harry Bosch is a fictional character.\"? Yes, no, or maybe? Yes\n###\nThe Asteroids Galaxy Tour is a Danish pop band consisting of vocalist Mette Lindberg and songwriter/producer Lars Iversen, formed in 2007. When performing live the band extends to a five-piece, featuring Mikkel Balster Dorig (guitar), Simon Littauer (MPC and keys) and Rasmus Littauer (drums). Are we justified in saying that \"Although The Asteroids Galaxy Tour had Mette Lindberg as a vocalist, it was unsure if they needed another\"? Yes, no, or maybe? Maybe\n###\nJeon Do-yeon (born February 11, 1973) is a South Korean actress. She has won many awards in her career, including best actress at the 2007 Cannes Film Festival for her portrayal of a broken woman who has lost everything in Lee Chang-dong's \"Secret Sunshine\". Are we justified in saying that \"Jeon Do-yeon as internationally popular\"? Yes, no, or maybe? Maybe\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph Ernst Friedrich von Forcade de Biaix retired at age 65\"? Yes, no, or maybe? Maybe\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"Idris was 80 when he retired.\"? Yes, no, or maybe?", "doc_id": 8, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21328, 8760, 13392, 40483], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tamanna (Hindi: \u0924\u092e\u0928\u094d\u0928\u093e , translation: Desire) is a 1997 Indian drama film directed by Mahesh Bhatt. It stars Paresh Rawal, Pooja Bhatt, Sharad Kapoor and Manoj Bajpayee in the lead roles The screenplay was written by Tanuja Chandra. The story was written by Tanuja Chandra and Mahesh Bhatt. It was produced by Pooja Bhatt. Are we justified in saying that \"Tamanna is a 1997 Indian drama which was actually written in 1995 .\"? Yes, no, or maybe? 
Maybe\n###\nThe Nariphon (Thai: \u0e19\u0e32\u0e23\u0e35\u0e1c\u0e25 ), also known as Makkaliphon (Thai: \u0e21\u0e31\u0e01\u0e01\u0e30\u0e25\u0e35\u0e1c\u0e25 , from Pali \"makkaliphala\"), is a tree in Buddhist mythology which bears fruit in the shape of young female creatures. The maidens grow attached by their head from the tree branches. This tree grows at the Himaphan, a mythical forest where the female fruits are enjoyed by the Gandharvas who cut the fruits and take them away. Are we justified in saying that \"Buddhist mythology has a tree named Makkaliphon\"? Yes, no, or maybe? Yes\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award. Are we justified in saying that \"The Path Between the Seas takes place between 1870 through 1914\"? Yes, no, or maybe? Yes\n###\nThe 1973 Atlanta Braves season was the eighth season in Atlanta along with the 103rd season as a franchise overall. The highlight of the season was Hank Aaron finishing the season just one home run short of Babe Ruth as baseball's all-time home run king. The 1973 Atlanta Braves were the first team to boast three 40 home run hitters. They were Aaron, Darrell Evans, and Davey Johnson. Are we justified in saying that \"The Atlanta Braves are a third rate symphony.\"? Yes, no, or maybe? No\n###\nGray Cowan Boyce (19 February 1899 - 14 May 1981) was an American medieval historian and historical bibliographer whose masterwork was his five volume \"Literature of Medieval History, 1930-1975: A Supplement to Louis John Paetow's \"A Guide to the Study of Medieval History\"\" (1981). Are we justified in saying that \"Gray Boyce lived through both world wars.\"? Yes, no, or maybe?", "doc_id": 135, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25818, 25739, 4747, 33453], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The following is a list of female cabinet ministers of Thailand. Thailand is a country located at the centre of the Indochina peninsula in Southeast Asia. It is bordered to the north by Burma and Laos, to the east by Laos and Cambodia, to the south by the Gulf of Thailand and Malaysia, and to the west by the Andaman Sea and the southern extremity of Burma. Are we justified in saying that \"Malaysia boarders Thailand to the south\"? Yes, no, or maybe? Yes\n###\nAlbert Levitt (March 14, 1887 \u2013 June 18, 1968) was a judge, law professor, attorney, and candidate for political office. While he was a memorable teacher at Washington and Lee University, and as judge of the United States District Court for the Virgin Islands ordered that woman voters must be registered, he later came to hold what some thought were eccentric views on religion. 
Are we justified in saying that \"Albert Levitt enjoyed golf in his spare time, was an animal rights activist, had a vegetarian diet, and identified as Jewish despite having a Catholic background. \"? Yes, no, or maybe? Maybe\n###\nDaoud Abdel Sayed (Arabic: \u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e \u00a0 ] ) is an Egyptian director and screenwriter. He was born in Cairo in 1946. He started as the assistant of Youssef Chahine in The Land. He made several critically acclaimed films, and won several international awards notably for \"The Land of Fear\" which was produced in 1999. Are we justified in saying that \"\u062f\u0627\u0648\u062f \u0639\u0628\u062f \u0627\u0644\u0633\u064a\u062f\u200e \u200e was screenwriter during his first job.\"? Yes, no, or maybe? Maybe\n###\nPhua Chu Kang Pte Ltd, also known as PCK Pte Ltd or Phua Chu Kang for short (\u9b3c\u99ac\u5bb6\u65cf in Chinese), was a Singaporean sitcom on MediaCorp TV Channel 5. The show debuted in Singapore in 1997. A sequel, \"Phua Chu Kang Sdn Bhd\" debuted on NTV7 on 25 March 2009 and aired in Singapore's MediaCorp TV Channel 5 on 6 October 2009. Are we justified in saying that \"Phua Chu Kang Pte Ltd has multiple sequels including \"Phua Chu Kang Sdn Bhd\" which debuted on NTV7 on 25 March 2009\"? Yes, no, or maybe? Maybe\n###\n\"Big Jet Plane\" is a song written by Australian singer/songwriter Angus Stone, and originally recorded by Stone, under the name \"Lady of the Sunshine\", on his 2009 album \"Smoking Gun\". It was then re-recorded by Stone and his sister Julia, as the duo Angus & Julia Stone, and released on their 2010 album \"Down the Way\", as well as their 2010 EP \"Big Jet Plane\". Are we justified in saying that \"Big Jet Plane was not released for the first time in 2009.\"? Yes, no, or maybe?", "doc_id": 674, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35681, 35989, 30057, 29596], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ra\u00fal Alberto Osella (born 8 June 1984 in Morteros) is an Argentine association footballer who currently plays for FC Locarno in Swiss Challenge League. He played FIFA U-17 World Cup Final for Argentina national team 2001. He played for Boca Juniors and Tiro Federal in Argentina. Are we justified in saying that \"Ra\u00fal Alberto Osella is a heartthrob.\"? Yes, no, or maybe? Maybe\n###\nFC Spartak Vladikavkaz (Russian: \u0424\u0443\u0442\u0431\u043e\u043b\u044c\u043d\u044b\u0439 \u043a\u043b\u0443\u0431 \u00ab\u0421\u043f\u0430\u0440\u0442\u0430\u043a\u00bb \u0412\u043b\u0430\u0434\u0438\u043a\u0430\u0432\u043a\u0430\u0437 , Ossetian: \u0424\u0443\u0442\u0431\u043e\u043b\u043e\u043d \u043a\u043b\u0443\u0431 \"\u0410\u043b\u0430\u043d\u0438\" ) is a Russian football club based in Vladikavkaz (formerly Ordzhonikidze), North Ossetia\u2013Alania. Founded in 1921, the club played in the Soviet Top League during the communist era, and won its first and only league title in the 1995 Russian Top League. Are we justified in saying that \"They played hockey since 1921\"? 
Yes, no, or maybe? No\n###\nThe Chingford branch line is a railway line between Clapton Junction (just west of Clapton station) and Chingford station. Services currently operate between Liverpool Street station and Chingford. The branch is currently part of the Lea Valley Lines network. Are we justified in saying that \"Chingford will be in operation for many years to come.\"? Yes, no, or maybe? Maybe\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street. Are we justified in saying that \"The population was over 300 \"? Yes, no, or maybe? Yes\n###\nThe Maloof family is a prominent American family based in Las Vegas, Nevada, who are owners of numerous business properties in the Western United States. The origin of the family name is Maalouf and is of Lebanese descent via their paternal grandfather. Are we justified in saying that \"The number of businesses owned by the Maloof family is large.\"? Yes, no, or maybe?", "doc_id": 713, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14775, 42138, 38710, 34871], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Paul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation. Are we justified in saying that \"Paul Hausser had much sympathy for Jews\"? Yes, no, or maybe? Maybe\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012. Are we justified in saying that \"Four different people won the Melodi Grand Prix junior 2012.\"? Yes, no, or maybe? Maybe\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"The author wrote the book from the perspective of a young boy\"? Yes, no, or maybe? Yes\n###\nMuccan or Muccan Station is a pastoral lease and cattle station that once operated as a sheep station, located approximately 68 km north east of Marble Bar and 155 km south east of Port Hedland in the Pilbara region of Western Australia. Kookenyia Creek and the De Grey River run through the property. 
The country is gently undulating with large areas of spinifex. Are we justified in saying that \"Muccan Station makes pies\"? Yes, no, or maybe? No\n###\nJeon Do-yeon (born February 11, 1973) is a South Korean actress. She has won many awards in her career, including best actress at the 2007 Cannes Film Festival for her portrayal of a broken woman who has lost everything in Lee Chang-dong's \"Secret Sunshine\". Are we justified in saying that \"As of 2019, Jeon Do-yeon would be 50 years old.\"? Yes, no, or maybe?", "doc_id": 676, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5012, 29617, 23380, 3546], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "August Perk (October 25, 1897, Lohne / Lingen, Germany; \u2013 May 12, 1945, Braunschweig, Germany) was a German Resistance fighter against the National Socialism. His brief friendship with Erich Maria Remarque influenced Remarque's novel \"All Quiet on the Western Front\". Are we justified in saying that \"August Perk was a German resistance fighter who lived with Erich Maria Remarque.\"? Yes, no, or maybe? Maybe\n###\nWellingore is a village and civil parish in the North Kesteven district of Lincolnshire, England. The population of the civil parish at the 2011 census was 356. It is situated on the A607 road, approximately 12 mi south from Lincoln. It conjoins the village of Navenby to the north. The Viking Way traverses through the village, passing from the side of the cliff edge to Ermine Street. Are we justified in saying that \"The village starts with a W\"? Yes, no, or maybe? Yes\n###\nThe Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site. Are we justified in saying that \"The Cincinnati and Whitewater Canal Tunnel has bronze.\"? Yes, no, or maybe? Maybe\n###\nMarguerite Aimee Rosine Coppin (2 February 1867 \u2013 1931) was born in Brussels and became woman Poet Laureate of Belgium and a noted feminist and pioneer in female emancipation and equal rights for women. She was compared with women's rights activists Amelia Bloomer and Emmeline Pankhurst. Are we justified in saying that \"Marguerite Aimee Rosine Coppin lived in Europe.\"? Yes, no, or maybe? Yes\n###\nDelivery Man is a 2013 American comedy-drama film written and directed by Ken Scott, produced by DreamWorks Pictures and Reliance Entertainment. A remake of Scott's 2011 French-Canadian film, \"Starbuck\", the film stars Vince Vaughn, Chris Pratt and Cobie Smulders. Are we justified in saying that \"Delivery Man had to be written\"? 
Yes, no, or maybe?", "doc_id": 467, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18025, 35900, 22452, 18614], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Euphorbia pubentissima, known by the common names false flowering spurge and southeastern flowering spurge, is a species of plant in the spurge family. It is native to the Southeastern United States where it is found in areas of sandy, open woodlands. Are we justified in saying that \"Euphorbia pubentissima is a flower\"? Yes, no, or maybe? Yes\n###\nNathan MacKinnon (born September 1, 1995) is a Canadian professional ice hockey forward, an alternate captain of the Colorado Avalanche organization of the National Hockey League (NHL). MacKinnon was selected first overall by the Avalanche in the 2013 NHL Entry Draft. Are we justified in saying that \"At the time of the 2013 NHL Entry Draft, Nathan was at least 17 years old.\"? Yes, no, or maybe? Yes\n###\nThe Girl from Jones Beach is a 1949 American comedy film directed by Peter Godfrey and written by I. A. L. Diamond. The film stars Ronald Reagan, Virginia Mayo, Eddie Bracken, Dona Drake, Henry Travers and Lois Wilson. The film was released by Warner Bros. on July 16, 1949. Are we justified in saying that \"The Girl from Jones Beach stars a future president\"? Yes, no, or maybe? Yes\n###\nAlice Geraldine Farrar (February 28, 1882 \u2013 March 11, 1967) was an American soprano opera singer and film actress, noted for her beauty, acting ability, and \"the intimate timbre of her voice.\" She had a large following among young women, who were nicknamed \"Gerry-flappers\". Are we justified in saying that \"Alice Geraldine Farrar was an actress\"? Yes, no, or maybe? Yes\n###\nSouthern Methodist University (SMU) is a private research university in Dallas, University Park, and Highland Park, Texas. Founded in 1911 by the Methodist Episcopal Church, South, SMU operates satellite campuses in Plano, Texas, and Taos, New Mexico. SMU is owned by the South Central Jurisdiction of the United Methodist Church. Of the university's 11,643 students, 6,411 are undergraduates. Are we justified in saying that \"SMU has 11,644 students\"? Yes, no, or maybe?", "doc_id": 862, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38651, 19098, 12645, 4860], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. 
The University is a member-school of the Thurgood Marshall College Fund. Are we justified in saying that \"Since 1965 The University of Maryland Eastern Shore has been a historically black university.\"? Yes, no, or maybe? Maybe\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television. Are we justified in saying that \"Sebastian Bach has a solo career\"? Yes, no, or maybe? Yes\n###\nAntonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\". Are we justified in saying that \"Antonio Lewis was the highest paying member of Flatbush ZOMBIES\"? Yes, no, or maybe? Maybe\n###\nHawthorne is a census-designated place (CDP) in Mineral County, Nevada, United States. At the 2010 census, the population was 3,269, a decrease since the 2000 census, when it was 3,311. It is the county seat of Mineral County. The nearby Hawthorne Army Depot is the primary economic base of the town. Are we justified in saying that \"Hawthorne is located in new york\"? Yes, no, or maybe? No\n###\nThomas Cooper (9 April 1904 \u2013 25 June 1940) was an England international footballer who played for Port Vale, Derby County, and Liverpool. He won 15 caps, and played 430 league games in a 16-year career in the Football League. He helped Derby to finish second in the Second Division in 1925\u201326 and second in the First Division in 1929\u201330. Are we justified in saying that \"Thomas Cooper was born over 15,000 days ago.\"? Yes, no, or maybe?", "doc_id": 920, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8586, 17835, 36203, 15334], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire. Are we justified in saying that \"The 100 people are on the same team against the outcast player\"? Yes, no, or maybe? Maybe\n###\nLegoland Discovery Center Dallas Fort Worth is an indoor family entertainment center located at Grapevine Mills mall in Grapevine, Texas, which is situated between the cities of Dallas and Fort Worth, Texas. The attraction includes Lego-theme rides, a soft play area, a 4D cinema and a gift shop. The center is owned and operated by British leisure group Merlin Entertainments. Are we justified in saying that \"Legoland Discovery Center Dallas Fort Wort is in Texas\"? Yes, no, or maybe? 
Yes\n###\nDennis Gordon Patterson (born January 9, 1950) is a Canadian former professional ice hockey defenceman who played three seasons in the National Hockey League (NHL) for the Kansas City Scouts and Philadelphia Flyers and also played one season in the World Hockey Association (WHA) for the Edmonton Oilers. He is currently a scout with the Flyers. Are we justified in saying that \"Dennis Gordon Patterson was born on the first of the month\"? Yes, no, or maybe? Yes\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region. Are we justified in saying that \"The station employees 300 people\"? Yes, no, or maybe? Maybe\n###\nBlanche Barrow (born Bennie Iva Caldwell; January 1, 1911 \u2013 December 24, 1988) was a fringe member of Bonnie and Clyde's gang and the wife of Clyde Barrow's brother Buck. Brought up by her father, she had a poor relationship with her mother, who arranged for Blanche to be married to an older man. Blanche ran away and met Buck Barrow. He was 8 years older, and a fugitive. Are we justified in saying that \"blanche barrow died as a fugitive\"? Yes, no, or maybe?", "doc_id": 698, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29701, 22396, 33080, 30912], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Henry Pelham Fiennes Pelham-Clinton, 4th Duke of Newcastle-under-Lyne {'1': \", '2': \", '3': \", '4': \"} (31 January 1785 \u2013 12 January 1851) was a British nobleman and politician who played a leading part in British politics in the late 1820s and early 1830s. He was styled Lord Clinton from birth until 1794 and Earl of Lincoln between 1794 and 1795. Are we justified in saying that \"It was a British nobleman and politician.\"? Yes, no, or maybe? Yes\n###\nJonathan Michael Lovitz (born July 21, 1957) is an American comedian, actor and singer, best known as a cast member of \"Saturday Night Live\" from 1985 to 1990. He starred as Jay Sherman in \"The Critic\" and has appeared in numerous other television series and films. Are we justified in saying that \"jonathan michael lovitz acted in numeroues television series and films while he was in saturday night live\"? Yes, no, or maybe? Maybe\n###\nDennis Gordon Patterson (born January 9, 1950) is a Canadian former professional ice hockey defenceman who played three seasons in the National Hockey League (NHL) for the Kansas City Scouts and Philadelphia Flyers and also played one season in the World Hockey Association (WHA) for the Edmonton Oilers. He is currently a scout with the Flyers. Are we justified in saying that \"Dennis Patterson played more than 2 seasons in the NHL.\"? Yes, no, or maybe? Yes\n###\nBaar is a railway station in the Swiss canton of Zug, situated in the municipality of Baar. 
The station is located on the Z\u00fcrich to Lucerne railway line and is an intermediate stop for InterRegio trains from Z\u00fcrich to Lucerne and on Z\u00fcrich S-Bahn line S9. Are we justified in saying that \"Zug has one railway station.\"? Yes, no, or maybe? Maybe\n###\nThe Cit\u00e9 du Cin\u00e9ma is a film studio complex supported by the film director and producer Luc Besson, located in Saint-Denis, north of Paris. The studio complex is intended to be a competitor of Cinecitt\u00e0 in Rome, Pinewood in London and Babelsberg in Berlin. It was inaugurated on 21 September 2012. Are we justified in saying that \"Bebelsburg is older than Cite du Cinema.\"? Yes, no, or maybe?", "doc_id": 376, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33282, 3015, 39199, 10631], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gwendoline See-Hian Yeo (; born July 10, 1977) is a Singaporean-born American actress, voice actress and musician, best known for her recurring guest-star role as Xiao-Mei in the hit television series \"Desperate Housewives\", and as Dr. Kelly Lee in \"General Hospital\". Are we justified in saying that \"Dr. Kelly Lee is well liked.\"? Yes, no, or maybe? Maybe\n###\nKilimanjaro Native Cooperative Union (KNCU) is a cooperative federation in Tanzania and the oldest cooperative in Africa, founded in 1930 by Charles Dundas. KNCU is owned by the farmers of the 90 primary cooperative societies which buy coffee from the farmers on Kilimanjaro. Offices for the cooperative are located in Moshi. Are we justified in saying that \"Dundas lives in Moshi.\"? Yes, no, or maybe? Maybe\n###\nFraser Wishart (born Johnstone, Renfrewshire, 1 March 1965) is a Scottish former professional footballer, former Secretary of the Scottish Professional Footballers' Association, and current chief executive of the Professional Footballers' Association Scotland. He is also an occasional radio and television commentator. Are we justified in saying that \"Fraser Wishart quit professional football to become Secretary of the Scottish Professional Footballers' Association.\"? Yes, no, or maybe? Maybe\n###\nThe Girdler sulfide (GS) process, also known as the GeibSpevack (GS) process, is an industrial production method for filtering out of natural water the heavy water (deuterium oxide = DO) which is used in particle research, in Deuterium NMR spectroscopy, deuterated solvents for proton NMR spectroscopy, in heavy water nuclear reactors (as a coolant and moderator) and in deuterated drugs. Are we justified in saying that \"Heavy water nuclear reactors have no use for the Girdler Sulfide process.\"? Yes, no, or maybe? No\n###\nJohnson College Prep is a public four-year charter high school located in the Englewood neighborhood on the south side of Chicago, Illinois, United States. It is a part of the Noble Network of Charter Schools. The school is named for African-American businessman and founder of the Chicago-based Johnson Publishing Company John H. Johnson and his wife Eunice Johnson. 
Are we justified in saying that \"\"Prep\" is a common abbreviation for \"preparatory\".\"? Yes, no, or maybe?", "doc_id": 544, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16804, 44972, 33964, 22090], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Roland Buerk (born 1973), was a journalist working for the BBC. He was the Tokyo Correspondent for BBC News and is best known for his coverage of the 2011 T\u014dhoku earthquake and tsunami. He is the son of former BBC newsreader and current BBC Radio 4 presenter Michael Buerk. He left the BBC in mid-2012, to work for Nissan in the United Arab Emirates. Are we justified in saying that \"Roland Buerk has an R.\"? Yes, no, or maybe? Yes\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Weltenbrand was formed in nineteen hundred ninety six.\"? Yes, no, or maybe? No\n###\nJohns Creek is a city located in Fulton County in the U.S. state of Georgia. According to the 2010 U.S. Census, the population was 76,728. The city is an affluent northeastern suburb of Atlanta. In 2017 Johns Creek ranked third on the \"USA TODAY\" list of \"50 best cities to live in.\" Are we justified in saying that \"Johns Creek is a tiny village located in Fulton County in the U.S. state of Georgia.\"? Yes, no, or maybe? No\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008. Are we justified in saying that \"Laura Elena Z\u00fa\u00f1iga Huizar met Bush.\"? Yes, no, or maybe? Maybe\n###\nWhat Happens Next is the eighth studio album by English post-punk band Gang of Four. It was released on 24 February 2015 through Metropolis Records and Membran record label. It is the band's first album to feature John \"Gaoler\" Sterry on vocals, following vocalist Jon King's departure, which left the guitarist Andy Gill as the sole original member of the band. Are we justified in saying that \"Gang of Four had eight albums released by Metropolis Records. \"? Yes, no, or maybe?", "doc_id": 463, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21860, 17591, 1127, 5398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "View from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant. Are we justified in saying that \"View from the Top was seen by Obama.\"? Yes, no, or maybe? Maybe\n###\nRecorrupted is a limited edition EP by Whitechapel that was released on November 8, 2011 through Metal Blade Records. It consists of one original song, two of their previously released songs remixed (\"This Is Exile\" and \"Breeding Violence\"), an acoustic version of \"End of Flesh\" and a cover of the Pantera song \"Strength Beyond Strength\". Are we justified in saying that \"Recorrupted is primarily original material for that album.\"? Yes, no, or maybe? No\n###\nThe 1994 Nebraska Cornhuskers football team represented the University of Nebraska\u2013Lincoln in the 1994 NCAA Division I-A football season. The team was coached by Tom Osborne and played their home games in Memorial Stadium in Lincoln, Nebraska. The Cornhuskers offense scored 459 points while the defense allowed 162 points. Are we justified in saying that \"The Cornhuskers scored more points than they ever had before in the 1994 game\"? Yes, no, or maybe? Maybe\n###\nFrank John Gorshin, Jr. (April 5, 1933 \u2013 May 17, 2005) was an American character actor, impressionist, and comedian. He was perhaps best known as an impressionist, with many guest appearances on \"The Ed Sullivan Show\" and \"Tonight Starring Steve Allen\". His most famous acting role was as the Riddler on the live-action television series \"Batman\". Are we justified in saying that \"Frank John Gorshin, Jr. played the Riddler in the Batman movie.\"? Yes, no, or maybe? No\n###\n169th Street is a local station on the IND Queens Boulevard Line of the New York City Subway. Located at the intersection of 169th Street and Hillside Avenue in Queens, it is served by the F train at all times. This is the closest subway station to the 165th Street Bus Terminal after the closure of the nearby 168th Street BMT Station on Jamaica Avenue in 1977. Are we justified in saying that \"169th Street is served by the train that is the sixth letter of the alphabet. \"? Yes, no, or maybe?", "doc_id": 810, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41359, 11693, 44467, 20858], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In the Ugric mythology, Kaltes-Ekwa (Khanty, Kaltes Ankw) was the mother of the hero Mir-Susne-Hum and the wife of the god Num-Torum, who defeated her in heaven. 
She was also a goddess of the moon associated with the month April; a birth giving goddess (she is called upon by women in child-birth); goddess of fate; goddess of dawn and a shape-shifter, often shown manifested as a hare. Are we justified in saying that \"Khanty was the grand mother of the hero\"? Yes, no, or maybe? Maybe\n###\nCeres\u2013Negros Football Club, commonly referred to as Ceres\u2013Negros or just Ceres, is a Filipino football club based in the city of Bacolod, Negros Occidental that plays in the Philippines Football League. The club is a member of the Negros Occidental Football Association. It was previously known as the Ceres\u2013La Salle Football Club. Are we justified in saying that \"Ceres Negros Football Club has no sponsors\"? Yes, no, or maybe? Maybe\n###\nThe St. Louis Cardinals 1984 season was the team's 103rd season in St. Louis, Missouri and the 93rd season in the National League. The Cardinals went 84-78 during the season and finished 3rd in the National League East, 12\u00bd games behind their arch-rivals, the Chicago Cubs. It was also the final season of the Columbia blue road uniforms for the Cardinals. Are we justified in saying that \"The cardinals were once in the American League\"? Yes, no, or maybe? Maybe\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Tillia tepe is the site of dinosaur bones.\"? Yes, no, or maybe? Maybe\n###\nWest Palm Beach Municipal Stadium, referred to as \"Municipal Stadium\", located at 755 Hank Aaron Drive, was a ballpark in West Palm Beach, Florida and the long-time spring training home for the Milwaukee and Atlanta Braves and Montreal Expos. The Braves played spring training games at the stadium from 1963 to 1997, while the Expos played there from 1969 to 1972 and from 1981 to 1997. Are we justified in saying that \"The Braves played at Municipal Stadium in the fall.\"? Yes, no, or maybe?", "doc_id": 436, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31397, 18154, 23444, 13667], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 18th Annual Latin Grammy Awards will be held on Thursday, November 16, 2017 at the MGM Grand Garden Arena in Las Vegas. It will be broadcast on Univision at 8PM ET\\PT. This will mark the tenth year Las Vegas hosts the Latin Grammy Awards and will also mark the telecasts return to the MGM Grand Garden Arena. Are we justified in saying that \"The 18th Annual Latin Grammy awards will be simulcast on American television as well.\"? Yes, no, or maybe? Maybe\n###\nNosopsyllus fasciatus, the northern rat flea, is a species of flea found on domestic rats and house mice. 
Northern rat fleas are external parasites, living by hematophagy off the blood of rodents. It is the most widely spread of its genus, having originated in Europe, but has been transported to temperate regions all over the world. Are we justified in saying that \"Nosopsyllus fasciatus originated in Italy which later sped across all of Europe and World.\"? Yes, no, or maybe? Maybe\n###\nElizabeth City State University (ECSU) is a public, historically black college located in Elizabeth City, North Carolina, in the United States. ECSU, which enrolls nearly 2,500 students in 37 baccalaureate programs and three master's degree programs, is a member-school of the Thurgood Marshall College Fund, as well as a member-institution of the University of North Carolina system. Are we justified in saying that \"ECSU has more undergrad than grad degrees\"? Yes, no, or maybe? Maybe\n###\nAm\u00e9lie Simone Mauresmo ] (born 5 July 1979) is a French former professional tennis player, and a former world No. 1. Mauresmo won two Grand Slam singles titles at the Australian Open and at Wimbledon, and also won a Silver Medal at the 2004 Summer Olympics. Are we justified in saying that \"Am\u00e9lie Simone Mauresmo is good at badminton.\"? Yes, no, or maybe? Maybe\n###\nKimberly Ane Peirce (born September 8, 1967) is an American feature film director, best known for her debut feature film, \"Boys Don't Cry\" (1999). Her second feature, \"Stop-Loss\", was released by Paramount Pictures in 2008. Her most recent feature film, \"Carrie\", was released on October 18, 2013. Are we justified in saying that \"Kim Peirce was born on the eight day of September in the early 1960s.\"? Yes, no, or maybe?", "doc_id": 499, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16772, 31974, 44579, 13549], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California. Are we justified in saying that \"Kinsey Millihone currently lives in California\"? Yes, no, or maybe? No\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single. Are we justified in saying that \"Come Back in One Piece was a massive hit.\"? Yes, no, or maybe? No\n###\nTommy Stewart is an American trumpeter, arranger, composer, and record producer. He has been a member of the Magic City Jazz Orchestra, Cleveland Eaton and the Alabama All-Stars, the Alabama Jazz Hall of Fame All-Stars, and Ray Reach and Friends. He was a 1988 inductee into the Alabama Jazz Hall of Fame. 
Are we justified in saying that \"Stewart was actually a terribly unskilled trumpeter and got by using an electronic attachment that played his trumpet for him.\"? Yes, no, or maybe? Maybe\n###\nHarriston (population 1,797) is a community in the Town of Minto in Wellington County, Ontario, Canada. In 1999, Harriston was amalgamated with the communities of Palmerston, Clifford, and Minto Township to form the Town of Minto. Harriston is located at the headwaters of the Maitland River, and has several shops, restaurants, a library, an art gallery and cultural centre. Are we justified in saying that \"Harriston is part of a town that has several other communities.\"? Yes, no, or maybe? Yes\n###\nValentino D. B. Mazzia (February 17, 1922 \u2013 March 10, 1999) was an American physician who served as chairman of the department of anesthesiology at the New York University School of Medicine and was a pioneer in the forensic analysis of deaths occurring during surgical procedures. He testified in many criminal cases about the use and presence of anesthesia products in cases of death. Are we justified in saying that \"Valentino was very knowlegabel about anesthesia.\"? Yes, no, or maybe?", "doc_id": 31, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17061, 41631, 6486, 37400], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa. Are we justified in saying that \"Jara is a European language spoken by 46000 people\"? Yes, no, or maybe? No\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"The 2002 Indian vice-presidential election has an A.\"? Yes, no, or maybe? Maybe\n###\nTexas Monthly v. Bullock 489 U.S. 1 (1989) was a case brought before the US Supreme Court in November 1988. The case (initiated by the publishers of \"Texas Monthly\", a well-known general-interest magazine in Texas) was to test the legality of a Texas statute that exempted religious publications from paying state sales tax. Are we justified in saying that \"Texas Monthly v. Bullock was a case initiated against Texas Monthly.\"? Yes, no, or maybe? No\n###\nChristian Darcy Bisson (born August 14, 1989) is a Canadian professional baseball second baseman in minor league baseball organization of the San Diego Padres of Major League Baseball. Prior to beginning his professional career, he played college baseball at the University of Kentucky. Bisson has also competed for the Canadian national baseball team. 
Are we justified in saying that \"Christian Darcy Bisson is a Canadian professional baseball second baseman and a fat man\"? Yes, no, or maybe? Maybe\n###\nLloyd Newton Morrisett, Jr. (born November 2, 1929) is an American experimental psychologist with a career in education, communications, and philanthropy. He is one of the founders of the Sesame Workshop, the organization famous for the creation of the children's television shows \"Sesame Street\" which was also co-created by him, \"The Electric Company\", and many others. Are we justified in saying that \"Lloyd Newton Morrisett, Jr. is dead\"? Yes, no, or maybe?", "doc_id": 114, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8482, 8933, 26042, 5486], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Port Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521. Are we justified in saying that \"The census took place the year before 2012.\"? Yes, no, or maybe? Yes\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central. Are we justified in saying that \"The Daily Show has had more then one permanent host. \"? Yes, no, or maybe? Maybe\n###\n\"Something from Nothing\" is a song by the American rock band Foo Fighters from their eighth studio album \"Sonic Highways\". It was released as the album's lead single on October 16, 2014. Recorded at Steve Albini's Electrical Audio studio, the song was influenced by the Chicago music scene. Are we justified in saying that \"\"Something from Nothing\" will be the main theme of the next Marvel movie\"? Yes, no, or maybe? Maybe\n###\nHudson Valley Community College, a SUNY associated two-year college, is located in Troy in Rensselaer County, New York. Although about eighty percent of the students are from the local area, the remainder are from other parts of New York, other states and from some 30 countries around the world. Are we justified in saying that \"Hudson Valley Community College has students from all countries in the world\"? Yes, no, or maybe? No\n###\nThe Newnes railway line (also called Wolgan Valley Railway) is a closed and dismantled railway line in New South Wales, Australia. The line ran for 32 mi from the Main Western line to the township of Newnes. Along the way, it passed through a tunnel now known as the Glowworm Tunnel, because it is famous for its glow-worms. The tunnel is now contained within the Wollemi National Park. Are we justified in saying that \"New South Wales, Australia is west and east of the Indian Ocean.\"? 
Yes, no, or maybe?", "doc_id": 232, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6695, 15803, 4877, 16919], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Prom Night IV: Deliver Us from Evil is a 1992 Canadian slasher horror film directed by Clay Borris and starring Nicole de Boer and J.H. Wyman. The film follows a deranged Catholic priest who begins murdering teenagers on their prom night. It is the fourth and final film in the \"Prom Night\" franchise. Like the previous , it was released briefly in theaters before later being released to video. Are we justified in saying that \"Clay Borris is a Catholic Priest.\"? Yes, no, or maybe? Maybe\n###\nJango is a crime-comedy series produced in 1961 by Associated Rediffusion for British television. It starred Robert Urquhart in the lead role of Jango Smith, with Moira Redmond as Dee Smith, his wife. The show also featured performances by Peter Sallis and Brian Wilde. Are we justified in saying that \"Jango is Brian Wilde's debut role.\"? Yes, no, or maybe? Maybe\n###\nMatsuri Mizuguchi (\u6c34\u53e3 \u307e\u3064\u308a , Mizuguchi Matsuri , born October 28, 1986 in Yamagata Prefecture) is a Japanese voice actress who started her career in 2007. She is affiliated with Yellowtail. This voice actress shares the same exact date of birth and age as another unrelated Japanese voice actress and singer, Aki Toyosaki. Are we justified in saying that \"Matsuri Mizuguchi shares the exact date of birth and age as Christopher Lee\"? Yes, no, or maybe? No\n###\nSabanc\u0131 University (Turkish: \"Sabanc\u0131 \u00dcniversitesi\" ), established in 1994, is a young foundation university located on a 1.26 million squaremeter campus which is about 40\u00a0km from Istanbul's city center. Its first students matriculated in 1999. The first academic session started on \u00a020,\u00a01999\u00a0(1999--T) . Are we justified in saying that \"Sabanc\u0131 University has an impressive alumni.\"? Yes, no, or maybe? Maybe\n###\nHudepohl Brewing Company is a brewery established in Cincinnati, Ohio in 1885 by founder Ludwig Hudepohl II. Hudepohl was the son of Bavarian immigrants and had worked in the surgical tool business before starting his brewery. Hudepohl combined with Schoenling Brewing Company in 1986. Today, the Hudepohl-Schoenling Brewing Company is a wholly owned subsidiary of Christian Moerlein Brewing Co.. Are we justified in saying that \"The Schoenling Brewing Company was established after the Hudepohl Brewing Company.\"? Yes, no, or maybe?", "doc_id": 223, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36270, 27963, 20409, 34755], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In electromagnetism, charge density is a measure of electric charge is the amount of electric charge per unit length, surface area, or volume, called the linear, surface, or volume charge density, respectively. The respective SI units are C\u22c5m, C\u22c5m or C\u22c5m. Are we justified in saying that \"electromagnetism has to do with charge\"? Yes, no, or maybe? Yes\n###\nHard Landing is a 2004 thriller novel by British author Stephen Leather. Published in 2004 by Hodder & Stoughton, it is the first book in the Dan \u2018Spider\u2019 Shepherd series. \"Hard Landing\" is an international bestseller and is available in ebook and paperback. Are we justified in saying that \"Hard Landing is said to be his most successful novel. \"? Yes, no, or maybe? Maybe\n###\nCharles Rashad Jamaal Brown (born April 10, 1987) is a former American football offensive tackle in the National Football League (NFL) for the New Orleans Saints, New York Giants and Dallas Cowboys. He played college football at USC, where he won the Morris Trophy, recognizing the best offensive and defensive linemen on the West Coast in 2009. Are we justified in saying that \"Brown was drafted by the Saints.\"? Yes, no, or maybe? Maybe\n###\nMakri (Greek: \u039c\u03ac\u03ba\u03c1\u03b7) is a village and a municipal district of the city of Alexandroupoli, Evros regional unit, Greece. In 2011 its population was 924 for the village, and 1,919 for the municipal district. It is situated on the Aegean Sea coast, 12\u00a0km west of downtown Alexandroupoli. Makri has an exit on the Egnatia Odos motorway, that passes north of the village. Are we justified in saying that \"The population of Makri was 1919 in the viilliage\"? Yes, no, or maybe? No\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013. Are we justified in saying that \"Brandon Tyler McManus was born after the 17th century\"? Yes, no, or maybe?", "doc_id": 270, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24815, 32749, 21977, 14471], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Margaret Lucille Jeanne Parker (born 24 July 1943) is a Member of the European Parliament (MEP) for the East Midlands region for the UK Independence Party. She was elected in 2014. She was born in Grantham and educated at Kesteven and Grantham Girls' School and De Montfort University where she read Law. Are we justified in saying that \"Margaret Lucille Jeanne Parker is an independent\"? Yes, no, or maybe? 
No\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers. Are we justified in saying that \"Seven Ways from Sundown was released before 1961\"? Yes, no, or maybe? Yes\n###\nMemento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano. Are we justified in saying that \"Memento has been seen by everybody.\"? Yes, no, or maybe? Maybe\n###\nPolarbr\u00f6d is a Swedish bread company. Their head office is in \u00c4lvsbyn in northern Sweden. Polarbr\u00f6d is Sweden's third-largest bread company. Its typical product is a soft compact bread formed into round, flat shapes. It is also noted for ready-made sandwiches produced from such bread and reindeer meat, which was introduced as a product in the 1960s under the name \"renkl\u00e4mma\". Are we justified in saying that \"Polarbr\u00f6d was started in the 1960s.\"? Yes, no, or maybe? Maybe\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"Joe Tinker was named after the baseball Hall of Famer, Joe Tinker.\"? Yes, no, or maybe?", "doc_id": 779, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [25294, 36011, 30946, 24949], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio. Are we justified in saying that \"Christopher Lawrence works on ABC Radio.\"? Yes, no, or maybe? Yes\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway. Are we justified in saying that \"It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1932\"? Yes, no, or maybe? No\n###\nSierpe River (Spanish: \"Rio Sierpe\") is a river of Costa Rica. 
Boat traffic is common with both locals and tourists. A broad range of wildlife can be seen from the American Crocodile, various other reptile species, and exotic fish and birds. It joins the Rio Terraba. Are we justified in saying that \"Rio Sierpe has more crocs than birds\"? Yes, no, or maybe? Maybe\n###\nAnastasija Sevastova (born 13 April 1990) is a professional tennis player from Latvia. Having retired in 2013 due to recurring injuries, Sevastova returned to competition in 2015 and became known for her campaign at the 2016 US Open, where she defeated third-seeded Garbi\u00f1e Muguruza as well as Johanna Konta en route to her first ever Grand Slam quarterfinal. Are we justified in saying that \"Anastasija Sevastova defeated Garbine Muguruza in the 2016 US Open\"? Yes, no, or maybe? Yes\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position. Are we justified in saying that \"The CTA has always had a male leader until recently.\"? Yes, no, or maybe?", "doc_id": 275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30834, 28978, 16739, 44887], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "102 Squadron \"\"Panchos\"\" (\"Esquadra 102\") was an elementary flight training squadron of the Portuguese Air Force disbanded in 1992. Formed in 1962, the squadron administered air force training and performed at air shows throughout Portugal. Between 1963 and its disbandment in 1992, the squadron lost nine pilots. Are we justified in saying that \"Panchos was let go in the early 80s\"? Yes, no, or maybe? No\n###\nThe Arkansas Mountain AVA is an American Viticultural Area located in the Ozark Mountains of northwestern Arkansas. It is part of the larger Ozark Mountain AVA, which also includes regions in Missouri and Oklahoma. The smaller Altus AVA is entirely contained within the Arkansas Mountain AVA. The Arkansas Mountain AVA includes 2880000 acre , making it the ninth largest AVA as of 2008. Are we justified in saying that \"The Ozark Mountains are inhabited with Native Americans.\"? Yes, no, or maybe? Maybe\n###\nJoseph Maurice Ravel (] ; 7 March 1875 \u2013 28 December 1937) was a French composer, pianist and conductor. He is often associated with impressionism along with his elder contemporary Claude Debussy, although both composers rejected the term. In the 1920s and 1930s Ravel was internationally regarded as France's greatest living composer. Are we justified in saying that \"Joseph Maurice Ravel was skinny.\"? Yes, no, or maybe? Maybe\n###\nAnti-D\u00fchring (German: \"Herrn Eugen D\u00fchrings Umw\u00e4lzung der Wissenschaft\" , \"Herr Eugen D\u00fchring's Revolution in Science\") is a book by Friedrich Engels, first published in German in 1878. It had previously been serialised in a periodical. 
There were two further German editions in Engels' lifetime. \"Anti-D\u00fchring\" was first published in English translation in 1907. Are we justified in saying that \"Anti-D\u00fchring starts with C.\"? Yes, no, or maybe? No\n###\nVia Dante is an important and elegant pedestrian street in central Milan, Italy, connecting Piazzale Cordusio (Cordusio (Milan Metro)) and Largo Cairoli (Cairoli (Milan Metro)). It is very near to the city's Castello Sforzesco and is named after the Florentine poet Dante Alighieri. It is known for containing several theatres, shops, restaurants, caf\u00e9s, palaces and bars. Are we justified in saying that \"Milan's streets are named after poets.\"? Yes, no, or maybe?", "doc_id": 580, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27761, 39195, 4378, 24550], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Kingfisher Ultra Indian Derby, or simply the Indian Derby, is an Indian annual Thoroughbred horse race. It is a 2,400-metre race held on the first Sunday of February on the Mahalaxmi Racecourse in Mumbai and is one of the premier sporting activities in the city. Are we justified in saying that \"The Indian Derby is the first horse race held in Mumbai\"? Yes, no, or maybe? Maybe\n###\nDave Ward, born 12 July 1959, is a British Trade Unionist and General Secretary of the Communication Workers\u2019 Union (CWU), which was formed through the merger of the Union of Communication Workers and the National Communications Union in 1995. The CWU is the largest Trade Union in the United Kingdom for people working in the Postal and Telecommunications industry with over 200,000 members. Are we justified in saying that \"Dave Ward was born on the same month as the holiday July 4th, where America celebrated its independence.\"? Yes, no, or maybe? Yes\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"Hope depot is based in kansas\"? Yes, no, or maybe? No\n###\nRewire (formerly RH Reality Check, long name Reproductive Health Reality Check) is a website focused on reproductive and sexual health from a pro-reproductive rights perspective. The website began as a UN Foundation blog in 2006, and became its own nonprofit organization in January 2012. In 2016, it was renamed \"Rewire\". Are we justified in saying that \"Reproductive Health Reality Check is a longer version of RH Reality Check.\"? Yes, no, or maybe? Yes\n###\n\"The Dog Said Bow-Wow\" is a science fiction short story by American writer Michael Swanwick, published in 2001. It won the 2002 Hugo Award for Best Short Story and was nominated for the 2002 Nebula Award for Best Short Story. \"The Dog Said Bow-Wow\" is the title story of his 2007 short story collection, published by Tachyon Publications, and was reprinted in the same year in \"\". 
Are we justified in saying that \"\"The Dog Said Bow-Wow\" is about the rapper Bow Wow.\"? Yes, no, or maybe?", "doc_id": 699, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20983, 18837, 12723, 7514], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Creation of Adam is a fresco painting by Michelangelo, which forms part of the Sistine Chapel's ceiling, painted c. 1508\u20131512. It illustrates the Biblical creation narrative from the Book of Genesis in which God breathes life into Adam, the first man. The fresco is part of a complex iconographic scheme and is chronologically the fourth in the series of panels depicting episodes from Genesis. Are we justified in saying that \"The Creation of Adam is a fresco painting by Michelangelo painted in the 1500's\"? Yes, no, or maybe? Yes\n###\nHe is a member of the Royal Shakespeare Company and later joined the Renaissance Theatre Company. He has appeared in many of Kenneth Branagh's films, most recently as Corin in the 2006 film \"As You Like It\". Yuill was also the music composer for \"A Midwinter's Tale\" and \"Swan Song\". Are we justified in saying that \"Yuill only composed for \"Swan Song,\" and none other.\"? Yes, no, or maybe? No\n###\nPunjab Control of Organised Crime Act, (PCOCA) is law to be enacted by Punjab state in India to combat organised crime. It is in process of approval as the Punjab Cabinet has yet not given its approval on account of few reservations about various clauses of the Act.The Act is designed on the pattern of Maharashtra Control of Organised Crime Act enacted in 1999. Are we justified in saying that \"The Punjab Control of Organised Crime Act was enacted in the state of Iraq.\"? Yes, no, or maybe? No\n###\nDr. Edward Vivian Scobie (1918 \u2013 14 November 1996) was a Dominican-born journalist, magazine publisher and historian. He is best known for his research into the black history of Western Europe and his 1972 seminal book \"Black Britannia: A History of Blacks in Britain\". Are we justified in saying that \"Dr. Edward Vivian Scobie was over 50 when he died\"? Yes, no, or maybe? Yes\n###\nMatsuri Mizuguchi (\u6c34\u53e3 \u307e\u3064\u308a , Mizuguchi Matsuri , born October 28, 1986 in Yamagata Prefecture) is a Japanese voice actress who started her career in 2007. She is affiliated with Yellowtail. This voice actress shares the same exact date of birth and age as another unrelated Japanese voice actress and singer, Aki Toyosaki. Are we justified in saying that \"Matsuri Mizuguchi and Aki Toyosaki were born in the same place on the same day.\"? Yes, no, or maybe?", "doc_id": 190, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28516, 40993, 10194, 8558], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Goodlettsville is a city in Davidson and Sumner counties, Tennessee. Goodlettsville was incorporated as a city in 1958 with a population of just over 3,000 residents; at the 2010 census, the city had a total population of 15,921 and in 2015 the population was 16,994. Goodlettsville chose to remain autonomous in 1963 when the city of Nashville merged with the government of Davidson County. Are we justified in saying that \"Goodlettsville was the only place to remain autonomous during the merger\"? Yes, no, or maybe? Maybe\n###\nQingtongxia () literally, \"Bronze Gorge\" is a city in the province of Ningxia in the north of China. Administratively, Qingtongxia is a county-level city within the prefecture-level city of Wuzhong. It is located on the left (northwestern) bank of the Yellow River, opposite and a bit upstream of Wuzhong main urban area. Are we justified in saying that \"Qingtongxia is a city within the prefecture-level city of Wuzhong.\"? Yes, no, or maybe? Yes\n###\nLes Soir\u00e9es de Nazelles, FP 84, is a set of variations for piano written by the French composer Francis Poulenc. During the evenings, the composer used to sit at the piano and improvise \"portraits\" of his friends, all based on a given theme. The work was begun in 1930, and completed at Noizay on October 1, 1936. At the beginning of the score, it reads: Are we justified in saying that \"Les Soir\u00e9es de Nazelles is Francis Poulenc's most famous song.\"? Yes, no, or maybe? Maybe\n###\n\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982). Are we justified in saying that \"\"King of the Jungle\" was a popular song in japan\"? Yes, no, or maybe? Maybe\n###\nThe Florida Board of Regents was from 1965 to 2001 the governing body for the State University System of Florida, which includes all public universities in the state of Florida, United States. It was created to replace a predecessor body called the Florida Board of Control, which had existed from 1905. Its powers are now held by the Florida Board of Governors. Are we justified in saying that \"The Florida Board of Regents spanned into two centuries.\"? Yes, no, or maybe?", "doc_id": 672, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32269, 17812, 669, 29784], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "weRead, formerly iRead, is an online community of book enthusiasts. weRead started out as a social cataloging application on Facebook in June 2007 and has since expanded to over 3.1 million active members across Facebook, MySpace, Orkut, Hi5, and Bebo. 
Are we justified in saying that \"weRead started out as an application in 2007\"? Yes, no, or maybe? Yes\n###\nGeorge Corrie (born 16 September 1973) is an English footballer, born in Workington, who played for ten years as a midfielder for American USL Second Division side Wilmington Hammerheads, of which he was the captain. He joined the Hammerheads in 1999 after six seasons with Conference North team Workington A.F.C.. Are we justified in saying that \"George Corrie (born 16 September 1973) is an English footballer who played baseball for the Wilmington Hammerheads.\"? Yes, no, or maybe? No\n###\nKilimanjaro Native Cooperative Union (KNCU) is a cooperative federation in Tanzania and the oldest cooperative in Africa, founded in 1930 by Charles Dundas. KNCU is owned by the farmers of the 90 primary cooperative societies which buy coffee from the farmers on Kilimanjaro. Offices for the cooperative are located in Moshi. Are we justified in saying that \"The KNCU stands for the Kilimanjaro Native Cooperative Union.\"? Yes, no, or maybe? Yes\n###\nThe Golden Fetter is a 1917 American romance silent film directed by Edward LeSaint and written by Charles Tenney Jackson and Charles Maigne. The film stars Wallace Reid, Anita King, Tully Marshall, Guy Oliver, Walter Long and Mrs. Lewis McCord. The film was released on January 25, 1917, by Paramount Pictures. Are we justified in saying that \"The Golden Fetter did not do well in the box office.\"? Yes, no, or maybe? Maybe\n###\nOperation Mojo is part documentary and part rock-mockumentary musical comedy of the TEENick series The Naked Brothers Band. It's the sixth television movie of \"The Naked Brothers Band\", and the second of season 3. The movie aired on Nickelodeon on November 22, 2008 Are we justified in saying that \"The Naked Brothers Band was popular for teens.\"? Yes, no, or maybe?", "doc_id": 490, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43569, 38988, 9621, 20168], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Stranger Left No Card (1952) is a British short film directed by Wendy Toye. The film won the Best Fiction award at the 1953 Cannes Film Festival, where it was described as \"a masterpiece\" by Jean Cocteau. It marked the film debut of actor Alan Badel. Are we justified in saying that \"Alan used the film to kickstart his career\"? Yes, no, or maybe? Yes\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter. Are we justified in saying that \"Colin Francis Weeber Isaacs is a Canadian politician.\"? Yes, no, or maybe? Yes\n###\nVitacost.com, Inc is an American e-commerce company based in Boca Raton, Florida, that sells vitamins, supplements and organic grocery products. The company was bought by Kroger, in 2014. Vitacost was inducted into Inc Magazine's \"Inc. 
500 Lifetime Hall of Fame,\" in 2006 as one of the US's 500 fastest-growing privately held businesses for five consecutive years (2001\u20132005). Are we justified in saying that \"Vitacost.com sells vitamins and energy drinks \"? Yes, no, or maybe? Maybe\n###\n\"Never Be Rude to an Arab\" is a satirical song by the members of \"Monty Python's Flying Circus\", originally appearing on the 1980 album \"Monty Python's Contractual Obligation Album\". It appears as sung by Terry Jones in the theatrically released concert film \"Monty Python Live at the Hollywood Bowl\" and was included on the compilation album \"Monty Python Sings\". Are we justified in saying that \"The 1980 album \"Monty Python's Contractual Obligation Album\" was released on January 19th.\"? Yes, no, or maybe? Maybe\n###\nBig Sky is a census-designated place (CDP) in Gallatin and Madison counties in southwestern Montana. As of the 2010 census it had a population of 2,308. It is 45 mi southwest of Bozeman. This unincorporated community straddles the two counties, is not considered a town, and has no town government. The primary industry of the area is tourism. Are we justified in saying that \"Bozeman is not in Montana with Gallatin and Madison\"? Yes, no, or maybe?", "doc_id": 288, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2525, 41858, 39057, 36204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Maniac (stylized as MANIAC) is an American short slasher film, directed by Shia LaBeouf. It was released on October 31, 2011. The short film stars American rappers Scott \"Kid Cudi\" Mecudi and Chris \"Cage\" Palko, as French-speaking serial killers. Mescudi and Palko also co-wrote the film with LaBeouf. Are we justified in saying that \"57% of the patrons seeing the film Maniac in the movie theater prefer their popcorn with extra butter.\"? Yes, no, or maybe? Maybe\n###\nRemember the Daze is a 2007 drama film released in theaters in April 2008. The film was directed by Jess Manafort. The plot of the movie has been described as \"a glimpse into the teenage wasteland of suburbia 1999 that takes place over 24-hours, and the teenagers who make their way through the last day of high school in the last year of the past millennium.\" Are we justified in saying that \"Remember the Daze was released in 2008\"? Yes, no, or maybe? No\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Five more movies came out in 1922. \"? Yes, no, or maybe? Maybe\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. 
Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city. Are we justified in saying that \"The population increased by 153 people\"? Yes, no, or maybe? Yes\n###\nThe South Africa national cricket team toured England from May to September 1912 and took part in the 1912 Triangular Tournament, playing three Test matches each against the England national cricket team and the Australia national cricket team. The tournament was won by England. South Africa were captained by Frank Mitchell and Louis Tancred. Are we justified in saying that \"Frank Mitchell captained a test match without Louis Tanced.\"? Yes, no, or maybe?", "doc_id": 349, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30622, 25408, 43152, 35696], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"Eva Strong is a character in Hollyoaks\"? Yes, no, or maybe? Yes\n###\nThe Kyrkog\u00e5rden Runestones are three Viking Age memorial runestones located at the cemetery of St. Mary's Church in Sigtuna, Stockholm County, Sweden, in the historic province of Uppland. One of the runic inscriptions documents the existence of a Viking Age mercantile guild in Sweden. Are we justified in saying that \"The Kyrkog\u00e5rden Runestones are Incan monument\"? Yes, no, or maybe? No\n###\nOliver Francis O'Grady (born June 5, 1945) is an Irish laicized Catholic priest who raped, molested and abused at least 25 children in California from 1973 onwards. His abuse and Cardinal Roger Mahony's attempts to hide the crimes are the subject of Amy J. Berg's documentary film \"Deliver Us from Evil\" in 2006. Are we justified in saying that \"Cardinal Roger Mahony was eventually unsuccessful in hiding the crimes.\"? Yes, no, or maybe? Yes\n###\nEllon Castle is a scheduled monument within the town of Ellon, Aberdeenshire. Only ruins survive of the 16th-century structure that may incorporate sections from the 15th century together with 18th-century renovations. The ruins form a focal point in a formal 6 acre garden planted in 1745; an older Category A listed sundial dating from c. 1700 forms the centrepiece to the garden. Are we justified in saying that \"The garden at Ellon Castle was conceived in 1743.\"? Yes, no, or maybe? Maybe\n###\nThe Tampa Bay Buccaneers season was the franchise's 40th season in the National Football League and the second under head coach Lovie Smith. The offseason was marked by the draft selection of All-American Florida State quarterback Jameis Winston first overall in the 2015 NFL Draft. Are we justified in saying that \"The Tampa Bay Buccaneers have played in 46 NFL seasons. \"? 
Yes, no, or maybe?", "doc_id": 404, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15150, 32493, 9222, 17526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico. Are we justified in saying that \"The USFC \"Fish Hawk\" was not in operation in 1962. \"? Yes, no, or maybe? Yes\n###\nBear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city. Are we justified in saying that \"The population was the sum 800 + 53 in 2010\"? Yes, no, or maybe? Yes\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"Coates was inspired to write a story about the ketch because of the sailing trips he took with his father.\"? Yes, no, or maybe? Maybe\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues. Are we justified in saying that \"The Centre for Policy Studies focuses on business issues the most.\"? Yes, no, or maybe? Maybe\n###\nGlaiza Herradura-Agullo (born February 24, 1978) is a Filipino former child actress. She was the first-ever grand winner of the Little Miss Philippines segment of \"Eat Bulaga!\" in 1984. She starred in RPN-9's television series \"Heredero\" with Manilyn Reynes and Richard Arellano. She won the 1988 FAMAS Best Child Actress award for her role in \"Batas Sa Aking Kamay\" starring Fernando Poe, Jr.. Are we justified in saying that \"Herradura-Agullo was eight years old in 1984.\"? Yes, no, or maybe?", "doc_id": 916, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32239, 6140, 7965, 44993], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Junoon (Hindi: \u091c\u0941\u0928\u0942\u0928, translation: \"The Obsession\") is a 1978 Indian Hindi language film produced by Shashi Kapoor and directed by Shyam Benegal. The film is based on Ruskin Bond's fictional novella, \"A Flight of Pigeons\", set around the Indian Rebellion of 1857. The film's soundtrac was composed by Vanraj Bhatia, and cinematography by Govind Nihalani. Are we justified in saying that \"A flight of pigeons is a multi part novella\"? Yes, no, or maybe? Maybe\n###\nThe 2000 Family Circle Cup was the 28th edition of the Family Circle Cup tennis tournament. This WTA Tier I Event was held at the Family Circle Tennis Center in Hilton Head, South Carolina, United States. First-seeded Mary Pierce won the singles title and earned $166,000 first-prize money. Are we justified in saying that \"The 2000 Family Circle Cup was the 28th time Mary Pierce played in the event.\"? Yes, no, or maybe? Maybe\n###\n\"The Day the Earth Stood Stupid\" is the seventh episode in season three of \"Futurama\". It originally aired on the Fox network in the United States on February 18, 2001. The title of this episode is a play on the title of the 1951 science fiction film, \"The Day the Earth Stood Still\". Are we justified in saying that \"The Day the Earth Stood Stupid was the only time the show used early sci fi films as inspiration that year. \"? Yes, no, or maybe? Maybe\n###\nOleg Smirnov (born April 8, 1980) is a Russian professional ice hockey right winger currently playing for HC Ryazan in the Russian Major League. He played in the Russian Superleague for Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk. He was drafted 144th overall in the 1998 NHL Entry Draft by the Edmonton Oilers. Are we justified in saying that \"Smirnov was drafted while a woman was the president of the United States.\"? Yes, no, or maybe? No\n###\nUp the River (1938) is a prison comedy film starring Preston Foster and Arthur Treacher and featuring Bill \"Bojangles\" Robinson. The movie was directed by Alfred L. Werker and is a remake of a 1930 film with the same title directed by John Ford and starring Spencer Tracy and Humphrey Bogart in the roles subsequently played by Foster and Tony Martin. Are we justified in saying that \"Up the River stars Preston Fost\"? Yes, no, or maybe?", "doc_id": 534, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29472, 4771, 45002, 16323], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Elmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County. 
Are we justified in saying that \"Elmira has a low population due to hard to live in weather\"? Yes, no, or maybe? Maybe\n###\nTiggy (born 1970 as Charlotte Vigel) is a Danish bubblegum/Eurodance artist. She is perhaps best known for her remix of the Sandy Fox song \"Freckles\" in \"\", originally the English version of the song \"Sobakasu\" by Judy and Mary from the anime \"Rurouni Kenshin\" and she's also popular in parts of Southeast Asia with the song \"Why\". Are we justified in saying that \"Tiggy is a famous painter\"? Yes, no, or maybe? No\n###\nThe Joint Special Operations University (JSOU) is the designated agency within USSOCOM to conduct joint Special Operations Force (SOF) education and thus is tasked with and directed to provide relevant, realistic, leading-edge education opportunities to military and civilian special operations forces personnel around the world, located at MacDill Air Force Base, Florida, USA. Are we justified in saying that \"The S in SOF stands for Super\"? Yes, no, or maybe? No\n###\n2009, Year of Us is the third extended play (EP) by South Korean boy group Shinee. It consists of six tracks and it incorporates alternative rock and hip-hop music genres. The digital version of the album was released on October 19, 2009, with a physical release on October 22. The title track, \"Ring Ding Dong\" was released on October 14, 2009 through various music sites. Are we justified in saying that \"Shinee doesn't speak Korean.\"? Yes, no, or maybe? No\n###\n\"Trap Queen\" is the debut single by American rapper Fetty Wap from his self-titled debut album (2015). Following its online premiere in March 2014, it was released independently on April 22, 2014 before being re-released in conjunction with 300 Entertainment on December 15, 2014. The song was well received by critics who praised the vocals and production for being cheery and infectious. Are we justified in saying that \"Trap Queen was a popular first release by Fetty Wap.\"? Yes, no, or maybe?", "doc_id": 305, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12122, 12323, 7569, 33328], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release. Are we justified in saying that \"Shameless Self-Promotion was the band's most popular album\"? Yes, no, or maybe? Maybe\n###\nKasey Peters (born May 20, 1987) is a former American football quarterback. He played college football at Saddleback, Santa Ana, Grand Valley State and Rocky Mountain. He was a member of the Tri-Cities Fever, New Mexico Stars, West Texas Wildcatters, Duke City Gladiators, Las Vegas Outlaws, New Orleans VooDoo, Portland Steel and Monterrey Steel. Are we justified in saying that \"Kasey Peters now has a career in another field.\"? 
Yes, no, or maybe? Maybe\n###\nThe Portezuelo Formation is a geologic formation outcropping in the Mendoza, R\u00edo Negro and Neuqu\u00e9n provinces of Argentina. It is the fourth-oldest formation in the Neuqu\u00e9n Group and the older of the two formations in the R\u00edo Neuqu\u00e9n Subgroup. Formerly, that subgroup was treated as a formation, and the Portezuelo Formation was known as the Portezuelo Member. Are we justified in saying that \"There are ten other formations in the Neuqu\u00e9n Group.\"? Yes, no, or maybe? Maybe\n###\nEdward Gibbon FRS ( ; 8 May 173716 January 1794) was an English historian, writer and Member of Parliament. His most important work, \"The History of the Decline and Fall of the Roman Empire\", was published in six volumes between 1776 and 1788 and is known for the quality and irony of its prose, its use of primary sources, and its open criticism of organised religion. Are we justified in saying that \"Edward Gibbon FRS was a known athiest.\"? Yes, no, or maybe? Maybe\n###\nThe Mercedes-Benz W221 is a chassis code of S-Class, the successor of the Mercedes-Benz S-Class (W220) and the predecessor of the Mercedes-Benz S-Class (W222). The S-Class are the flagship vehicles of Mercedes-Benz and each generation typically introduces a range of technical innovations and developments that over time will find their way into smaller cars. Are we justified in saying that \"The The Mercedes-Benz W220 was a great influence W221\"? Yes, no, or maybe?", "doc_id": 286, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1501, 8410, 44951, 29838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sherwood Stewart (born June 6, 1946, in Goose Creek, Texas, United States) played amateur and professional tennis in the 1970s and 1980s. Stewart was ranked as high as No. 60 in the world in singles on the ATP Rankings on December 31, 1978, and No. 4 in doubles on January 3, 1983. Are we justified in saying that \"Stewart was ranked as high as number 60 in the world of singles in 1978 because his dad paid someone a good amount of money\"? Yes, no, or maybe? Maybe\n###\nThe Canyons is a 2013 American erotic thriller-drama film directed by Paul Schrader and written by Bret Easton Ellis. The film is set in Los Angeles and stars Lindsay Lohan, James Deen, Nolan Funk, Amanda Brooks, and Gus Van Sant. It received a limited release on August 2, 2013 at the IFC Center in New York City, the Bell Lightbox in Toronto, and on video on demand platforms. Are we justified in saying that \"Lindsay Lohan starred in the movie The Canyons.\"? Yes, no, or maybe? Yes\n###\nJustin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017. Are we justified in saying that \"Justin Tinucci was born in South America.\"? Yes, no, or maybe? 
No\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\". Are we justified in saying that \"Kidsty Pike has been seen by earl.\"? Yes, no, or maybe? Maybe\n###\nAziyad\u00e9 (1879; also known as Constantinople) is a novel by French author Pierre Loti. Originally published anonymously, it was his first book, and along with \"Le Mariage de Loti\" (1880, also published anonymously), would introduce the author to the French public and quickly propel him to fame; his anonymous persona did not last long. Are we justified in saying that \"Aziyad\u00e9 was published more than 50 months ago.\"? Yes, no, or maybe?", "doc_id": 944, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9424, 42867, 3704, 26057], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Choirboys is a 1977 American comedy-drama film directed by Robert Aldrich, written by Christopher Knopf and Joseph Wambaugh based on Wambaugh's novel of the same title. It features an ensemble cast including Charles Durning, Louis Gossett, Jr., Randy Quaid, and James Woods. The film was released to theaters by Universal Pictures on December 23, 1977. Are we justified in saying that \"Louis was the youngest of the cast\"? Yes, no, or maybe? Maybe\n###\nGladys Leslie (March 5, 1899 \u2013 October 2, 1976) was an American actress in silent film, active in the 1910s and 1920s. Though less-remembered than superstars like Mary Pickford, she had a number of starring roles from 1917 to the early 1920s and was one of the young female stars of her day. Are we justified in saying that \"Audiences loved Gladys' voice.\"? Yes, no, or maybe? Maybe\n###\n\"Up All Night\" is an American television sitcom created by Emily Spivey that airs on NBC. The series stars Christina Applegate and Will Arnett as Regan and Chris Brinkley, a couple who struggle to balance their home lives (especially with their newborn child, Amy) and their work lives. Are we justified in saying that \"Katey Sagal will play Christina Applegate's mom, as she did before on Married With Children.\"? Yes, no, or maybe? Maybe\n###\nAlma Ros\u00e9's father was the violinist Arnold Ros\u00e9 (n\u00e9 Rosenblum; 1863\u20131946) who was the leader of the Vienna Philharmonic Orchestra for 50 years: from 1881-1931 as well as leader of the Vienna State Opera orchestra and leader of the legendary Ros\u00e9 String Quartet. Her mother, Justine (died 22 August 1938), was Gustav Mahler's sister. Alma was named for Alma Mahler. Are we justified in saying that \"Gustav hated his sister Justine\"? Yes, no, or maybe? Maybe\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. 
It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017. Are we justified in saying that \"The tournament was organised by the game publisher.\"? Yes, no, or maybe?", "doc_id": 689, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42312, 22102, 13675, 14664], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Web of Passion (also released as Leda, original French title: \u00c0 double tour) is a 1959 French suspense thriller film directed by Claude Chabrol and based on the novel \"The Key to Nicholas Street\" by American writer Stanley Ellin. It was Chabrol's first film in the thriller genre, which would be his genre of choice for the rest of his career. The film had a total of 1,445,587 admissions in France. Are we justified in saying that \"Before Web of Passion, Chabrol didn't direct other movies.\"? Yes, no, or maybe? No\n###\nUSNS \"Lone Jack\" (T-AO-161) was a type Type T2-SE-A1 tanker laid down under Maritime Commission contract (USMC number 1783) by the Sun Shipbuilding & Dry Dock Co. of Chester, Pennsylvania (hull number 450) on 11 July 1944. The ship was launched on 21 October 1944, sponsored by Mrs. Julia W. Bruwier, and delivered to Cities Service Oil Co. of New York City on 31 October 1944. Are we justified in saying that \"The ship contained 150 soldiers\"? Yes, no, or maybe? Maybe\n###\nSamat (Kyrgyz: \u0421\u0430\u043c\u0430\u0442 ) is a small village located in Leilek District of Batken Region, Kyrgyzstan. The village is subordinated to the town of Isfana. According to the 2009 Population and Housing Census of Kyrgyzstan, at the time the population of Samat was 2,076. Are we justified in saying that \"Samat's citizens are mostly male.\"? Yes, no, or maybe? Maybe\n###\nAn election campaign was held ahead of a general election for the 54th Parliament of New South Wales on Saturday, 24 March 2007. The result\u2014a win for the social-democratic Australian Labor Party and its new leader Morris Iemma\u2014was widely perceived as a foregone conclusion, with opposition leader Peter Debnam conceding as much the week before the poll. Are we justified in saying that \"Moris Iemma was the leader of the Parliament of New South Wales in 2007. \"? Yes, no, or maybe? Yes\n###\n\"Champions\" is a song by American singer Usher and Panamanian singer Rub\u00e9n Blades, recorded for the biographical sports film, \"Hands of Stone\" and is also included on his eight studio album \"Hard II Love\". It was released by RCA on August 26, 2016, available for digital download and online streaming. The song was written by Usher, Rub\u00e9n Blades, Raphael Saadiq and Taura Stinson. Are we justified in saying that \"\"Champions\" by Usher and Ruben Blades was released less than a decade ago\"? 
Yes, no, or maybe?", "doc_id": 296, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12119, 9450, 24326, 21288], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tony Rena Snell Jr. (born November 10, 1991) is an American professional basketball player for the Milwaukee Bucks of the National Basketball Association (NBA). Snell played college basketball for the New Mexico Lobos before declaring for the NBA draft after his junior year. He was drafted with the 20th overall pick in 2013 NBA draft by the Chicago Bulls. Are we justified in saying that \"He remained with the Chicago Bulls for a period of four years.\"? Yes, no, or maybe? Maybe\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh. Are we justified in saying that \"Telephone Shilpa Sangstha launched the first Laptop made/assembled in Bangladesh, Doel, in October, 2011.\"? Yes, no, or maybe? Maybe\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. The manor house was demolished in 1808 Are we justified in saying that \"The area where Marks was located has never been built on again.\"? Yes, no, or maybe? Maybe\n###\nPersuasion was the planned fifth studio solo album by Adam Ant, planned for 1992-3 but never released. The album has however surfaced as bootlegs, and nowadays circulates on file sharing networks. This album is one of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy, a book that revealed information on the lost recordings of many famous musicians. Are we justified in saying that \"This album is the best selling of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy.\"? Yes, no, or maybe? Maybe\n###\nJoshua \"Josh\" Murphy (born 24 February 1995) is an English professional footballer who plays for Championship club Norwich City. He scored on his professional debut in a Football League Cup match against Watford. He is the twin brother of professional footballer Jacob Murphy and brother of professional singer Daniel Murphy. Are we justified in saying that \"Daniel Murphy was born in February.\"? Yes, no, or maybe?", "doc_id": 749, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33327, 22619, 37693, 7376], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The UK Overseas Territories Conservation Forum (UKOTCF) is a UK-based non-governmental organisation which promotes coordinated conservation in the UK Overseas Territories and Crown Dependencies (UKOTs and CDs). It is a not-for-profit organisation supported by grants, donations and subscriptions, and a registered charity and company. Are we justified in saying that \"UKOTCF is a trustworthy charity\"? Yes, no, or maybe? Maybe\n###\nArizona Business Magazine, based out of Phoenix, Arizona, is the state\u2019s leading monthly Business magazine. Published by AZ Big Media, the magazine covers a wide range of topics focusing on the Arizona business scene, and is aimed at high-level corporate executives and business owners. Are we justified in saying that \"Arizona Business Magazine is successful in the business scene.\"? Yes, no, or maybe? Maybe\n###\nYouth in Guatemala are the largest segment of the nation's population. Youth includes individuals between the ages of 15 and 24 Over half of the population is under 19 years old in 2011, the highest proportion of young people of any country in Latin America. The health, education, and work opportunities for young people differ by ethnicity (\"ladino\" or indigenous) and social class. Are we justified in saying that \"young people in Guatemala go to church. \"? Yes, no, or maybe? Maybe\n###\n\"The Inbetweeners\" is a BAFTA Award-winning British sitcom created by Damon Beesley and Iain Morris, and broadcast on E4. The series follows the lives of four sixth form students \u2013 Will McKenzie (Simon Bird), Simon Cooper (Joe Thomas), Jay Cartwright (James Buckley) and Neil Sutherland (Blake Harrison). The series is narrated by Will, who acts as the programme's lead character. Are we justified in saying that \"The series is narrated by Damon, who acts as the programme's lead character.\"? Yes, no, or maybe? No\n###\nSea Lion Park was a 16 acre amusement park started in 1895 on Coney Island by Paul Boyton. He fenced the property and charged admission, the park becoming the first enclosed and permanent amusement park in North America. Up until the establishment of this park, amusement areas around the country consisted of pay-as-you-go concessions. In 1903, Sea Lion Park was replaced by Luna Park. Are we justified in saying that \"It is less than 150 years old\"? Yes, no, or maybe?", "doc_id": 606, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8870, 13488, 1218, 14915], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"King of the Jungle\" is a song written and recorded by English girl group Bananarama. It was released as a single in Japan only from their self-titled second album in 1984. 
The song is the second single from Bananarama to be a Japanese-only release (following \"He's Got Tact\" in 1982). Are we justified in saying that \"Bananarama later released a third Japanese-only single.\"? Yes, no, or maybe? Maybe\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6. Are we justified in saying that \"Nydala Abbey was situated in London, England.\"? Yes, no, or maybe? No\n###\nNatasha Choufani is a Lebanese actress. Born and raised in the UAE, she grew up in a multi-cultural society. Her ability to act in different dialects and languages had helped open many doors to playing diverse characters in theater, film and TV at home and abroad. Are we justified in saying that \"natasha choufani is beautiful\"? Yes, no, or maybe? Maybe\n###\nMIT Technology Review is a magazine published by the Massachusetts Institute of Technology. It was founded in 1899 as The Technology Review, and was re-launched without \"The\" in its name on April 23, 1998 under then publisher R. Bruce Journey. In September 2005, it underwent another transition under its then editor-in-chief and publisher, Jason Pontin, to a form resembling the historical magazine. Are we justified in saying that \"MIT Technology Review was founded in 1899 with the word \"The\" in front.\"? Yes, no, or maybe? Yes\n###\nMiranda May Kerr (born 20 April 1983) is an Australian model. Kerr rose to prominence in 2007 as one of the Victoria's Secret Angels. Kerr was the first Australian Victoria's Secret model and also represented the Australian department store chain David Jones. Kerr has launched her own brand of organic skincare products, KORA Organics, and has written a self-help book. Are we justified in saying that \"Miranda May Kerr was born in the 8th decade of the 20th century.\"? Yes, no, or maybe?", "doc_id": 595, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37055, 37756, 29527, 11791], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Asian Institute is a research centre at the Munk School of Global Affairs at the University of Toronto, and is located in the historical Devonshire House, a former residential hall of the university's Trinity College. Ritu Birla is the Richard Charles Lee Director of the Asian Institute. Are we justified in saying that \"The Devonshire House is located on the outskirts of Toronto.\"? Yes, no, or maybe? Maybe\n###\nWuqiang County () is county of southeastern Hebei province, China. It is under the administration of Hengshui City, with a population of 210,000 residing in an area of 442 km2 . Both China National Highway 307 and G1811 Huanghua\u2013Shijiazhuang Expressway pass through the county. Are we justified in saying that \"Wuqiang County is a southeastern province\"? 
Yes, no, or maybe? Yes\n###\nAn opening act, warm-up act, or supporting act is an entertainment act (musical, comedic, or otherwise), that performs at a concert before the featured act, or \"headliner\". Rarely, an opening act may perform again at the end of the event, or perform with the featured act after both have had a set to themselves. Are we justified in saying that \"Headliners don't perform at the same time as the opening act\"? Yes, no, or maybe? Yes\n###\nCecilia Makiwane Hospital (CMH) is a large, provincial, government funded hospital situated in the Mdantsane township of East London, Eastern Cape in South Africa. It is a tertiary teaching hospital and forms part of the East London Hospital Complex with Frere Hospital. It is named after Cecilia Makiwane, the first African woman to become a professional nurse in South Africa. Are we justified in saying that \"CMH is the acronym for a hospital named after a person.\"? Yes, no, or maybe? Yes\n###\nLaura Warholic; or, The Sexual Intellectual is a 2007 novel by Alexander Theroux. The plot concerns the relationship between Eugene Eyestones, the writer of an advice column called \"The Sexual Intellectual\", and his editor's ex-wife, Laura Warholic, whom Eyestones pities more than likes. This basic story provides the jumping off point for Theroux's satire of American culture. Are we justified in saying that \"Laura Warholic; or, The Sexual Intellectual starts with Eugene and Laura already being divorced.\"? Yes, no, or maybe?", "doc_id": 505, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43077, 14064, 42490, 34958], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rana amurensis (Khabarovsk frog, Siberian wood frog, Heilongjiang brown frog or Amur brown frog) is a species of true frog found in northern Asia. It ranges across western Siberia, as well as northeastern China, northeastern Mongolia, and on the northern Korean Peninsula and on Sakhalin. \"Rana coreana\" was previously included in this species as a subspecies. Are we justified in saying that \"Rana amurenis can be found in the Eastern Hemisphere.\"? Yes, no, or maybe? Yes\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\". Are we justified in saying that \"Aniket Vishwasrao has appeared on TV shows.\"? Yes, no, or maybe? Maybe\n###\nClub Deportivo Cajamadrid was a professional basketball and handball team in Spain. It was founded in 1979 and the basketball team played in Liga ACB from 1983 to 1986. The club was sponsored by Caja Madrid until 1991, when the bank decided to retire its support and continued as a different club called Juventud Alcal\u00e1. Are we justified in saying that \"Club Deportivo Cajamadrid was famous for more than one sport.\"? Yes, no, or maybe? 
Yes\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams. Are we justified in saying that \"Gaming crime has decreased since 1992.\"? Yes, no, or maybe? Maybe\n###\nMick Napier (born December 12, 1962) is an American director, actor, teacher and author living in Chicago. He is the founder and artistic director of the Annoyance Theatre and an award-winning director at The Second City. He has directed Stephen Colbert, Tina Fey, Rachel Dratch, Horatio Sanz, Nia Vardalos, Andy Richter, Jeff Garlin, and David Sedaris, amongst others. Are we justified in saying that \"Mack Napier has won awards for acting.\"? Yes, no, or maybe?", "doc_id": 355, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42135, 41685, 30846, 42519], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hudson is a town in Hillsborough County, New Hampshire, United States. It is located along the Massachusetts state line. The population was 24,467 at the 2010 census, with an estimated population of 24,645 in 2013. It is the ninth-largest municipality (town or city) in the state, by population. Are we justified in saying that \"The Massachusetts state line runs through Hudson\"? Yes, no, or maybe? Yes\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000. Are we justified in saying that \"No one joined the September 2012 Sun Life financial Players' Championship.\"? Yes, no, or maybe? No\n###\nDjibouti, officially the Republic of Djibouti, is a country located in the Horn of Africa. It is bordered by Eritrea in the north, Ethiopia in the west and south, and Somalia in the southeast. The remainder of the border is formed by the Red Sea and the Gulf of Aden at the east. Are we justified in saying that \"It is bordered by a sea with a name that starts with an R\"? Yes, no, or maybe? Yes\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"Tricia and Eva are played by the same actress.\"? Yes, no, or maybe? Yes\n###\nRichard Colson Baker (born April 22, 1990), better known by his stage names MGK and Machine Gun Kelly, is an American rapper and actor, from Cleveland, Ohio. MGK embarked on a musical career as a teenager, releasing a mixtape in 2006. 
He went on to release four more mixtapes. Are we justified in saying that \"Richard Colson Baker was known for being gentle.\"? Yes, no, or maybe?", "doc_id": 489, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20214, 12781, 24772, 22092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Carlos Agravante Yap Sr. (born February 15, 1982) is a Filipino professional basketball player for the Rain or Shine Elasto Painters of the Philippine Basketball Association (PBA). Known by his nickname Big Game James, he had played for the Star Hotshots for twelve seasons winning seven PBA championships before being traded on 2016. He is also a twelve-time PBA All-Star through 2004 to 2015. Are we justified in saying that \"Yap had parents who were farmers.\"? Yes, no, or maybe? Maybe\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock. Are we justified in saying that \"Besides being known as a power chord or fifth chord, there are other names for this style of guitar playing. \"? Yes, no, or maybe? Maybe\n###\nPLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA). Are we justified in saying that \"PLU Crew consists of 25 members\"? Yes, no, or maybe? Maybe\n###\nA conjectural portrait is a portrait made of a historical figure for whom no authentic contemporary portrait is available. The depiction, then, may be variously informed by written accounts of physical appearance, conjecture based on the subject's culture and background, and/or the artist's conception of the subject's inner essence. Are we justified in saying that \"A conjectural portrait is more popular in Asia than in the West\"? Yes, no, or maybe? Maybe\n###\nSaid bin Salim Al Shaksy (Arabic: \u0633\u0639\u064a\u062f \u0628\u0646 \u0633\u0627\u0644\u0645 \u0627\u0644\u0634\u0642\u0635\u064a) (born Zanzibar in 1934 - 2015) was the founder and chairman of The Shaksy Group. Al Shaksy has been a member and Managing Director of several Joint-Stock Companies, including Al Bank Al Ahli Al Omani SAOG, Oman Fisheries Co. SAOG and Oman Hotels Co. SAOG. Are we justified in saying that \"Oman Fisheries Co was based in Suadi Arabia\"? 
Yes, no, or maybe?", "doc_id": 96, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2084, 23930, 31300, 27769], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996). Are we justified in saying that \"She was born in 1956\"? Yes, no, or maybe? Yes\n###\nRalph D. Malone (born January 12, 1964 in Huntsville, Alabama) was a National Football League player for the Cleveland Browns from 1986\u20131987, and was on the practice roster for the Los Angeles Raiders and the Miami Dolphins from 1986 to 1989. He played collegiately for the Georgia Tech football team. Are we justified in saying that \"Ralph D. Malone later became a sports announcer. \"? Yes, no, or maybe? Maybe\n###\nPlatylesches lamba, the Neave's banded hopper, is a butterfly in the Hesperiidae family. It is found in Ivory Coast, Ghana, Cameroon, the Democratic Republic of the Congo (Shaba), western Uganda, Malawi and northern Zambia. The habitat consists of woodland and open places in the forest zone. Are we justified in saying that \"Platylesches lamba, the Neave's banded hopper, is a butterfly in the Hesperiidae family. It is found in many parts of Africa. They like to live in forests.\"? Yes, no, or maybe? Maybe\n###\nTadpoles is the third album by the Bonzo Dog Band. It is largely a compilation of their work from the television show \"Do Not Adjust Your Set\", on which they were the house band. The US version of the album had a track list slightly different from that of the UK version: the US version removed \"I'm the Urban Spaceman\" and added \"Readymades\" the B-side of their follow-up single \"Mr. Apollo\". Are we justified in saying that \"The TV show, Do Not Adjust Your Set, had a lot of songs from the Bonzo Dog Band.\"? Yes, no, or maybe? Yes\n###\nLemoyne is an unincorporated community and census-designated place in northern Keith County, Nebraska, United States. It lies along Nebraska Highway 92 on the northern shore of Lake C.W. McConaughy, north of the city of Ogallala, the county seat of Keith County. Its elevation is 3,333\u00a0feet (1,016\u00a0m). Although Lemoyne is unincorporated, it has a post office, with the ZIP code of 69146. Are we justified in saying that \"Lemoyne is part of the census for statistical purposes.\"? Yes, no, or maybe?", "doc_id": 953, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11121, 26979, 32833, 21097], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "William Lang Denholm \"Bill\" McCue OBE (1934\u20131999) was a Scottish singer known for his performances in opera, musical theatre and traditional Scottish folk music. In 1982 he was awarded an OBE for his contribution to Scottish music. In 1999 he died aged 65. Are we justified in saying that \"Traditional Scottish folk music became very popular in nineteen hundred fifty five.\"? Yes, no, or maybe? Maybe\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. The film is also known as He Lived to Kill and Terror in the Night. Are we justified in saying that \"Benjamin Stoloff thought Bela Lugosi played the most important character in Night of Terror.\"? Yes, no, or maybe? Maybe\n###\nThe Nanking Massacre was an episode of mass murder and mass rape committed by Japanese troops against the residents of Nanjing (\"Nanking\"), then the capital of the Republic of China, during the Second Sino-Japanese War. The massacre is also known as the Rape of Nanking or, using Pinyin romanization, the Nanjing Massacre or Rape of Nanjing. Are we justified in saying that \"Nanjing is still the capital of the Republic of China\"? Yes, no, or maybe? Maybe\n###\nSwaay is the debut EP by American band DNCE. The EP was released worldwide on October 23, 2015, by Republic Records. The majority of the EP was co-produced and co-written by lead singer and frontman Joe Jonas. The EP debuted at number 39 on the US \"Billboard\" 200. Are we justified in saying that \"DNCE won many awards for the EP\"? Yes, no, or maybe? Maybe\n###\nCherry, Harry & Raquel! is a 1970 softcore exploitation film produced and directed by American film director Russ Meyer. Following the success of \"Vixen!\" (1968), the film is notable for the first appearance of actor (and Meyer regular) Charles Napier playing Harry Thompson, a California border sheriff and marijuana smuggler who makes a reappearance in 1975's \"Supervixens\". Are we justified in saying that \"Supervixens premiered before 1974.\"? Yes, no, or maybe?", "doc_id": 269, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12903, 28227, 27216, 39361], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Identification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\". Are we justified in saying that \"Jerzy Skolimowski has directed 10 other feature films, since Identification Marks: None.\"? Yes, no, or maybe? 
Maybe\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia. Are we justified in saying that \"The Big 12 Conference has the fewest members of any conference.\"? Yes, no, or maybe? Maybe\n###\nThe Protectorate of Bohemia and Moravia (German: \"Protektorat B\u00f6hmen und M\u00e4hren\" ; Czech: \"Protektor\u00e1t \u010cechy a Morava\" ) was a protectorate of Nazi Germany established following the German occupation of Czechoslovakia. Earlier in 1938, with the Munich Agreement, Sudetenland territory of Czech Lands was incorporated into Nazi Germany as a Reichsgau. Are we justified in saying that \"Germany was occupied by Czechoslovakia during the 1930's.\"? Yes, no, or maybe? No\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Broadway Rose has a short middle.\"? Yes, no, or maybe? Maybe\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation. Are we justified in saying that \"Paul Hausser was born in eighteen hundred eighty one.\"? Yes, no, or maybe?", "doc_id": 686, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13787, 5957, 18093, 1284], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Doomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent. Are we justified in saying that \"Doomsday Device was performed by bozo\"? Yes, no, or maybe? Maybe\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910. Are we justified in saying that \"Yi Bangja did not end up being the empress of the Empire of Korea.\"? Yes, no, or maybe? 
Yes\n###\nBarry and Stuart (Barry Jones and Stuart MacLeod) are two Scottish BAFTA nominated magicians and comedians whose work has been seen on television and on stage around the world. The double act are known for their comically dark performing style, for taking as inspiration the accounts of Biblical miracles and faking paranormal phenomena to form the basis for some of their illusions. Are we justified in saying that \"Barry and Stuart like each other\"? Yes, no, or maybe? Maybe\n###\nLuton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C. Are we justified in saying that \"Luton Town Ladies Football Club formed a partnership with Luton Town F.C. 3 years after they had been established.\"? Yes, no, or maybe? Yes\n###\nClub Deportivo Cajamadrid was a professional basketball and handball team in Spain. It was founded in 1979 and the basketball team played in Liga ACB from 1983 to 1986. The club was sponsored by Caja Madrid until 1991, when the bank decided to retire its support and continued as a different club called Juventud Alcal\u00e1. Are we justified in saying that \"caja madrid started sponshorship of Club Deportivo Cajamadrid in 1979.\"? Yes, no, or maybe?", "doc_id": 601, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36728, 31827, 24775, 31638], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Takoma Langley Crossroads Transit Center is a bus transit center in Langley Park, Maryland. It is at the intersection of University Boulevard and New Hampshire Avenue, and is the largest bus-only transfer in the Washington, D.C. metropolitan area. It is a future transfer point for the Purple Line. Are we justified in saying that \"The Purple Line will be an improvement to mass transit in the D.C. metropolitan area.\"? Yes, no, or maybe? Maybe\n###\nCorn crab soup is a dish found in Chinese cuisine, American Chinese cuisine, and Canadian Chinese cuisine. The soup is actually cream of corn soup with egg white and crab meat or imitation crab meat added. It is most likely of southern Chinese origin. Are we justified in saying that \"Corn crab soup is definitely of northern Chinese origin.\"? Yes, no, or maybe? No\n###\nPLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA). Are we justified in saying that \"PLU Crew has won many championshps\"? Yes, no, or maybe? Maybe\n###\nMadava Farms is an 800-acre certified organic maple syrup enterprise located primarily in Dover, New York. 
The farm is the maker of Crown Maple Syrup, and it is considered to be the largest maple syrup production facility in North America. Are we justified in saying that \"Madava Farms is at least 1 acre\"? Yes, no, or maybe? Yes\n###\nJay Ferguson (born John Arden Ferguson; May 10, 1947) is an American rock/pop musician, known for his work with Spirit and Jo Jo Gunne, and his 1978 solo hit \"Thunder Island\". His later career has been as a composer of music for television programs and films. Are we justified in saying that \"Jay Ferguson is friends with Spirit and Jo Jo Gunn\"? Yes, no, or maybe?", "doc_id": 964, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24774, 43027, 4695, 15421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "PLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA). Are we justified in saying that \"PLU Crew consists of 40 members\"? Yes, no, or maybe? Maybe\n###\nBad Company is the debut studio album by the English hard rock supergroup Bad Company. The album was recorded at Headley Grange with Ronnie Lane's Mobile Studio in November 1973, and it was the first album released on Led Zeppelin's Swan Song Records label. Are we justified in saying that \"Bad Company was recorded at Headley Grange with Ronnie Lane's Mobile Studio in month after Halloween in the year that equals 2073 minus 100.\"? Yes, no, or maybe? Yes\n###\nThe iHeartRadio Much Music Video Awards (also known as the MMVAs, and originally known as the Canadian Music Video Awards until 1995, and formerly and commonly known as the MuchMusic Video Awards) are annual awards presented by the Canadian television channel Much to honour the year's best music videos. Are we justified in saying that \"The MuchMusic Video Awards are held annually.\"? Yes, no, or maybe? Yes\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \"Chris Gardner wanted this movie to be made about his life. \"? Yes, no, or maybe? Maybe\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide. Are we justified in saying that \"Mutual Friends had 8 protagonists\"? 
Yes, no, or maybe?", "doc_id": 197, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26362, 9741, 5014, 25571], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Progress in Materials Science is a journal publishing review articles covering most areas of materials science, published by the Pergamon imprint of Elsevier. It was started in 1949 with the title \"Progress in Metal Physics\" with Bruce Chalmers serving as first editor. It was changed to the current title in 1961. Are we justified in saying that \"The Pergamon imprint evolved from Butterworth Springer but both are seeking the same audience.\"? Yes, no, or maybe? Maybe\n###\nPrince Louis Ferdinand Oskar Christian of Prussia (German: \"Louis Ferdinand Oskar Christian Prinz von Preu\u00dfen\"; 25 August 1944 \u2013 11 July 1977), also called Louis Ferdinand II or Louis Ferdinand Jr., nicknamed \"Lulu\", was a member of the House of Hohenzollern and the fifth of seven children of Prince Louis Ferdinand of Prussia and his wife, Grand Duchess Kira of Russia. Are we justified in saying that \"The Grand Duchess Kira of Russia was born on 25 August 1914.\"? Yes, no, or maybe? Maybe\n###\nAn experience point (often abbreviated to exp or XP) is a unit of measurement used in tabletop role-playing games (RPGs) and role-playing video games to quantify a player character's progression through the game. Experience points are generally awarded for the completion of quests, overcoming obstacles and opponents, and for successful role-playing. Are we justified in saying that \"XP is only used in tabletop games.\"? Yes, no, or maybe? No\n###\nTripoli Municipal Stadium is a 22,000 capacity multi-use stadium in Tripoli, Lebanon. It is located near the city center. It was recently rehabilitated to welcome Arabic competitions as well as Asian and International ones. It is also the home ground of Tripoli SC. Are we justified in saying that \"Recently, Tripoli Municipal Stadium has seen less revenue than in past years.\"? Yes, no, or maybe? Maybe\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"Yannis Philippakis was born in nineteen hundred seventy five.\"? Yes, no, or maybe?", "doc_id": 210, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6537, 7721, 10527, 25526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label. Are we justified in saying that \"Lance King started Nightmare in 1962\"? Yes, no, or maybe? No\n###\nResorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming. Are we justified in saying that \"Resorts Casino Tunica has gone through a couple name changes over the years.\"? Yes, no, or maybe? Yes\n###\n\"Emigrante del Mundo\" is the debut single of Lucenzo. It was released in France initially in 2007 and a second time in 2010 after the success of the kuduro music promoted by Lucenzo's new hits. It also appears in the debut album of Lucenzo of the same title \"Emigrante del Mundo\". Are we justified in saying that \"Lucenzo wrote more than one song.\"? Yes, no, or maybe? Yes\n###\nTripoli Municipal Stadium is a 22,000 capacity multi-use stadium in Tripoli, Lebanon. It is located near the city center. It was recently rehabilitated to welcome Arabic competitions as well as Asian and International ones. It is also the home ground of Tripoli SC. Are we justified in saying that \"The stadium began with an 11,000 capacity, but was later expanded to 22,000.\"? Yes, no, or maybe? Maybe\n###\nBoleslav William Felix Robert Sienkiewicz ( ; born May 3, 1958), better known as Bill Sienkiewicz, is an Eisner Award-winning American artist who produces comic book art, primarily for Marvel Comics' \"The New Mutants\" and \"\". Sienkiewicz often utilizes oil painting, collage, mimeograph, and other forms generally uncommon in comic books. Are we justified in saying that \"Boleslav William Felix Robert Sienkiewicz has won an Oscar award.\"? Yes, no, or maybe?", "doc_id": 182, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35853, 8724, 10616, 10421], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forestville Commonwealth is an archaeological site and national historic district located at Earlton in Greene County, New York. The district contains seven contributing sites. It represents the remains of a utopian community built in 1826-1827 as one of three Owenite experiments in New York State. Are we justified in saying that \"Forestville Commonwealth is one of four Owenite experiments in New York.\"? Yes, no, or maybe? 
No\n###\nAvani Modi is an Indian model and film actress, a well-known face in Indian movies and theatre plays in Gujarati theatre She made her Bollywood debut in Madhur Bhandarkar's drama film \"Calendar Girls\", which is scheduled to release on 25 September 2015. The movie is based upon the story of five girls and their journey as an annual calendar girl. Are we justified in saying that \"Calendar Girls only has five actresses in it.\"? Yes, no, or maybe? Maybe\n###\nVladislav Adolfovitch Rusanov (Russian: \u0412\u043b\u0430\u0434\u0438\u0441\u043b\u0430\u0432 \u0410\u0434\u043e\u043b\u044c\u0444\u043e\u0432\u0438\u0447 \u0420\u0443\u0441\u0430\u043d\u043e\u0432 ) is a fantasy writer, candidate of technical sciences (1999). Writes in Russian language. Also is known for translations of fantasy and romantic poetry into Russian. Formerly a Ukrainian citizen he now identifies with the Donetsk People's Republic. Are we justified in saying that \"Rusanov is Russian.\"? Yes, no, or maybe? Yes\n###\nEdwin John Ellis (1848 \u2013 1916) was a British poet and illustrator. He is now remembered mostly for the three-volume collection of the works of William Blake he edited with W. B. Yeats. It is now criticised, however, for weak scholarship, and preconceptions. Are we justified in saying that \"Edwin John Ellis died of consumption.\"? Yes, no, or maybe? Maybe\n###\nThe 2016 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the second edition of the tournament which was part of the 2016 ATP Challenger Tour. It took place in Happy Valley, Australia between 2\u201310 January 2016. Are we justified in saying that \"A professional tennis tournament played on soft courts took place in Australia.\"? Yes, no, or maybe?", "doc_id": 930, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7184, 13567, 39570, 39949], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pixote: a Lei do Mais Fraco (] , lit. \"Pixote (small child): The Law of the Weak\") is a 1980 Brazilian drama film directed by H\u00e9ctor Babenco. The screenplay was written by Babenco and Jorge Dur\u00e1n, based on the book \"A Inf\u00e2ncia dos Mortos\" (\"The Childhood of the Dead Ones\") by Jos\u00e9 Louzeiro. Are we justified in saying that \" the book \"A Inf\u00e2ncia dos Mortos\"is translated to mean \"The Childhood of the Dead Ones\"\"? Yes, no, or maybe? Yes\n###\nAjay Nagrath is an Indian television and movie actor and is the son of Bollywood actor Anil Nagrath. Currently, he plays the role of \"Pankaj\" in C.I.D. He has done many roles in many TV shows and even films, but there came a point in his life when he was unhappy that his weight had become his identity in the industry. He said \"I used to be a couch potato.\" Are we justified in saying that \"Ajay Nagrath was known by his weight\"? Yes, no, or maybe? Yes\n###\nThe Brown Spectator is a student-run journal of conservative and libertarian political writing at Brown University. It was originally the product of a student independent project. 
It was first published in 1984 \"as a two-page offering of student writing on brightly colored paper\". Are we justified in saying that \"The Brown Spectator has very few libertarian readers.\"? Yes, no, or maybe? Maybe\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician. Are we justified in saying that \"amor a la mexicana is the spanish way of mexican-syle love\"? Yes, no, or maybe? Maybe\n###\nEmmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\". Are we justified in saying that \"Frechette won the Canadian Screen Award for Best Art Direction or Production Design at the 3rd Canadian Screen Awards.\"? Yes, no, or maybe?", "doc_id": 59, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3863, 4811, 33534, 23938], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hidden City Entertainment was a game publisher founded in 2004 (as Hidden City Games, Inc.) by Jesper Myrfors and Paul Peterson to develop and market the chip-throwing game, \"Clout Fantasy.\" After Clout was developed the company recruited Peter Adkison as CEO. Are we justified in saying that \"Hidden City Entertainment was founded in 2001\"? Yes, no, or maybe? No\n###\nCarlo's Bake Shop, commonly known as Carlo's Bakery and also known as Carlo's City Hall Bake Shop, is a bakery in Hoboken, New Jersey, owned by Buddy Valastro. The bakery has gained public attention as the setting of the TLC television series, \"Cake Boss\". Are we justified in saying that \"The TLC television series Cake Boss takes place in Missouri.\"? Yes, no, or maybe? No\n###\nMy Super D is a 2016 Philippine superhero fantasy drama television series directed by Frasco Mortiz and Lino Cayetano, starring Dominic Ochoa in his first leading role, together with Marco Masa and Bianca Manalo. The series was aired on ABS-CBN's \"Primetime Bida\" evening block and worldwide on The Filipino Channel from April 18, 2016 to July 15, 2016, replacing \"Game ng Bayan\". Are we justified in saying that \"Marco Masa has worked with Frasco Mortiz.\"? Yes, no, or maybe? Yes\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005. Are we justified in saying that \"Samuel Eto'o Fils was born in nineteen hundred eighty seven.\"? Yes, no, or maybe? 
No\n###\nThe Achilles Club is a track and field club formed in 1920 by and for past and present representatives of Oxford and Cambridge Universities. Members have won 19 Olympic Gold Medals (most recently Steph Cook in the pentathlon), and held 38 World Records. One of its founding members was Evelyn Aubrey Montague, who is immortalized in the 1981 film \"Chariots of Fire\". Are we justified in saying that \"The Steph Cook is an Olympic Gold Medal in the pentathlon.\"? Yes, no, or maybe?", "doc_id": 548, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13844, 34597, 33805, 43778], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Van Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor. Are we justified in saying that \"Van Cleef & Arpels make more flower themed objects than they do animal themed objects.\"? Yes, no, or maybe? Maybe\n###\nCastle Rock Estate is an Australian winery based at Porongurup, in the Great Southern wine region of Western Australia. According to prominent Australian wine writer James Halliday, it has an exceptionally beautifully sited and immaculately maintained vineyard, winery and cellar door sales area with sweeping vistas from the Porongurups. Are we justified in saying that \"Castle Rock Estate grows grapes\"? Yes, no, or maybe? Yes\n###\nRuth Gentry (February 22, 1862 \u2013 October 18, 1917) was a pioneering American woman mathematician during the late 19th century and the beginning of the 20th century. She was the first native-born Indiana woman to acquire a PhD degree in mathematics, and most likely the first woman born in Indiana to receive a doctoral degree in any scientific discipline. Are we justified in saying that \"More women went to college for degrees in math because of Ruth.\"? Yes, no, or maybe? Maybe\n###\nAdenanthos terminalis, commonly known as gland flower, yellow gland flower or adenanthos, is a one metre tall shrub in the family Proteaceae. It is found in south eastern regions of Australia, in the states of South Australia and Victoria, and is the most widespread of the two \"Adenanthos\" species occurring outside of Western Australia. Are we justified in saying that \"Adenanthos terminalis is in the family protaceae\"? Yes, no, or maybe? Yes\n###\nThe Bowling Green Falcons men's basketball team is the basketball team that represent Bowling Green State University in Bowling Green, Ohio. The school's team currently competes in the Mid-American Conference. The team last played in the NCAA Division I Men's Basketball Tournament in 1968. The Falcons are now coached by Michael Huger, their 17th head coach. Are we justified in saying that \"The Falcons appeared in the NCAA Division I Men's Basketball Tournament after 1968.\"? 
Yes, no, or maybe?", "doc_id": 272, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34433, 19605, 7841, 44995], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ethan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films. Are we justified in saying that \"Ethan Suplee was born in 1976. \"? Yes, no, or maybe? Yes\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Petasites is a type of dog\"? Yes, no, or maybe? No\n###\nWe Joined the Navy is a 1962 British comedy film produced by Daniel M. Angel and directed by Wendy Toye which stars Kenneth More, Lloyd Nolan, Joan O'Brien, Derek Fowlds, Graham Crowden, Esma Cannon and John Le Mesurier. It was based on the novel of the same name by John Winton. Are we justified in saying that \"We Joined the Navy has 69 actors in it.\"? Yes, no, or maybe? Maybe\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc. Are we justified in saying that \"there are 6 people per household on average\"? Yes, no, or maybe? No\n###\nOld Carthusians Football Club is an association football club whose players are former pupils of Charterhouse School in Godalming, Surrey, England. The club was established in 1876 and won the FA Cup in 1881, as well as the FA Amateur Cup in 1894 and 1897. The club currently plays in the Arthurian League and won league and Arthur Dunn Cup doubles in 2006, 2008, 2009, 2011, 2013 and 2014. Are we justified in saying that \"The Club did not play in the Arthurian League in 2005.\"? Yes, no, or maybe?", "doc_id": 415, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16046, 1162, 35009, 31717], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern. Are we justified in saying that \"McKinnon started Little League football as a running back.\"? Yes, no, or maybe? Maybe\n###\nThe Tuancheng Fortress or Tuan Cheng Fortress (Chinese:\u00a0\u00a0\u5718\u57ce\u6f14\u6b66\u5ef3 , \u00a0\u56e2\u57ce\u6f14\u6b66\u5385 , \u00a0\"Tu\u00e1nch\u00e9ng Y\u01cenw\u01d4t\u012bng\", \u00a0\"Round Wall Fortress\") is a historic 18th-century fortress located near the Fragrant Hills in the Haidian District of Beijing, China. Today, the fortress is a national museum and is also known as the Tuancheng Exhibition Hall. Are we justified in saying that \"There are a lot of artifacts in the museum.\"? Yes, no, or maybe? Maybe\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"Juan Cruz Aragone was ranked #5 in the world.\"? Yes, no, or maybe? Maybe\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staunton Mall shoppers are mostly poor people\"? Yes, no, or maybe? Maybe\n###\nOn July 16, 2009, Harvard University professor Henry Louis Gates Jr. was arrested at his Cambridge, Massachusetts home by local police officer Sgt. James Crowley, who was responding to a 9-1-1 caller's report of men breaking and entering the residence. The arrest initiated a series of events that unfolded under the spotlight of the international news media. Are we justified in saying that \"Henry Louis Gates Jr is a teacher\"? Yes, no, or maybe?", "doc_id": 326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42493, 22894, 9412, 15484], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Utamaro and His Five Women or Five Women Around Utamaro (Japanese: \u6b4c\u9ebf\u3092\u3081\u3050\u308b\u4e94\u4eba\u306e\u5973 , Hepburn: Utamaro o meguru gonin no onna ) is a 1946 Japanese film directed by Kenji Mizoguchi. It is based on the novel of the same title by Kanji Kunieda, itself a fictionalized account of the life of printmaker Kitagawa Utamaro. 
It was Mizoguchi's first film made under the American occupation. Are we justified in saying that \"This film is a biography.\"? Yes, no, or maybe? No\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"One of the counties that got the album starts with an A\"? Yes, no, or maybe? Yes\n###\nMy Famous Family is a British television programme on genealogy, co-hosted by Bill Oddie and Guy de la B\u00e9doy\u00e8re. Each episode shows an ordinary member of the public with a famous ancestor: Queen Victoria, Florence Nightingale, George Stephenson, Lawrence of Arabia, or the Duke of Wellington. Are we justified in saying that \"Florence Nightgale was covered by Guy de la B\u00e9doy\u00e8re in My Famous Family.\"? Yes, no, or maybe? Yes\n###\n\"Se Telefonando\" is a song performed by the Italian singer Mina, released in May 1966. The music was composed, orchestrated and conducted by Ennio Morricone to Italian lyrics by Di Chiara and Costanzo. (Reportedly Costanzo only contributed one word, in editing a previous version of a verse, to avoid censorship). The song was written for a radio broadcast, called \u201cAria condizionata\u201d. Are we justified in saying that \"The Song was only sang in italian (never had a cover in a different language)\"? Yes, no, or maybe? Maybe\n###\nNana Kwame Abrokwa (born 5 October 1968) is a Ghanaian born German rapper and DJ, performing under the pseudonyms Nana or Darkman / Nana. Nana is not an actual first name, but a Ghanaian title of nobility. His most important achievement came in the second half of the 1990s, when his style was characterized as \"euro-rap\". Are we justified in saying that \"Ghana produced a notable euro-rapper in 1968.\"? Yes, no, or maybe?", "doc_id": 225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8149, 14500, 4257, 39359], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "1 vs. 100 is an Australian spin-off game show based on the American version of the same name and the original Dutch version created by Endemol. The game pits one person against 100 others for a chance to win one million dollars. The program is hosted by former Nine Network CEO and personality Eddie McGuire. Are we justified in saying that \"This game is the most watched game in Australia \"? Yes, no, or maybe? Maybe\n###\nSoul Ballet is a musical project of actor, producer, arranger, programmer, and multi-instrumentalist Rick Kelly \"RK.\" Soul Ballet\u2019s music is smooth contemporary jazz/electronica, characterized as pulsating electronic beats entwined with a dark, moody atmosphere. Are we justified in saying that \"Rich Kelly plays the guitar.\"? Yes, no, or maybe? 
Maybe\n###\nPrincess Antoinette of Monaco, Baroness of Massy (Antoinette Louise Alberte Suzanne Grimaldi; 28 December 1920 \u2013 18 March 2011) was a member of the princely family of Monaco and the elder sister of Prince Rainier III and aunt of Albert II, Prince of Monaco. Her parents were Count Pierre de Polignac and Princess Charlotte, Duchess of Valentinois. Are we justified in saying that \"Count Pierre de Polignac died on December 28th, 1921.\"? Yes, no, or maybe? Maybe\n###\nDr. Jeckyll & Mr. Hyde was an American 1980s hip-hop group consisting of Andre \"Dr. Jeckyll\" Harrell and Alonzo \"Mr. Hyde\" Brown. The group was known for its corporate business image, wearing designer suits and ties while they rapped. The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records in 1980. Are we justified in saying that \"Dr. Jeckyll & Mr. Hyde consisted of more than 1 member.\"? Yes, no, or maybe? Yes\n###\nSamuel Bronston (Samuel Bronshtein, March 26, 1908, Bessarabia \u2013 January 12, 1994, Sacramento, California) was a Bessarabian-born American film producer, film director, and a nephew of socialist revolutionary figure, Leon Trotsky. He was also the petitioner in a U.S. Supreme Court case that set a major precedent for perjury prosecutions when it overturned his conviction. Are we justified in saying that \"Samuel Bronston never met his uncle, Leon Trotsky.\"? Yes, no, or maybe?", "doc_id": 377, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24904, 876, 42600, 9296], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Studies in Mutualist Political Economy is a book on political economy published on 2007 by American mutualist anarchist Kevin Carson. In its preface Carson describes this work as \"an attempt to revive individualist anarchist political economy, to incorporate the useful developments of the last hundred years, and to make it relevant to the problems of the twenty-first century.\" Are we justified in saying that \"Studies in Mutualist Political Economy was written by a anarchist \"? Yes, no, or maybe? Yes\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan. Are we justified in saying that \"New York's hockey team was called the Islanders\"? Yes, no, or maybe? Yes\n###\nDarren Horrigan (born 2 June 1983) is an English footballer who played in the Football League for Lincoln City. A goalkeeper born in Middlesbrough, Horrigan began his career with Birmingham City, and went on to play non-League football for clubs including Stamford Town, Cambridge City, Ilkeston Town, Spennymoor United, Scarborough, Gateshead, Bishop Auckland and Tow Law Town. Are we justified in saying that \"Darren Horrigan is from England.\"? Yes, no, or maybe? 
Yes\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. Are we justified in saying that \"The people of Keystone have a wonderful view of the river.\"? Yes, no, or maybe? Maybe\n###\nAlexandre \"Xande\" Ribeiro (born January 20, 1981 in Manaus-Amazonas, Brazil), is a Brazilian Jiu-Jitsu practitioner, mixed martial artist and submission wrestler. He is a two-time World (Mundial) Black Belt Absolute (open weight) World Jiu-Jitsu Champion, five-time World (Mundial) Black Belt Heavy Weight Champion, and three-time World Black Belt Pro Division Champion. Are we justified in saying that \"Alexandre \"Xande\" Ribeiro is 38 Years old\"? Yes, no, or maybe?", "doc_id": 935, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39131, 40876, 35834, 8829], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. Are we justified in saying that \"That Girl Lucy Moon was inspired by Amy's daughter.\"? Yes, no, or maybe? Maybe\n###\nThe Blackstone Chronicles is a serialized novel by American horror and suspense author John Saul. The series consists of six installments and takes place in a fictional New Hampshire town called Blackstone. The series has been adapted into both a computer game and graphic novel. Are we justified in saying that \"New Hampshire has a town called Blackstone. \"? Yes, no, or maybe? No\n###\nGrant Taylor (Born October 30,1991) is an American professional skateboarder. He is the son of former professional skateboarder Thomas Taylor and won Thrasher Magazine's \"Skater of The Year\" in 2011. Grant\u2019s style of skateboarding is known to be fast and powerful. He is recognized for his unique versatile skateboarding. Are we justified in saying that \"Grant Thomas will teach his son to skateboard.\"? Yes, no, or maybe? Maybe\n###\nStanley Anthony Woods (born October 11, 1965) is a former professional American football linebacker and defensive end in the National Football League (NFL) who played for the Seattle Seahawks from 1987 to 1992, as well as the Los Angeles Rams and the Washington Redskins. He played college football at the University of Pittsburgh. Are we justified in saying that \"Woods was also an elite baseball pitcher.\"? Yes, no, or maybe? 
Maybe\n###\nThe MAV-1 (Maneuvering Air Vehicle) is a low observable Unmanned Air Vehicle prototype developed between ST Aerospace and Defence Science and Technology Agency for its swarming unmanned air vehicle research programme. The prototype was unveiled in Asian Aerospace 2004 and the first test flight was reported in 2005. Are we justified in saying that \"The MAV-1 is controlled by a person.\"? Yes, no, or maybe?", "doc_id": 506, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17066, 7111, 42954, 42158], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jara, also known as Jera, is a Nigerian language reported to be spoken by 46,000 people in 2000. It is spoken in Borno and Gombe States, in the Biu, Kwaya-Kusar, Akko, and Yamaltu-Deba LGAs. It is an Afro-Asiatic language, in the Biu\u2013Mandara branch of Chadic family. Use of Jara is declining; it is being displaced by Fulfulde and Hausa. Are we justified in saying that \"Use of Jara is increasing and is replacing other Nigerian languages\"? Yes, no, or maybe? No\n###\nPhacelia mutabilis is a species of flowering plant in the borage family known by the common name changeable phacelia. It is native to the western United States and Baja California, where it can be found in mountains and foothills, in forested and open habitat types, and deserts. Are we justified in saying that \"Borage is native to Arizona.\"? Yes, no, or maybe? Maybe\n###\nMaryborough Airport (IATA: MBH,\u00a0ICAO: YMYB) is located approximately 3 km north of the town centre. The airport serves as a small regional airport serving Maryborough and Rainbow Bay. However, increasing competition with Hervey Bay Airport has led to a decrease in commercial air traffic. Are we justified in saying that \"Maryborough Airport is in New Zealand.\"? Yes, no, or maybe? Maybe\n###\nGulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport. Are we justified in saying that \"Gulf Air services only 41 destinations\"? Yes, no, or maybe? Yes\n###\n\"Pour Me\" is a debut song recorded by American country music group Trick Pony. It was released in October 2000 as the first single from their debut album \"Trick Pony\". The song was written by group members Keith Burns, Ira Dean and Heidi Newfield with Rory Waters Beighley and Sammy Harp Wedlock. Are we justified in saying that \"Trick Pony is the name of an American country music group, their debut album and one of their songs.\"? 
Yes, no, or maybe?", "doc_id": 293, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19005, 16500, 10799, 8713], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Otard, also known as Chateau de Cognac, is a French cognac house founded in 1795 by Jean-Baptiste Antoine Otard. The company has remained in the hands of the same family since its establishment. The firm is based in the Ch\u00e2teau des Valois (Ch\u00e2teau de Cognac), Cognac, Charente, its home since 1796. Are we justified in saying that \"Otard was founded five years prior to 1800.\"? Yes, no, or maybe? Yes\n###\nTo Drown A Rose is a single by Death in June. Additional music performers include: Christ 777, Douglas P., Gary Carey, Jan O', John Balance, Rose McDowall. The vinyl has the phrases \"Our time has been...\" and \"...and will be again\" scratched into it. The test pressing for this release was done on 12\" vinyl as opposed to the finalized 10\" format. Are we justified in saying that \"To Drown A Rose is known by millions.\"? Yes, no, or maybe? Maybe\n###\nRuth Pryor (1906-2001) was a Chicago ballet dancer and instructor, and the first American ballerina to dance the role of the Swan Queen in Swan Lake, in 1930. She was known for \"her feat of whirling thirty-six times a minute on her toes,\" according to the Purple Parrot of Northwestern University. Are we justified in saying that \"Ruth Pryor was married \"? Yes, no, or maybe? Maybe\n###\nRodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets. Are we justified in saying that \"Rodrequis La'Vant Stephens was the highest paid player when he was on the Seattle Seahawks\"? Yes, no, or maybe? Maybe\n###\nLloyd Cole, also known as The X Album, is the debut solo album by English singer, songwriter and musician Lloyd Cole released on February 21, 1990 by Polydor and Capitol. Previously Cole was best known for his work with The Commotions but this album marked a departure from their signature sound and an opportunity for him to collaborate with other musicians and explore new ideas. Are we justified in saying that \"Lloyd Cole is also the name of more than one person\"? Yes, no, or maybe?", "doc_id": 712, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19700, 35384, 10530, 30631], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ashcroft is a historic home located at Geneva in Ontario County, New York. 
It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"Ashcroft is in the USA.\"? Yes, no, or maybe? Yes\n###\nBrown University is a private Ivy League research university in Providence, Rhode Island, United States. Founded in 1764 as the College in the English Colony of Rhode Island and Providence Plantations, Brown is the seventh-oldest institution of higher education in the United States and one of the nine colonial colleges chartered before the American Revolution. Are we justified in saying that \"Brown University has a debate team\"? Yes, no, or maybe? Maybe\n###\nThe Return of the Condor Heroes, also called The Giant Eagle and Its Companion, is a wuxia novel by Jin Yong (Louis Cha). It is the second part of the \"Condor Trilogy\" and was preceded by \"The Legend of the Condor Heroes\" and followed by \"The Heaven Sword and Dragon Saber\". It was first serialised between 20 May 1959 and 5 July 1961 in the Hong Kong newspaper \"Ming Pao\". Are we justified in saying that \"It was the second wuxia novel ever written\"? Yes, no, or maybe? Maybe\n###\nThe Sierra Leone Civil War (1991\u20132002) began on 23 March 1991 when the Revolutionary United Front (RUF), with support from the special forces of Charles Taylor\u2019s National Patriotic Front of Liberia (NPFL), intervened in Sierra Leone in an attempt to overthrow the Joseph Momoh government. The resulting civil war lasted 11 years, enveloped the country, and left over 50,000 dead. Are we justified in saying that \"The Sierra Leone Civil War didn't accomplish anything.\"? Yes, no, or maybe? Maybe\n###\nThe Corridor (Lithuanian: Koridorius ) is a 1995 Lithuanian drama film directed by \u0160ar\u016bnas Bartas. It has a fragmentary narrative without dialogue and depicts several people in Vilnius. According to the director, the title symbolizes \"the atmosphere of a corridor between yesterday and today, containing many doors\". Are we justified in saying that \"Vilnius symbolizes \"the atmosphere of a corridor between yesterday and today, containing many doors\".\"? Yes, no, or maybe?", "doc_id": 235, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36299, 10585, 6055, 7772], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Exergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid. Are we justified in saying that \"Exergonix Inc, is an energy storage company based in Kansas City, Missouri that develops and builds renewable energy solutions for a wide range of applications. \"? Yes, no, or maybe? 
Maybe\n###\nIrfan Khoosat (Urdu: \u0639\u0631\u0641\u0627\u0646 \u06a9\u06be\u0648\u0633\u0679\u200e ) is a Pakistani actor, producer and a well-known comedian. He is famous for his comic role as \"Hawaldar Karamdad\" in the TV series Andhera Ujala in which he portrayed simpleton and blabbermouth character of a low-ranked policeman. He is also known as stage comedian. He also won Nigar Award for his comic role in 1985 film \"Hum se hai zamana\". Are we justified in saying that \"Irfan will continue his career until 2020.\"? Yes, no, or maybe? Maybe\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound. Are we justified in saying that \"German dogs are good scent dogs.\"? Yes, no, or maybe? Maybe\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg. Are we justified in saying that \"The swan diagram is an outdated model.\"? Yes, no, or maybe? Maybe\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013. Are we justified in saying that \"Brandon Tyler McManus met with Amy.\"? Yes, no, or maybe?", "doc_id": 110, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17236, 8438, 2513, 42031], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records. Are we justified in saying that \"Weezer is a Chinese rock band\"? Yes, no, or maybe? No\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals. Are we justified in saying that \"Udinese Calcio finished first place in Serie A\"? Yes, no, or maybe? No\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. 
The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others. Are we justified in saying that \"The film was produced by Joseph Lovett.\"? Yes, no, or maybe? Maybe\n###\nFaer\u00fbn is a fictional subcontinent, the primary setting of the \"Dungeons & Dragons\" world of \"Forgotten Realms\". It is described in detail in the \"Forgotten Realms Campaign Setting\" (2001) from Wizards of the Coast, and various locales and aspects are described in more depth in separate campaign setting books. Around a hundred novels and several computer and video games use the Faer\u00fbn setting. Are we justified in saying that \"The inconsequential place of Faerun is talked about sometimes in the D&D lore\"? Yes, no, or maybe? Maybe\n###\nDonaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville. Are we justified in saying that \"Donaldson Center Airport is located in South Carolina, six kilometers south of the central business district of Greenville.\"? Yes, no, or maybe?", "doc_id": 35, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3964, 3577, 6349, 7501], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Eric Black is an American journalist. He was a longtime reporter for the Minnesota Star Tribune newspaper, and has also been a Twin Cities blogger. He is a columnist for online newspaper MinnPost, primarily writing about politics and the historical background of current issues. Are we justified in saying that \"Eric Black has been employed by a company\"? Yes, no, or maybe? Yes\n###\nSt. Ives Town F.C. is a football club based in St Ives, Cambridgeshire, England. They play in the Southern League Premier Division. This St Ives Town should not be confused with the Cornwall Combination team playing in St Ives, Cornwall, which is also called St Ives Town F.C. Are we justified in saying that \"St. Ives Town F.C. has never won a championship\"? Yes, no, or maybe? Maybe\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes. Are we justified in saying that \"Hook, Line and Sinker is an Australian fishing television program which runs for 60 minutes\"? Yes, no, or maybe? No\n###\nThe Stranger Left No Card (1952) is a British short film directed by Wendy Toye. 
The film won the Best Fiction award at the 1953 Cannes Film Festival, where it was described as \"a masterpiece\" by Jean Cocteau. It marked the film debut of actor Alan Badel. Are we justified in saying that \"The short film aired in the very early 1950's\"? Yes, no, or maybe? Yes\n###\nSergeant Alistair Slater, MM (25 July 1956 \u2013 2 December 1984), was a British Army soldier who served in B Squadron, Air (7) Troop, 22 Special Air Service (SAS), who was killed on 2 December 1984 while on operations against the Provisional Irish Republican Army in Kesh, a village in County Fermanagh in Northern Ireland. Are we justified in saying that \"Slater was killed by the Provisional Irish Republican Army\"? Yes, no, or maybe?", "doc_id": 613, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39673, 38599, 24591, 12258], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"It is sunny outside\"? Yes, no, or maybe? Maybe\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Augusta County is in the middle of the United States. \"? Yes, no, or maybe? Maybe\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph Ernst Friedrich von Forcade de Biaix was a good speaker.\"? Yes, no, or maybe? Maybe\n###\nJoel Rueben Madden (born Joel Rueben Combs; March 11, 1979) is the lead vocalist for the American pop punk band Good Charlotte, as well as a record producer, actor, DJ, and UNICEF Goodwill Ambassador. He is also part of the pop rock collaboration The Madden Brothers with his twin brother Benji Madden. Are we justified in saying that \"Joel Madden has used the same last name his whole life\"? Yes, no, or maybe? No\n###\nMax Carver (born Robert Maxwell Martensen Jr; August 1, 1988) is an American actor. He is known for his role as Preston Scavo in the ABC television series \"Desperate Housewives\", and as Aiden on the MTV teen-horror drama \"Teen Wolf\". He starred in the first season of the HBO series \"The Leftovers\". His twin brother Charlie Carver portrayed the twin of his characters in all three shows. Are we justified in saying that \"He was born in the century before the current one\"? 
Yes, no, or maybe?", "doc_id": 881, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27888, 16220, 41666, 26102], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Burnaston is a village and civil parish in the South Derbyshire district of Derbyshire, England, just south-west of the city of Derby. The population of the civil parish at the 2011 Census was 1,531. The village has swift and direct road links with nearby cities Derby and Nottingham, as well as the city of Birmingham that is some forty miles south along the A38 dual carriageway. Are we justified in saying that \"Burnaston is just north-west of the city of Derby. \"? Yes, no, or maybe? No\n###\nJames Carlos Agravante Yap Sr. (born February 15, 1982) is a Filipino professional basketball player for the Rain or Shine Elasto Painters of the Philippine Basketball Association (PBA). Known by his nickname Big Game James, he had played for the Star Hotshots for twelve seasons winning seven PBA championships before being traded on 2016. He is also a twelve-time PBA All-Star through 2004 to 2015. Are we justified in saying that \"Big Game James is a 12 time PBA All-star.\"? Yes, no, or maybe? Yes\n###\nSkaneateles ( or ) is an affluent village in the town of Skaneateles, Onondaga County, New York, United States. The village is named from and located on the shores of Skaneateles Lake, one of the Finger Lakes. As of the 2010 census, the village had a population of 2,450 residents. Are we justified in saying that \"Skaneateles has a population of almost 2500 people\"? Yes, no, or maybe? Yes\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys. Are we justified in saying that \"Scott Edward Morriss was born in 1980\"? Yes, no, or maybe? No\n###\nBullitt East High School is a high school located at 11450 Highway 44 East in the city of Mount Washington, Kentucky. It is part of the Bullitt County Public Schools district. Sports teams include: Archery, Swimming, Football, Soccer, Tennis, Track and Field, Baseball, Softball, Wrestling, Basketball, Volleyball and Cheerleading. Are we justified in saying that \"Bullitt East High School is not in Washington.\"? Yes, no, or maybe?", "doc_id": 7, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41671, 37368, 118, 3161], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Glacier retreat or glacial retreat is type of glacial motion discussed in several articles, depending on the time frame of interest, and whether the climatological process or individual glaciers are being considered. Articles on these topics include: Are we justified in saying that \"Glacier retreat is not explained in this articles.\"? Yes, no, or maybe? Maybe\n###\nAirline Captain Irene Koki Mutungi, commonly known as Koki Mutungi, is a professional pilot in Kenya, the largest economy in the East African Community. She was the first female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft. She flies for Kenya Airways, the national airline of Kenya. Are we justified in saying that \"Koki Mutungi was born in Kenya and is the fist female on the African continent to become certified as a Captain of the Boeing 787 \"Dreamliner\" aircraft.\"? Yes, no, or maybe? Maybe\n###\nKimberly Beck (born January 9, 1956) is a former American actress and model. She is best known for her role as Trish Jarvis in Joseph Zito's \"\" (1984). Her other film roles include Alfred Hitchcock's \"Marnie\" (1964), Luc Besson's \"The Big Blue\" (1988), George T. Miller's \"Frozen Assets\" (1992), and Roland Emmerich's \"Independence Day\" (1996). Are we justified in saying that \"Kimberly was the star actress in \"The Big Blue\".\"? Yes, no, or maybe? Maybe\n###\nMarie Hedwig Auguste of Sulzbach (German: \"Marie Hedwig Auguste von Sulzbach\" ; born: 15 April 1650 in Sulzbach; died: 23 November 1681 in Hamburg) was a Countess Palatine of Sulzbach by birth and by marriage, Archduchess of Austria and by her second marriage, Duchess of Saxe-Lauenburg. Are we justified in saying that \"Marie Hedwig Aususte was Archduchess of Saxe-Lauenburg.\"? Yes, no, or maybe? No\n###\nNewtrament is a musician, MC and DJ known for releasing an early UK electro/hip hop record - \"London Bridge is Falling Down\" - on Jive Records. It was based on the nursery rhyme (previously adapted by the reggae group Culture) with a political message that electoral politics were a sham. Are we justified in saying that \"newtrament didn't write london bridge is falling down\"? Yes, no, or maybe?", "doc_id": 849, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21566, 3444, 41416, 26779], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arturo Guzm\u00e1n Decena (a.k.a. Z-1) (13 January 1976 \u2013 21 November 2002) was a Mexican Army Special Forces operative who in 1997 defected to the Gulf Cartel and subsequently founded the criminal syndicate's enforcement wing at the behest of drug baron Osiel C\u00e1rdenas Guill\u00e9n. Known today as Los Zetas, the cartel's armed wing ultimately broke apart and formed its own drug trafficking organization. 
Are we justified in saying that \"arturo guzman decena is best known for Z-1 for his gun knowledge\"? Yes, no, or maybe? Maybe\n###\nDiablo is a 2015 Canadian-American psychological western film co-written and directed by Lawrence Roeck and starring Scott Eastwood, Walton Goggins, Camilla Belle and Danny Glover. It was the first Western starring Eastwood, the son of Western icon Clint Eastwood. Are we justified in saying that \"2015 movie Diablo starred Clint Eastwood.\"? Yes, no, or maybe? No\n###\nLegoland Discovery Center Dallas Fort Worth is an indoor family entertainment center located at Grapevine Mills mall in Grapevine, Texas, which is situated between the cities of Dallas and Fort Worth, Texas. The attraction includes Lego-theme rides, a soft play area, a 4D cinema and a gift shop. The center is owned and operated by British leisure group Merlin Entertainments. Are we justified in saying that \"The most prevalent injury at Legoland Discovery Center is sunburns.\"? Yes, no, or maybe? Maybe\n###\nThe 1919 PGA Championship was the second PGA Championship, which is now considered one of golf's major championships. It was held September 16\u201320 at the Engineers Country Club in Roslyn Harbor, New York, east of New York City on Long Island in Nassau County. Are we justified in saying that \"The second PGA Championship was held after the end of World War I.\"? Yes, no, or maybe? Yes\n###\nThe 2007 Internazionali BNL d'Italia was the 2007 edition of the Rome Masters tennis tournament. The men's tournament was part of the 2007 ATP Masters Series and was held on May 5-13. The women's event was a 2007 WTA Tier I Series event and was held on May 13-20. Are we justified in saying that \"The Rome Masters tennis tournament was not held in 2008.\"? Yes, no, or maybe?", "doc_id": 206, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19115, 11886, 39079, 21245], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Justin Tinucci is an American actor, musician and professional indoor skydiver who is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Jason. He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2017. Are we justified in saying that \"Justin Tinucci is an American actor, He will appear in an upcoming Sony Pictures Home Entertainment film called Devil's Whisper in 2015. he is best known for his roles on \"Incredible Crew\" as a recurring guest star, iCarly, Big Love, Trophy Wife and the Netflix show Lady Dynamite where he plays Sean.\"? Yes, no, or maybe? No\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award. Are we justified in saying that \"Rachel Brosnahan has never moved her lips.\"? Yes, no, or maybe? 
No\n###\nKew Bridge railway station is a railway station in Brentford and Gunnersbury, in the London Borough of Hounslow, and is in Travelcard Zone 3. The station and all trains serving it are operated by South Western Railway. The station was named after the nearby Kew Bridge. Are we justified in saying that \"Kew Bridge railway station is very modern.\"? Yes, no, or maybe? Maybe\n###\nMaps is the debut release by Ohio/Detroit-based pop punk band Mixtapes. The album was recorded over one weekend and released for free on Death to False Hope Records. It was later re-released in November 2011 on vinyl via Animal Style Records, featuring the 10 songs from \"Maps \"and a newly recorded EP titled \"Companions\". The album was re-titled \"Maps & Companions\". Are we justified in saying that \"The re-release was on vinyl. \"? Yes, no, or maybe? Yes\n###\nThe Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\". Are we justified in saying that \"The Last of Us was the only game developed by Naughty Dog.\"? Yes, no, or maybe?", "doc_id": 554, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32370, 15082, 17998, 25721], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming. Are we justified in saying that \"It is between 2 bays\"? Yes, no, or maybe? Yes\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress. Are we justified in saying that \"He held the position of the Tibetan Youth Congress since September 2007, and on August 8, 2008 he was re-elected.\"? Yes, no, or maybe? Yes\n###\nReal Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group, or stable, active in the Mexican professional wrestling promotion Asistencia, Asesor\u00eda y Administraci\u00f3n (AAA) and consists of young \"tecnicos\" (Face or \"good guys\") who all use a high flying, high risk wrestling style. Are we justified in saying that \"Real Fuerza A\u00e9rea (Spanish for \"Royal Air Force\") was a Mexican professional wrestling group in the AAA. It consists of young \"tecnicos\" who all use a high flying, high risk wrestling style.\"? Yes, no, or maybe? Yes\n###\nNASA John H. 
Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi. Are we justified in saying that \"NASA John H. Glenn Research Center is directed by Janet L. Kavandi \"? Yes, no, or maybe? Yes\n###\nCriminal Mindscape is a television documentary series on MSNBC that profiles the minds of extreme criminals. Different interviewers interview subjects such as Ron Luff and Joseph Paul Franklin. Interviewers are often from various fields of criminal justice as opposed to journalism. Interviewers attempt to develop psychological profiles of individual criminals. Are we justified in saying that \"Criminal Mindscape's interviewers all come from different fields of criminal justice\"? Yes, no, or maybe?", "doc_id": 560, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35177, 15917, 31232, 10866], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Murder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017. Are we justified in saying that \"The albums they release in 2017 will progressively become more psychedelic\"? Yes, no, or maybe? Maybe\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home. Are we justified in saying that \"Hero without a country did not win any awards\"? Yes, no, or maybe? Maybe\n###\nCity Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle. Are we justified in saying that \"City Mall is a large mall in Jordan that has been open for over a decade. There are many films shown in the cinema here. Some of the films are American.\"? Yes, no, or maybe? Yes\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release. Are we justified in saying that \"The members of The Sloppy Meateaters are vegans. \"? Yes, no, or maybe? 
Maybe\n###\nThe Magic Roundabout in Swindon, England, was constructed in 1972 and consists of five mini-roundabouts arranged around a sixth central, anticlockwise roundabout. Located near the County Ground, home of Swindon Town F.C., its name comes from the popular children's television series \"The Magic Roundabout\". In 2009 it was voted the fourth scariest junction in Britain, in a poll by Britannia Rescue. Are we justified in saying that \"The Magic Roundabout was voted the fourth scariest junction in Britain in 2010 in a poll by Britannia Rescue.\"? Yes, no, or maybe?", "doc_id": 196, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21767, 9773, 23392, 19722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy. Are we justified in saying that \"The National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. It is owned by people who love children.\"? Yes, no, or maybe? Maybe\n###\nA symphonic song cycle can either refer to a symphony composed of separate movements played consecutively or to a set of symphonic works linked by theme, common composer, or common conductor. A symphonic cycle should not be confused with the closely related song cycle. Are we justified in saying that \" Symphonic songs are very short.\"? Yes, no, or maybe? Maybe\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"They have other songs that have peaked higher\"? Yes, no, or maybe? Maybe\n###\nMurray, Utah was declared a city July 3, 1902, instituting a mayor-council form of government. The mayor of Murray was originally partisan, but switched to a non-partisan position. The term of mayor was originally two years, but amended to a four-year term in the 1940s in accordance with state law. The following is a list of Mayors of Murray, Utah. Are we justified in saying that \"Murray has been a city for less than 200 years\"? Yes, no, or maybe? Yes\n###\n\"679\" is the second single by American rapper Fetty Wap from his self-titled debut album. The song features Remy Boyz member Monty and former Remy Boyz member P-Dice. \"679\" peaked at number 4 on the US \"Billboard\" Hot 100, becoming his second highest-charting single after \"Trap Queen\". 
The album version of the song omits P-Dice's verse, only featuring Monty. Are we justified in saying that \"Trap Queen is the highest-charting single of Fetty Wap.\"? Yes, no, or maybe?", "doc_id": 89, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9362, 25009, 14265, 25258], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Simon Corbell (born 21 November 1970) is a former Australian politician and Deputy Chief Minister of the Australian Capital Territory. He was also Attorney-General, Minister for Health, Minister for the Environment and Minister for the Capital Metro. Are we justified in saying that \"Simon Corbell is currently 50 years old.\"? Yes, no, or maybe? No\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98. Are we justified in saying that \"Most teens liked Death Race.\"? Yes, no, or maybe? Maybe\n###\nTom Clancy's Splinter Cell is a 2002 stealth video game developed by Ubi Soft Montreal and built on the Unreal Engine 2. It is the first \"Splinter Cell\" game in the series. Endorsed by author Tom Clancy, it follows the activities of NSA black ops agent Sam Fisher. The character of Fisher is voiced by actor Michael Ironside. Are we justified in saying that \"The NSA has a paramilitary group responsible for secret operations.\"? Yes, no, or maybe? Yes\n###\nThe Grand Prix des Fronti\u00e8res was a motor race held at a street circuit in Chimay, Belgium. The race was created by Jules Buisseret, who was also responsible for the circuit's existence. The first event was held in 1929 and was discontinued after the 1972 event for safety reasons. Are we justified in saying that \"The Grand Prix des Fronti\u00e8res was a car race\"? Yes, no, or maybe? Yes\n###\nRonald Francis Arias (born November 30, 1941) is a former senior writer and correspondent for \"People magazine\" and \"People en Espa\u00f1ol\". He is also a highly regarded author whose novel \"The Road to Tamazunchale\" has been recognized as a milestone in Chicano literature. Are we justified in saying that \"Ronald Francis Arias is a famous mexican author\"? Yes, no, or maybe?", "doc_id": 811, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10063, 33777, 40422, 20679], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Europrop International GmbH (EPI) is a joint venture among four European aircraft engine manufacturers, MTU Aero Engines, Snecma, Rolls-Royce, and Industria de Turbo Propulsores. The company's sole product is the Europrop TP400-D6 7970 skW/10,690shp turboprop developed for the Airbus A400M Atlas. Are we justified in saying that \"Europrop International GmbH (EPI) is a joint venture among eight thousand and four hundred European aircraft engine manufacturers.\"? Yes, no, or maybe? No\n###\nUSS \"Christopher\" (DE-100) was a Cannon class destroyer escort built for the U.S. Navy during World War II. She served in the Atlantic Ocean and provided escort service against submarine and air attack for Navy vessels and convoys. She was named for a Navy Cross recipient, Harold Jensen Christopher, who was killed at Pearl Harbor aboard on 7 December 1941. Are we justified in saying that \"USS \"Christopher\" was named after a guy who died in 1941\"? Yes, no, or maybe? Yes\n###\nMary Pierce (born 15 January 1975) is a French retired tennis professional who played on the Women's Tennis Association (WTA) tour. Born in Canada, she is a citizen of Canada, and the United States. Pierce played for France in team competitions and in the Olympics. Are we justified in saying that \"Mary Pierce has a middle name.\"? Yes, no, or maybe? No\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph de Biaix was a member of parliament in the German Reichstag.\"? Yes, no, or maybe? Yes\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Chris McKendry was born more than 1968 years ago.\"? Yes, no, or maybe?", "doc_id": 29, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43865, 43222, 11832, 10056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The NME Awards 2017 were held in London, England, on 15 February 2017, at the Brixton Academy and was hosted by English comedian Huw Stephens. Beyonc\u00e9 led the nominations with five, followed by The 1975, Bastille, Christine And The Queens and Skepta with four nominations each. Are we justified in saying that \"Huw Stephens attended the Brixton Academy that evening.\"? 
Yes, no, or maybe? Yes\n###\nThe diminished seventh chord is commonly used in the harmony of both Western classical music and also in jazz and popular music of the twentieth and twenty-first centuries. Classical composers of the eighteenth and nineteenth centuries imaginatively exploited the chord's dramatic and expressive potential. (See below). Are we justified in saying that \"The diminished seventh chord is is associated with music from the 21st century.\"? Yes, no, or maybe? Yes\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley. Are we justified in saying that \"Colorz of Rage hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley had blossoming careers independent of the urban drama film.\"? Yes, no, or maybe? Maybe\n###\nToolbox Murders is a 2004 horror film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch. It is a remake of the 1978 film of the same name and was produced by the same people behind the original. The film centralizes on the occupants of an apartment who are stalked and murdered by a masked killer. Are we justified in saying that \"Toolbox Murders is a 2004 comedy film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch.\"? Yes, no, or maybe? No\n###\nRevolution Money is a financial services company based in St. Petersburg, Florida. The company's products include a PIN-based credit card, online person-to-person payments service with a linked stored value card, and gift card. Revolution Money is the only credit card that does not charge retailers interchange fees. The company partnered with Yahoo! Sports and Fifth Third Bank. Are we justified in saying that \"Revolution Money does charge some fees.\"? Yes, no, or maybe?", "doc_id": 722, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41895, 4813, 8600, 16290], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "InterTV Grande Minas is a Brazilian television station affiliated with Rede Globo coverage in the Northern part of the Noroeste, Central and the Jequitinhonha and Mucuri of Minas Gerais. Operates on VHF channel 4, in its headquarters city, Montes Claros / MG. Belongs to the Rede InterTV. Are we justified in saying that \"Rede Globo owns InterTV Grande Minas\"? Yes, no, or maybe? Maybe\n###\n\"Paradise\" is the only single release from Styx's 1997 live double album \"Return to Paradise\". The song was originally written and recorded by Dennis DeYoung for his musical \"The Hunchback of Notre Dame\". The song was re-recorded by Styx for inclusion as one of three new studio tracks on the live album. Are we justified in saying that \"Paradise was released in 1990\"? Yes, no, or maybe? No\n###\nWBZW (1520 AM) is a radio station in Altamonte Springs, Florida. 
Owned by Pennsylvania Media Associates, Inc., the station operates at 1520 kHz with a daytime power of 5 kW & a nighttime power of 350 watts. Its transmitter is located in Apopka, Florida. The station currently programs a Business News/Talk format. Are we justified in saying that \"Th radio station has more listeners in the noon so it consumes more power in the day time.\"? Yes, no, or maybe? Maybe\n###\nCarol Ann Crawford (February 22, 1934 \u2013 August 10, 1982), also known as Carol Stolkin and Carol Ross, was an American backgammon and bridge player from Buffalo, New York who spent many years in Detroit, Michigan.. In 1973, she became the second woman to win the world backgammon championships. Are we justified in saying that \"Carol Ann Crawford always lost.\"? Yes, no, or maybe? No\n###\nTwelve Days of OK Go is a compilation album by American rock band OK Go. It was released on December 31, 2012. OK Go started releasing the songs on December 10, with one song released each weekday. The last song, a cover of \"Any Time at All\", was released on Christmas. A bonus track, a cover of \"This Will Be Our Year,\" was released on New Year's Eve. Are we justified in saying that \"Ok Go is a punk band\"? Yes, no, or maybe?", "doc_id": 143, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10340, 18889, 41030, 32455], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"The Auspicious Crane took part in some of the most important naval battles of World War 2.\"? Yes, no, or maybe? Yes\n###\n54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute. Are we justified in saying that \"Fifty-Four Forty or Fight! was a term coined to express Polk's agenda.\"? Yes, no, or maybe? Yes\n###\nThe Axe Giant: Original Motion Picture Soundtrack is the soundtrack to the 2013 film, \"Axe Giant: The Wrath of Paul Bunyan\". The album features the film score composed by Midnight Syndicate's Edward Douglas and \"The Ballad of Paul Bunyan\" performed by Hick'ry Hawkins. Are we justified in saying that \"The film score for The Axe Giant was composed in 2013\"? Yes, no, or maybe? Maybe\n###\nIn theoretical physics, particularly in discussions of , Mach's principle (or Mach's conjecture) is the name given by Einstein to an imprecise hypothesis often credited to the physicist and philosopher Ernst Mach. 
The idea is that local inertial frames are determined by the large scale distribution of matter, as exemplified by this anecdote: Are we justified in saying that \"The idea regarding local inertial frames was widely supported by many, the first time Mach discussed his principle in the topic of theoretical physics.\"? Yes, no, or maybe? Maybe\n###\nGeorge Edward Foreman (born January 10, 1949) is an American former professional boxer who competed from 1969 to 1977, and from 1987 to 1997. Nicknamed \"Big George\", he is a two-time world heavyweight champion and an Olympic gold medalist. Outside the sport he is an ordained minister, author, and entrepreneur. Are we justified in saying that \"George Edward Foreman has never physically hurt another person\"? Yes, no, or maybe?", "doc_id": 346, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39001, 2071, 8004, 11197], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"Nashville West's members were all good. \"? Yes, no, or maybe? Maybe\n###\nThe Winter Hill air disaster occurred on 27 February 1958 when the Silver City Airways Bristol 170 Freighter \"G-AICS\", traveling from the Isle of Man to Manchester, England, crashed into Winter Hill (also known as Rivington Moor) several hundred yards away from the Independent Television Authority's Winter Hill transmitting station. Are we justified in saying that \"he silver cty bristol crashed due to the engine\"? Yes, no, or maybe? Maybe\n###\nThe Internazionali Tennis Val Gardena S\u00fcdtirol \"(also known as the Sparkassen ATP Challenger on the ATP Challenger Tour)\" is a tennis tournament held in Ortisei, Italy since 2000. The event is part of the ATP Challenger Tour and the ITF Women's Circuit and is played on indoor hard courts. The event was previously a $100,000+H ITF Women's Circuit category from 2008 to 2009. Are we justified in saying that \"The Internazionali Tennis Val Gardena S\u00fcdtirol tennis tounament was held in Ortisei, Italy from 2008 to 2009.\"? Yes, no, or maybe? Yes\n###\nMy Famous Family is a British television programme on genealogy, co-hosted by Bill Oddie and Guy de la B\u00e9doy\u00e8re. Each episode shows an ordinary member of the public with a famous ancestor: Queen Victoria, Florence Nightingale, George Stephenson, Lawrence of Arabia, or the Duke of Wellington. Are we justified in saying that \"Bill Oddie hosts My Famous Family by himself.\"? Yes, no, or maybe? No\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. 
The animation is based on the award-winning children's picture book by Peter Collington. Are we justified in saying that \"The Angel and the Soldier Boy is Clannad's first movie soundtrack.\"? Yes, no, or maybe?", "doc_id": 409, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [175, 42912, 29503, 23678], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Natasha Choufani is a Lebanese actress. Born and raised in the UAE, she grew up in a multi-cultural society. Her ability to act in different dialects and languages had helped open many doors to playing diverse characters in theater, film and TV at home and abroad. Are we justified in saying that \"Natasha did not grow up in a multi-cultural society.\"? Yes, no, or maybe? No\n###\nThe Louvin Brothers were an American musical duo composed of brothers Ira Lonnie Loudermilk (1924\u20131965) and Charlie Elzer Loudermilk (1927\u20132011), better known as Ira and Charlie Louvin. The brothers are cousins to John D. Loudermilk, a Nashville Songwriters Hall of Fame member. Are we justified in saying that \"They weren't the first artists inducted in the hall of fame in their family\"? Yes, no, or maybe? Maybe\n###\nTroy University was a short-lived university established at Troy, New York in 1858 under the auspices of the Methodist Episcopal Church. The school closed in 1861. The building that housed the university remained a prominent Troy landmark until 1969. On the site now is Rensselaer Polytechnic Institute's Folsom Library. Are we justified in saying that \"Troy University has been seen by george.\"? Yes, no, or maybe? Maybe\n###\nPata Nahi Rabb Kehdeyan Rangan Ch Raazi (Punjabi: \u0a2a\u0a24\u0a3e \u0a28\u0a39\u0a40\u0a02 \u0a30\u0a71\u0a2c \u0a15\u0a3f\u0a39\u0a5c\u0a3f\u0a06\u0a02 \u0a30\u0a70\u0a17\u0a3e\u0a02 \u2019\u0a1a \u0a30\u0a3e\u0a1c\u0a3c\u0a40 ) is a 2012 Punjabi film starring Neeru Bajwa, Tarun Khanna, Gurpreet Ghuggi, Amar Noorie in lead roles. It's directed by Ravinder Peepat and Produced by Harman and Jasmeet Singh Judge Films Production Pvt. Ltd. The film is released by Kapil Batra Films Production House. Are we justified in saying that \"Neeru Bajwa, Tarun Khanna, and Gurpreet Ghuggi were disparaged for their performances.\"? Yes, no, or maybe? Maybe\n###\nSomething Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date. Are we justified in saying that \"Fuel band I created something like human due to it being the hemorrhage of the hits and not popular\"? Yes, no, or maybe?", "doc_id": 373, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24009, 33588, 26617, 14834], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards. Are we justified in saying that \"The Proteus Design Suite is used by the US army\"? Yes, no, or maybe? Maybe\n###\nThe Nutty Professor is a 1963 American science fiction-romantic comedy film produced, directed, co-written (with Bill Richmond) and starring Jerry Lewis. The score was composed by Walter Scharf. The film is a parody of Robert Louis Stevenson's \"Dr. Jekyll and Mr. Hyde\". Are we justified in saying that \"The runtime of the film is 52 minutes\"? Yes, no, or maybe? Maybe\n###\nCape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming. Are we justified in saying that \"Cape Vakop was chartered over 35 years ago\"? Yes, no, or maybe? Yes\n###\nUnited Spirits Limited, abbreviated to USL, is an Indian alcoholic beverages company, and the world's second-largest spirits company by volume. It is a subsidiary of Diageo, and headquartered at UB Tower in Bangalore, Karnataka. USL exports its products to over 37 countries. Are we justified in saying that \"Diageo does not own any liquor companies.\"? Yes, no, or maybe? No\n###\nThe 1975\u201376 Seattle SuperSonics season was the 9th season of the Seattle SuperSonics in the National Basketball Association (NBA). The SuperSonics finished the season in second place in the Western Conference with a 43\u201339 record, the same as the previous year and reached the playoffs for a second consecutive season, where they lost to the Phoenix Suns in the Conference Semifinals in six games. Are we justified in saying that \"The Seattle SuperSonics don't have other nicknames\"? Yes, no, or maybe?", "doc_id": 736, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15188, 22824, 12107, 36918], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Doomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent. Are we justified in saying that \"Doomsday Device is a popular term used in professional wrestling. \"? Yes, no, or maybe? 
Maybe\n###\nSwaay is the debut EP by American band DNCE. The EP was released worldwide on October 23, 2015, by Republic Records. The majority of the EP was co-produced and co-written by lead singer and frontman Joe Jonas. The EP debuted at number 39 on the US \"Billboard\" 200. Are we justified in saying that \"Swaay has been sung by Trump.\"? Yes, no, or maybe? Maybe\n###\nTony Rena Snell Jr. (born November 10, 1991) is an American professional basketball player for the Milwaukee Bucks of the National Basketball Association (NBA). Snell played college basketball for the New Mexico Lobos before declaring for the NBA draft after his junior year. He was drafted with the 20th overall pick in 2013 NBA draft by the Chicago Bulls. Are we justified in saying that \"Tony was inspired by a basketball player early in his life.\"? Yes, no, or maybe? Maybe\n###\nUniversity of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund. Are we justified in saying that \"University of Maryland Eastern Shore is a cheap university \"? Yes, no, or maybe? Maybe\n###\nBad Family () is a South Korean television series starring Kim Myung-min, Nam Sang-mi, Im Hyun-sik, Yeo Woon-kay, Kang Nam-gil, Geum Bo-ra, Kim Heechul and Lee Young-yoo. It aired on SBS from March 22 to May 11, 2006 on Wednesdays and Thursdays at 21:55 for 16 episodes. Are we justified in saying that \"New episodes of Bad Family are still being made.\"? Yes, no, or maybe?", "doc_id": 770, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17362, 27079, 3669, 2990], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Semonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781. Are we justified in saying that \"Semonkong is a refuge for Basotho people\"? Yes, no, or maybe? Yes\n###\nYahoo Serious (born 27 July 1953), born Greg Gomez Pead (name-change by deed poll in 1980), is an Australian film actor, director, and score composer. He is best known for his 1988 comedy film \"Young Einstein\". He also created \"Reckless Kelly\" in 1993 and \"Mr. Accident\" in 2000. Serious writes, directs, produces, stars in, and has composed the scores for his movies. Are we justified in saying that \"Yahoo Serious is a very fat man\"? Yes, no, or maybe? 
Maybe\n###\nElizabeth \"Long Liz\" Stride (n\u00e9e Gustafsdotter) (27 November 1843 \u2013 30 September 1888) is believed to be a victim of the notorious unidentified serial killer called Jack the Ripper, who killed and mutilated several women in the Whitechapel area of London from late August to early November 1888. Are we justified in saying that \"Elizabeth Stride was born in 1843\"? Yes, no, or maybe? Yes\n###\nThe University of Nebraska\u2013Lincoln, often referred to as Nebraska, UNL or NU, is a public research university in the city of Lincoln, in the state of Nebraska in the Midwestern United States. It is the state's oldest university, and the largest in the University of Nebraska system. Are we justified in saying that \"The University of Nebraska-Lincoln is often referred to as UNL\"? Yes, no, or maybe? Yes\n###\nConcrete Sky was the second EP released by Beth Orton, with the lead track taken from her 2002 album \"Daybreaker\". It contains four songs, and was released on CD. \"Concrete Sky\" features vocals and guitar from Ryan Adams, and was written by Beth Orton and Johnny Marr. Are we justified in saying that \"Ryan Adams plays the flute. \"? Yes, no, or maybe?", "doc_id": 836, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14195, 33543, 31210, 29587], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Duel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg. Are we justified in saying that \"Steven Spielberg is the well known director of many movies and short plays.\"? Yes, no, or maybe? Maybe\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961. Are we justified in saying that \"Cuban imports to the United States were stopped in 1961, but have since resumed.\"? Yes, no, or maybe? Maybe\n###\nAlways (; lit. Only You) is a South Korean film directed by Song Il-gon. Starring So Ji-sub and Han Hyo-joo in the lead roles, it is about a romance between an ex-boxer who has closed his heart to the world and a telemarketer who remains spirited despite slowly going blind. Are we justified in saying that \"Always is a South Korean movie\"? Yes, no, or maybe? Yes\n###\nPerformance Car, commonly abbreviated to PC, was an automobile magazine from the United Kingdom published by EMAP between October 1983 and July 1998. As suggested by the title, the magazine focussed on the high performance sector of the car market, from hot hatches through to supercars. Are we justified in saying that \"Performance Car has a X.\"? Yes, no, or maybe? 
No\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists. Are we justified in saying that \"Contra Conspiracy was originally known as Contra Control.\"? Yes, no, or maybe?", "doc_id": 557, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28915, 39985, 45236, 10035], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Takahashi River (\u9ad8\u6881\u5ddd , Takahashi-gawa ) is a Class A major river in the western part of Okayama Prefecture. It acts as the main drainage for the Takahashi River Drainage System, and is one of the three main drainage rivers in Okayama Prefecture (the others being the Yoshii River and the Asahi River). Are we justified in saying that \"The Takahashi River serves as a major drainage river.\"? Yes, no, or maybe? Yes\n###\nWake Up, Ron Burgundy: The Lost Movie (also known as Anchorman: The Adventure Continues) is the 2004 counterpart film to the film \"\", which was also released in the same year. Directed by Adam McKay and written by McKay and Will Ferrell, it stars Ferrell, Christina Applegate, David Koechner, Steve Carell, and Paul Rudd. Are we justified in saying that \"The film stars 7 people\"? Yes, no, or maybe? Maybe\n###\nPrince Louis Ferdinand Oskar Christian of Prussia (German: \"Louis Ferdinand Oskar Christian Prinz von Preu\u00dfen\"; 25 August 1944 \u2013 11 July 1977), also called Louis Ferdinand II or Louis Ferdinand Jr., nicknamed \"Lulu\", was a member of the House of Hohenzollern and the fifth of seven children of Prince Louis Ferdinand of Prussia and his wife, Grand Duchess Kira of Russia. Are we justified in saying that \"Louis Ferdinand II had seven siblings. \"? Yes, no, or maybe? No\n###\nCoriolano: eroe senza patria (English Translation: Coriolanus: \"Hero without a Country\") is a 1963 Italian historical drama film set in Rome in 493 BC. The plot is an adaptation of the Roman legend about the general who won great victories for the Romans over their enemies the Volscians, but was then forced into exile by his political enemies at home. Are we justified in saying that \"Coriolanus had a large army.\"? Yes, no, or maybe? Maybe\n###\nLamme Goedzak is a character in Charles De Coster's novel \"The Legend of Thyl Ulenspiegel and Lamme Goedzak\" (1867). He is the best friend of Thyl Ulenspiegel. While Ulenspiegel himself is derived from Dutch-German-Flemish folklore Lamme Goedzak is entirely created by De Coster. Despite this he has become one of the most recognizable Flemish folklore characters since. Are we justified in saying that \"Charles De Coster published a novel in 1865\"? 
Yes, no, or maybe?", "doc_id": 812, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30909, 18033, 622, 7791], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Volcano, I'm Still Excited!! was an American indie rock band from Brooklyn, New York (originally from Austin, Texas). The band's name (which has been described as \"ludicrous\") was reportedly inspired by the Tom Hanks film \"Joe Versus the Volcano\", though the band has never revealed the inspiration for the name. Are we justified in saying that \"Volcano, I'm Still Excited!! has been said to be a crazy sounding name for a group.\"? Yes, no, or maybe? Yes\n###\nPostal codes in Brunei are alphanumeric, consisting of two letters followed by four digits in the format of YZ0000, where Y denotes the district code, Z denotes the mukim code, the first two digits denote the area or village code, and the last two digits denote the nearest post office code (e.g. the postal code for Pantai Mentiri Golf Club is BU2529). Are we justified in saying that \"Postal codes in Brunei are alphanumeric but never start with a letter.\"? Yes, no, or maybe? No\n###\nThe International University of Rabat or IUR is a semi-public university founded in 2010 in Morocco. It delivers double-degrees, in collaboration with foreign universities, in law, engineering, aeronautics, energy engineering, architecture, business management and political sciences. Are we justified in saying that \"IUR is a public university.\"? Yes, no, or maybe? No\n###\nThe 2009\u201310 Udinese Calcio season was the club's 15th consecutive and 30th overall season in Serie A. The team competed in Serie A, finishing 15th, and in the Coppa Italia, reaching the semi-finals. The highlight of Udinese's season was captain Antonio Di Natale's excellent campaign, as he finished top scorer in Serie A, or \"capocannoniere\", with 29 goals. Are we justified in saying that \"Antonio Di Natale was not an important person on the Udinese Calcio .\"? Yes, no, or maybe? No\n###\nUSS \"Fletcher\" (DD/DDE-445), named for Admiral Frank Friday Fletcher, was the lead \"Fletcher\"-class destroyer , and served in the Pacific during World War II. She received fifteen battle stars for World War II service, and five for Korean War service. Are we justified in saying that \"Frank Friday Fletcher was the Admiral of the USS Fletcher\"? Yes, no, or maybe?", "doc_id": 302, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38564, 15761, 21820, 44573], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The position of South African ambassador to the United States is the most prestigious and top diplomatic post in South Africa. The position was first held in March 1949, following the upgrade of South Africa's diplomatic mission to an embassy. The post has been held by many important politicians and is currently held by M. J. Mahlangu. Are we justified in saying that \"M. J. Mahlangu, who currently holds the position of South African ambassador to the United States, has held the position for many years.\"? Yes, no, or maybe? Maybe\n###\nThe Newcomers is a 2000 American family drama film directed by James Allen Bradley and starring Christopher McCoy, Kate Bosworth, Paul Dano and Chris Evans. Christopher McCoy plays Sam Docherty, a boy who moves to Vermont with his family, hoping to make a fresh start away from the city. It was filmed in Vermont, and released by Artist View Entertainment and MTI Home Video. Are we justified in saying that \"The Newcomers was filmed in Vermont and caused a boost of tourism to the state.\"? Yes, no, or maybe? Maybe\n###\nRanila is a village in the Charkhi Dadri district of the Indian state of Haryana. It lies approximately 30 km south east of the district headquarters town of Bhiwani. As of the 2011 Census of India , the village had 1,927 households with a population of 10,199 of which 5,393 were male and 4,806 female. The nearby villages are Bhageshwari 5\u00a0km, Sanjarwas 4\u00a0km, Pilana 2.5\u00a0km etc. Are we justified in saying that \"the village had 2 million people in 2011\"? Yes, no, or maybe? No\n###\nMartin H\u00f6hener (born June 23, 1980) is a Swiss professional ice hockey defenceman. He is currently playing for the SC Bern of Switzerland's National League A. He was selected by the Nashville Predators in the 9th round (284th overall) of the 2000 NHL Entry Draft. Are we justified in saying that \"Martin H\u00f6hener currently resides in Switzerland.\"? Yes, no, or maybe? Maybe\n###\nJames King of William (January 28, 1822 \u2013 May 20, 1856) was a crusading San Francisco, California, newspaper editor whose assassination by a criminal in 1856 resulted in the establishment of the second San Francisco Vigilance Committee and changed the politics of the city. King was among the first newspapermen to be honored by the California Journalism Hall of Fame. Are we justified in saying that \"James King of William was born in San Francisco, California.\"? Yes, no, or maybe?", "doc_id": 632, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10003, 2635, 34562, 7463], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A surf break at Point Leo, on the Mornington Peninsula, one of the closest surf beaches to Melbourne in Victoria, Australia known as First Reef or more colloquially just \"The Reef\". 
Until the 1970s there was little or no resident surfing population in Point Leo, so the Reef was mainly surfed by the few transient waveriders who were exploring the many breaks to be found in Westernport Bay. Are we justified in saying that \"Before the 1970s there were a large number of people surfing at First Reef, which is a surf break at Point Leo.\"? Yes, no, or maybe? No\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire. Are we justified in saying that \"Peter John Reynolds was knighted in OBE for his research in experimental archaeology\"? Yes, no, or maybe? Maybe\n###\nThe Melodi Grand Prix Junior 2012 was Norway's eleventh national Melodi Grand Prix Junior for young singers aged 8 to 15. It was held in Oslo Spektrum, Oslo, Norway and broadcast live Norwegian Broadcasting Corporation (NRK). It was hosted by Margrethe R\u00f8ed and Tooji, the winner of Norwegian Melodi Grand Prix 2012. Are we justified in saying that \"The Melodi Grand Prix Junior 2012 was won by a 16 year old.\"? Yes, no, or maybe? No\n###\nMission: Impossible III \u2013 Music from the Original Motion Picture Soundtrack is a soundtrack album for the 2006 film \"\", composed by Michael Giacchino. Unlike the previous two films in the series, there was no album released containing the film's contemporary music. Are we justified in saying that \"Mission: Impossible III was the most popular movie of the series\"? Yes, no, or maybe? Maybe\n###\nThe 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012. Are we justified in saying that \"The nominees for the 39th People's Choice Awards were announced during the same year that the ceremony was held\"? Yes, no, or maybe?", "doc_id": 220, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15555, 1678, 24474, 37263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Corrina, Corrina is a 1994 American feature film set in 1959 about a widower (Ray Liotta) who hires a housekeeper/nanny (Whoopi Goldberg) to care for his daughter (Tina Majorino). It was written and directed by Jessie Nelson, in her feature film directing debut. It was the final film in which Don Ameche starred; he died shortly after filming was completed. Are we justified in saying that \"Corrina, Corrina is set 35 years before it was released.\"? Yes, no, or maybe? Yes\n###\n\"Have You Ever Met That Funny Reefer Man\", often known simply as \"The Reefer Man\", is a 1932 American jazz song composed by J. Russel Robinson, with lyrics by Andy Razaf. 
It was first recorded by Cab Calloway and his orchestra, with versions by others over the years, including by Harlan Lattimore, Murphy's Law and Big Bad Voodoo Daddy. Are we justified in saying that \"The song was released in 1931\"? Yes, no, or maybe? No\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant. Are we justified in saying that \"The film View from the Top was not the first time Bruno Barreto and Gwyneth Paltrow worked together\"? Yes, no, or maybe? Maybe\n###\n\"They\" is a short story written by American science fiction author Robert A. Heinlein. It was first published in the April 1941 issue of \"Unknown\", and can be found in Heinlein's short story collection \"The Unpleasant Profession of Jonathan Hoag\". It also appears in a number of multi-author anthologies. Are we justified in saying that \"Robert A. Heinlein wrote more than one short story.\"? Yes, no, or maybe? Yes\n###\nJefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time. Are we justified in saying that \"The population count of Jefferson County is over 1,000,000 as of today. \"? Yes, no, or maybe?", "doc_id": 673, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14075, 35319, 11499, 21428], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Balaji K. Kumar is a Film Director who entered Tamil cinema as a director with the 2013 thriller film \"Vidiyum Munn\" which released on 29 November 2013 and received positive reviews from critics. Then started his career as story board artist for advertising firms like Ogilvy & Mather, JWT, Saatchi & Saatchi. Are we justified in saying that \"Balaji K. Kumar is world famous.\"? Yes, no, or maybe? Maybe\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\". Are we justified in saying that \"\"Paint It Black\" is a song as a British rock band.\"? Yes, no, or maybe? Yes\n###\nMutual Friends is a British comedy drama television series broadcast in six episodes on BBC One in from 26 August until 30 September 2008. 
The series starred Marc Warren, Alexander Armstrong, Keeley Hawes, Sarah Alexander, Claire Rushbrook, Emily Joyce, Naomi Bentley and Joshua Sarphie as a group of old friends whose lives are thrown into chaos when one of their group commits suicide. Are we justified in saying that \"Mutual Friends was written in 2008.\"? Yes, no, or maybe? Maybe\n###\nRufus Lackland Taylor (January 6, 1910 \u2013 September 14, 1978) was an officer in the United States Navy. There he became Director of the Office of Naval Intelligence and a Vice Admiral. In 1966 he was appointed as Deputy Director of the Defense Intelligence Agency (DIA), then shortly thereafter as Deputy Director of the CIA, where he served from 1966 to 1969. Are we justified in saying that \"Rufus Lackland Taylor (January 6, 1910 \u2013 September 13, 1978) was an officer in the United States Navy. He was appointed as Deputy Director of the Defense Intelligence Agency (DAI).\"? Yes, no, or maybe? No\n###\nThe Rock \u2018n\u2019 Roll Mardi Gras Marathon is an annual international marathon race which takes place in New Orleans, Louisiana, in the United States. It is part of the Rock 'n' Roll Marathon Series of road running competitions and it also features the Rock \u2018n\u2019 Roll Mardi Gras Half Marathon. Are we justified in saying that \" The Rock \u2018n\u2019 Roll Mardi Gras Marathon is a marathon in the u.s with half relay marathon\"? Yes, no, or maybe?", "doc_id": 838, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19880, 18586, 35994, 24038], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ann Rae Rule (n\u00e9e Stackhouse; October 22, 1931 \u2013 July 26, 2015) was an American true crime author of \"The Stranger Beside Me\", about serial killer, and Rule's co-worker, Ted Bundy. Rule was also known for her book \"Small Sacrifices\", about Oregon child murderer Diane Downs. Many of Rule's books center on murder cases that occurred in the Pacific Northwest and her adopted home state of Washington. Are we justified in saying that \"Ann Rule was married\"? Yes, no, or maybe? Maybe\n###\nFernande Olivier (born Am\u00e9lie Lang; 6 June 1881\u201326 January 1966) was a French artist and model known primarily for having been the model of painter Pablo Picasso, and for her written accounts of her relationship with him. Picasso painted over 60 portraits of Olivier. Are we justified in saying that \"Fernande Olivier spoke to Picasso in French.\"? Yes, no, or maybe? Maybe\n###\nRegent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day. Are we justified in saying that \"The CEO of Habib Group is British.\"? Yes, no, or maybe? Maybe\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. 
Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"The district was names after Clement\"? Yes, no, or maybe? Maybe\n###\nThe Probert-Price Collection is a collection of items from the Probert-Price estate, primarily hundreds of vintage dresses which belonged to Renee Probert-Price, original It girl and well-known London socialite of her time (1917-2013). Renee died in 2013 aged 96, and left over 300 dresses, hats, furs, shoes and handbags dating from the 1930s and 1980s to her great niece and goddaughter. Are we justified in saying that \"Renee Probert-Price left at least 50 hats to her great niece and goddaughter.\"? Yes, no, or maybe?", "doc_id": 447, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10402, 28762, 36843, 18582], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Donald Clark \"Donny\" Osmond (born December 9, 1957) is an American singer, actor, radio personality, and former teen idol. Osmond has also been a talk and game show host, record producer and author. In the mid-1960s, he and four of his elder brothers gained fame as the Osmonds. Osmond went solo in the early 1970s, covering such hits as \"Go Away Little Girl\" and \"Puppy Love\". Are we justified in saying that \"Donny Osmond is an only child.\"? Yes, no, or maybe? No\n###\nLove Island is a 1952 American film directed by Bud Pollard starring Paul Valentine and Eva Gabor. Originally released in Cinecolor, the film uses extensive footage taken in Bali used from the film \"\" (1935). It was the final directorial effort of Bud Pollard who had previously directed several race films and exploitation films. Are we justified in saying that \"Love Island was released in nineteen hundred fifty three.\"? Yes, no, or maybe? No\n###\nShannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University. Are we justified in saying that \"Shannon Kelley set records as a quarterback.\"? Yes, no, or maybe? Maybe\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s. Are we justified in saying that \"The New Ulm Oil Company Service Station had gasoline\"? Yes, no, or maybe? 
Yes\n###\nPrincess Amalie \"Auguste\" of Anhalt-Dessau (German: \"Prinzessin Amalie Auguste von Anhalt-Dessau\" ; 18 August 1793 \u2013 12 June 1854) was a German princess of Anhalt-Dessau who was Princess consort of Schwarzburg-Rudolstadt from 1816 to 1854 as the wife of Friedrich G\u00fcnther, Prince of Schwarzburg-Rudolstadt. Are we justified in saying that \"Princess Amalie \"Auguste\" of Anhalt-Dessau was Princess consort in the early 1860's.\"? Yes, no, or maybe?", "doc_id": 867, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23408, 8053, 25441, 3226], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"Imagine dragons wrote their first song in 2002\"? Yes, no, or maybe? Maybe\n###\nJohnson College Prep is a public four-year charter high school located in the Englewood neighborhood on the south side of Chicago, Illinois, United States. It is a part of the Noble Network of Charter Schools. The school is named for African-American businessman and founder of the Chicago-based Johnson Publishing Company John H. Johnson and his wife Eunice Johnson. Are we justified in saying that \"Johnson College Prep is named after John H. Johnson and Eunice Johnson.\"? Yes, no, or maybe? Yes\n###\nAktar-Ul Islam (Bengali: \u0986\u09b0\u09cb \u0989\u09b2 \u0987\u09b8\u09b2\u09be\u09ae ; born 1980) is an award-winning English chef, restaurateur and entrepreneur. In 2009, his restaurant Lasan became the first Indian restaurant in the United Kingdom to be selected as the \"Best Local Restaurant\" by Gordon Ramsay on Channel 4's \"The F Word\". In June 2011, he won the fish course in the final of the BBC Two series \"Great British Menu\". Are we justified in saying that \"Lasan learned to cook in India.\"? Yes, no, or maybe? Maybe\n###\nElizabeth \"Long Liz\" Stride (n\u00e9e Gustafsdotter) (27 November 1843 \u2013 30 September 1888) is believed to be a victim of the notorious unidentified serial killer called Jack the Ripper, who killed and mutilated several women in the Whitechapel area of London from late August to early November 1888. Are we justified in saying that \"Elizabeth \"Long Liz\" Stride was born more than 150 years ago.\"? Yes, no, or maybe? Yes\n###\nVarun Sharma is an Indian actor who made his debut in Farhan Akhtar's 2013 film production \"Fukrey\", which was a surprise hit in Bollywood. Since his appearance in \"Fukrey\", he has appeared in other comedy films, such as \"Kis Kisko Pyaar Karoon\" and \"Dilwale\" etc Are we justified in saying that \"Varun Sharma is not currently acting\"? 
Yes, no, or maybe?", "doc_id": 647, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28326, 31283, 26267, 32276], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Guns of Diablo is a Metrocolor 1965 Western directed by Boris Sagal, starring Charles Bronson, Susan Oliver and Kurt Russell. Charles Bronson is a wagon scout (Linc Murdock), who runs into difficulties when he meets old flame Maria (Susan Oliver), now married to corrupt lawman Rance Macklin (Jan Merlin). Are we justified in saying that \"Guns of Diablo was a popular Western.\"? Yes, no, or maybe? Maybe\n###\nKdan Mobile Software is a privately owned application & software development company, whose headquarter is located in Tainan City (Taiwan) with branch offices in both Irvine (US) and Changsha City (China). Founded in 2009, the company focused on building mobile software applications and online cloud services that allow users to better leverage their productivity and creativity. Are we justified in saying that \"In 2009, mobile software was a booming business.\"? Yes, no, or maybe? Maybe\n###\nSigmoid colon volvulus, also known as sigmoid volvulus, is a common cause of bowel obstruction and constipation. It is common in Asia, India (7% of intestinal obstruction) and especially South India because of the high fibre diet. It is very common cause of large bowel obstruction in Peru and Bolivia due to high altitude. Are we justified in saying that \"Chewing food more thoroughly could help Bolivians avoid bowel problems.\"? Yes, no, or maybe? Maybe\n###\nThe Tiki Bar is Open was singer-songwriter John Hiatt's sixteenth album, released in 2001. It was his last album with Vanguard Records. Although they are uncredited, the album features backing band The Goners, the same cadre of friends who backed Hiatt in his 1988 release Slow Turning. It was coincidentally released on September 11, 2001. Are we justified in saying that \"The Goners didn't reform after September 11, 2001.\"? Yes, no, or maybe? Maybe\n###\nLucjan Karasiewicz (born July 10, 1979 in Tarnowskie G\u00f3ry) is a Polish politician. He was elected to Sejm on September 25, 2005 getting 6844 votes in 28 Cz\u0119stochowa, standing for Law and Justice. He joined Poland Comes First when that party split from Law and Justice in 2010. Are we justified in saying that \"Lucjan Karasiewicz is currently living\"? Yes, no, or maybe?", "doc_id": 751, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19669, 43806, 39291, 19492], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Greatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"Greatest Hits Volume 1 was not released in 1969\"? Yes, no, or maybe? Yes\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse. Are we justified in saying that \"Once Upon a Time premiered less than 100 years ago\"? Yes, no, or maybe? Yes\n###\nThe Cameroon Airlines Corporation, trading as Camair-Co, is an airline from Cameroon, serving as flag carrier of the country, a role which was previously filled by the now-defunct Cameroon Airlines. Camair-Co has its headquarters in the Immeuble La Rotonde in Douala, and operates out of Douala International Airport. Are we justified in saying that \"Cameroon has an airline.\"? Yes, no, or maybe? Yes\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race. Are we justified in saying that \"Tom\u00e1s Nistal Fern\u00e1ndez is a former road cyclist.\"? Yes, no, or maybe? Yes\n###\nLincoln is a town in Providence County, Rhode Island, United States. The population was 21,105 at the 2010 census. Lincoln is located in northeastern Rhode Island, north of Providence. Lincoln is part of the Providence metropoliton statistical area and the Greater Boston combined statistical area. Are we justified in saying that \"The population of Lincoln is over 21,105,\"? Yes, no, or maybe?", "doc_id": 14, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34492, 8843, 26909, 2918], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lost Souls is a 1992 horror novel by American writer Poppy Z. Brite, his first one. It is the only novel-length adventure of Brite's 'Steve and Ghost' characters, popularized in numerous short stories. The novel is an extended version of the short story \"The Seed of Lost Souls\". Are we justified in saying that \"\"The Seed of Lost Souls\" is the follow up novel to Lost Souls\"? Yes, no, or maybe? 
No\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Jenner is south of Stillwater Cove Regional Park.\"? Yes, no, or maybe? Yes\n###\nMiss Peregrine's Home for Peculiar Children is a contemporary fantasy debut novel by American author Ransom Riggs. The story is told through a combination of narrative and vernacular photographs from the personal archives of collectors listed by the author. Are we justified in saying that \"Miss Peregrine's Home for Peculiar Children is a fantasy story.\"? Yes, no, or maybe? Yes\n###\nPaolo Romano, also known as Paolo Tuccone and as Paolo di Mariano di Tuccio Taccone was an Italian early Renaissance sculptor and goldsmith. Giorgio Vasari in his \"Lives of the Most Excellent Painters, Sculptors, and Architects\" recounts that Paolo Romano was a modest man whose sculpture was far superior to that of his boastful contemporary Mino del Reame. Are we justified in saying that \"Paolo Romano was an Italian early Renaissance sculptor and goldsmith and painter.\"? Yes, no, or maybe? Maybe\n###\nThe North African ostrich or red-necked ostrich (\"Struthio camelus camelus\"), also known as the Barbary ostrich, is the nominate subspecies of the common ostrich from West and North Africa. It is the largest subspecies, making it the largest living bird. Are we justified in saying that \"The North African ostrich can also be found in West Africa\"? Yes, no, or maybe?", "doc_id": 911, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17672, 24478, 9067, 9989], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas. Are we justified in saying that \"A bombing had occurred in the area.\"? Yes, no, or maybe? Yes\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"Imagine Dragons are currently working on a new album.\"? Yes, no, or maybe? Maybe\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. 
It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central. Are we justified in saying that \"The Daily Show is hosted by Trevor Noah.\"? Yes, no, or maybe? Maybe\n###\nKazuhiro Wada (\u548c\u7530 \u4e00\u6d69 , \"Wada Kazuhiro\" , born June 19, 1972 in Gifu, Gifu, Japan) is a retired Japanese professional baseball player. He played mostly as an outfielder for the Chunichi Dragons and the Seibu Lions of the Nippon Professional Baseball league in a career spanning 18 years. Following retirement in 2015, he has become a color commentator for Dragons broadcasts for the NHK. Are we justified in saying that \"Kazuhiro Wada played in the Nippon Professional Basball league\"? Yes, no, or maybe? Yes\n###\n\"I'm Not the One\" is a song by the American rock band The Cars, from their fourth album, \"Shake It Up\". It features Ric Ocasek on lead vocals, Benjamin Orr singing the 'You Know Why' phrase, with the whole group repeating \"going round and round\" as backing vocals throughout the song. Are we justified in saying that \"Elliot easton sang backup vocals\"? Yes, no, or maybe?", "doc_id": 818, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1048, 9726, 14655, 3091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Giovanni Ferrero (born 21 September 1964) is an Italian businessman. He assumed the leadership of the confectionery company Ferrero SpA after the death of his brother Pietro Ferrero in 2011. He has a net worth of $24.2 billion as of May 2016, according to Bloomberg. Are we justified in saying that \"Giovanni Ferrero currently lives in Italy.\"? Yes, no, or maybe? Maybe\n###\nThe Great Dictator is a 1940 American political satire comedy-drama film written, directed, produced, scored by and starring British comedian Charlie Chaplin, following the tradition of many of his other films. Having been the only Hollywood filmmaker to continue to make silent films well into the period of sound films, this was Chaplin's first true sound film. Are we justified in saying that \"Chaplin himself has dialogue in The Great Dictator.\"? Yes, no, or maybe? Maybe\n###\n\"Superman's Dead\" is a song by Canadian alternative rock group Our Lady Peace. It was released in December 1996 as the lead single from their second album \"Clumsy\". This has become one of Our Lady Peace's most popular songs in both Canada and the U.S., as well as many other parts of the world. Are we justified in saying that \"Superman's Dead was the third single. \"? Yes, no, or maybe? No\n###\nDuke is a fictional character from the \"\" toyline, comic books, and cartoon series. He is the G.I. Joe Team's First Sergeant, and debuted in 1983. The character is also featured in both the \"\" animated series and comic books. Channing Tatum portrays Duke in the 2009 live-action film, \"\", and the 2013 sequel \"\". Are we justified in saying that \"Channing Tatum plays a fictional character in the 2009 G.I Joe live-action film.\"? Yes, no, or maybe? 
Yes\n###\nLichfield Cathedral is situated in Lichfield, Staffordshire, England. It is the only medieval English cathedral with three spires. The Diocese of Lichfield covers all of Staffordshire, much of Shropshire and part of the Black Country and West Midlands. The 99th and current Bishop of Lichfield is Michael Ipgrave who was appointed on 10 June 2016. Are we justified in saying that \"Michael Ipgrave was appointed in the sixth month of 2016\"? Yes, no, or maybe?", "doc_id": 725, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17976, 22421, 42701, 36938], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Swift Rivers is a children's historical novel by Cornelia Meigs. Set initially in 1835 in Minnesota, it is a story of the early days of the logging industry, when logs were floated down the Mississippi to St. Louis. The novel, illustrated by Forrest W. Orr, was first published in 1931 and was a Newbery Honor recipient in 1933. Are we justified in saying that \"Swift Rivers didn't exist before 1930\"? Yes, no, or maybe? Yes\n###\nHundreds of ancient stone religious monuments lie on the island of Java. Known as \"candi\" in Indonesian, they date from the early classical period of Javanese civilisation, beginning in the first part of the 8th century CE and ending after 900 CE. The majority were built between 780 CE and 860 CE, even though the civilisation that created them existed for many centuries. Are we justified in saying that \"Hundreds of ancient stone religious monuments lie on the island of Java. Too many were built.\"? Yes, no, or maybe? Maybe\n###\nThe Chingford branch line is a railway line between Clapton Junction (just west of Clapton station) and Chingford station. Services currently operate between Liverpool Street station and Chingford. The branch is currently part of the Lea Valley Lines network. Are we justified in saying that \"The Chingford branch line is for women only\"? Yes, no, or maybe? Maybe\n###\nMelinda Heather \"Mindy\" Cohn (born May 20, 1966) is an American actress, voice actress, comedian and singer. She is known for her role as Natalie Green, the student of Edna Garrett (played by Charlotte Rae) in the long-running sitcom \"The Facts of Life\", and for being the voice of Velma Dinkley in the \"Scooby-Doo\" franchise from 2002 to 2015. Are we justified in saying that \"Melinda Heather \"Mindy\" Cohn is a silly actress \"? Yes, no, or maybe? Maybe\n###\nLaudelino Jos\u00e9 \"Lino\" de Barros (born June 29, 1975 in Bonito, Mato Grosso do Sul) is a Brazilian boxer, who represented his native country in the light heavyweight division at the 2000 Summer Olympics. There he was eliminated in the first round by Australia's Danny Green. A year earlier, at the 1999 Pan American Games, Barros won the silver medal in his weight division. Are we justified in saying that \"Laudelino Jos\u00e9 \"Lino\" de Barros was born in the 7th decade of the 20th century\"? 
Yes, no, or maybe?", "doc_id": 19, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6422, 27295, 43796, 44957], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Prom Night IV: Deliver Us from Evil is a 1992 Canadian slasher horror film directed by Clay Borris and starring Nicole de Boer and J.H. Wyman. The film follows a deranged Catholic priest who begins murdering teenagers on their prom night. It is the fourth and final film in the \"Prom Night\" franchise. Like the previous , it was released briefly in theaters before later being released to video. Are we justified in saying that \"1992 was the last year in which a movie from the \"Prom Night\" film franchise was released \"? Yes, no, or maybe? Yes\n###\nDana Berliner is Litigation Director at the Institute for Justice, a public interest law firm in Arlington, Virginia founded in 1991 by Chip Mellor and Clint Bolick. She was co-lead counsel for Susette Kelo in the landmark United States Supreme Court case \"Kelo v. City of New London\". Are we justified in saying that \"Dana Berliner was co-lead counsel only one time.\"? Yes, no, or maybe? Maybe\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey. Are we justified in saying that \"Ballymena United Football Club only plays semi-professionally\"? Yes, no, or maybe? Yes\n###\nCynthia Mort (born June 18, 1956) is an American director, screenwriter, and producer. Mort has worked primarily in television since beginning her career in 1994, writing for the sitcom \"Roseanne\". Her notable works include the HBO series \"Tell Me You Love Me\" as a creator and executive producer, the revenge film \"The Brave One\" (2007) as a screenwriter, and the biopic \"Nina\" (2016) as a director. Are we justified in saying that \"Mort worked on on films just as much as she did television shows.\"? Yes, no, or maybe? No\n###\nContra Conspiracy (also known as Contra Control) is a 1988 action film written and directed by Thomas Dewier which is now distributed by Troma Entertainment. The film was produced by City Lights. The plot follows a Hollywood film crew shooting a movie in the Mojave Desert, only to be disrupted by a group of terrorists. Are we justified in saying that \"There were 2 film crews in this movie. One doing the filming and one doing the acting.\"? Yes, no, or maybe?", "doc_id": 71, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21026, 6725, 10784, 17611], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Samuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005. Are we justified in saying that \"Samuel Eto'o Fils won the African Player of the Year award a record three times.\"? Yes, no, or maybe? No\n###\nThe Real Howard Spitz is a 1998 family comedy film directed by Vadim Jean, produced by Paul Brooks and written by Jurgen Wolff. Starring Kelsey Grammer, Amanda Donohoe and Genevieve Tessier, it is a Canadian and U.K co-production. A failed detective writer, Howard Spitz has hit rock bottom until an 8-year-old girl helps him write children's books. Are we justified in saying that \"The Real Howard Spitz is a horror film.\"? Yes, no, or maybe? No\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area. Are we justified in saying that \"El Paso County is home to the Pikes Peak Center.\"? Yes, no, or maybe? Yes\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th. Are we justified in saying that \"Vasili Vyacheslavovich Blagov currently lives in London.\"? Yes, no, or maybe? Maybe\n###\nAleksandr Danilovich Aleksandrov (Russian: \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440 \u0414\u0430\u043d\u0438\u0301\u043b\u043e\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 , alternative transliterations: \"Alexandr\" or \"Alexander\" (first name), and \"Alexandrov\" (last name)) (August 4, 1912 \u2013 July 27, 1999), was a Soviet/Russian mathematician, physicist, philosopher and mountaineer. Are we justified in saying that \"Aleksandr Danilovich Aleksandrov had more than 4 jobs.\"? Yes, no, or maybe?", "doc_id": 176, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31428, 30175, 41163, 25478], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Staunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. 
It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"The mall has a store that starts with a B, and a J\"? Yes, no, or maybe? Yes\n###\nThe Bigger Picture is a 2014 British animated short film directed by Daisy Jacobs. It has been nominated for the Academy Award for Best Animated Short Film at the 87th Academy Awards. It won the BAFTA Award for Best Short Animation at the 68th British Academy Film Awards. Are we justified in saying that \"The Bigger Picture has the voice of Carter.\"? Yes, no, or maybe? Maybe\n###\nUS Organization, or Organization Us, is a Black nationalist group in the United States founded in 1965. It was established as a community organization by Maulana Karenga. It was a complementary organization of the Black Panther Party in California. One of the early slogans was, \"Wherever US is, We are.\" US stands for us Black people vs 'them' the oppressors. Are we justified in saying that \"Organization Us is founded in California\"? Yes, no, or maybe? Maybe\n###\nGreivis Josu\u00e9 V\u00e1squez Rodr\u00edguez (born January 16, 1987) is a Venezuelan professional basketball player who last played for the Brooklyn Nets of the National Basketball Association (NBA). He was drafted in 2010 after a U.S. college career with the University of Maryland men's basketball team. V\u00e1squez finished second on the Terrapins' all-time scoring list, with 2,171 career points. Are we justified in saying that \"Greivis Josu\u00e9 V\u00e1squez Rodr\u00edguez was born over 10 years ago\"? Yes, no, or maybe? Yes\n###\nThe University of Florida Board of Trustees is the governing body of the University of Florida, the Flagship University for the State University System of Florida. The University is located in Gainesville, Florida, United States. As of September 1, 2011, the Board includes thirteen members. The current Chair of the Board is Carlos Alfonso. Are we justified in saying that \"The University of Florida Board of Trustees currently has 13 members.\"? Yes, no, or maybe?", "doc_id": 459, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24431, 22466, 3725, 28705], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Boon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle. Are we justified in saying that \"Boon Brewery produces Faro Beer\"? Yes, no, or maybe? Yes\n###\nFS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. 
Kozani FC presently plays at the Kozani Stadium in Kozani. Are we justified in saying that \"FS Kozani is a highly paid football team in Greece\"? Yes, no, or maybe? Maybe\n###\nMike Bossy the Scoring Machine is a pinball machine manufactured by Game Plan, Inc. as a prototype in 1982, featuring New York Islanders hockey star Mike Bossy. There was only one machine produced. Designed by Ed Cebula, it was the only professionally endorsed pinball machine produced by Game Plan. Are we justified in saying that \"Game Plan, Inc. is no longer in business. \"? Yes, no, or maybe? Maybe\n###\nEnglandsfarere (English: We Leave for England ) is a 1946 Norwegian war film directed by Toralf Sand\u00f8, starring Knut Wigert and J\u00f8rn Ording. The film follows the Norwegian resistance fighters Harald (Wigert) and Arild (Ording) in their flight from the Gestapo. Are we justified in saying that \"Englandsfarere is a film from the 20th century. \"? Yes, no, or maybe? Yes\n###\nThe Malloreon is a five-part fantasy book series written by David Eddings, which follows \"The Belgariad\". The Malloreon is set in the same world as The Belgariad, but expands on several aspects of the setting, especially the eastern continent of Mallorea. Are we justified in saying that \"The Belgariad, although published before the Malloreon series, was in fact written after it.\"? Yes, no, or maybe?", "doc_id": 516, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1081, 20348, 14206, 13907], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Bosch\" is an American police procedural television series produced by Amazon Studios. It stars Titus Welliver as Los Angeles Police detective Harry Bosch. The show, developed for Amazon by Eric Overmyer, takes its inspiration from three of Michael Connelly\u2019s novels: \"City of Bones\", \"Echo Park\", and \"The Concrete Blonde\". Are we justified in saying that \"The Concrete Blonde was written by Michael Connelly.\"? Yes, no, or maybe? Yes\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"Rudbeckia hirta is a very smelly plant\"? Yes, no, or maybe? Maybe\n###\nABC Western Victoria (call sign: 3WV) is an ABC Local Radio station in the Wimmera region of Victoria, Australia. Its primary transmitter is located in Horsham which broadcasts on 594\u00a0kHz AM. The station is clearly receivable over most of western Victoria, well beyond the Wimmera. In the past, it was designed to serve this broader region. Are we justified in saying that \"ABC has a number of local stations.\"? Yes, no, or maybe? Maybe\n###\nThe Green Goblin's Last Stand is a 1992 fan film by Dan Poole, based on the comic book story \"The Night Gwen Stacy Died\", published by Marvel Comics in \"The Amazing Spider-Man\" #121\u2013122. 
Poole is the director, producer, creative editor, screenwriter, and star of the film. The film and its attendant documentary received showings and accolades at several small film festivals. Are we justified in saying that \"Dan Poole hates comic books.\"? Yes, no, or maybe? No\n###\nSt. Mark's Coptic Orthodox Cathedral is a Coptic church located in the Abbassia District in Cairo, Egypt. The cathedral is the Seat of the Coptic Orthodox Pope. It was built during the time when Pope Cyril VI of Alexandria was Pope of the Coptic Orthodox Church, and was inaugurated by him in 1969. Are we justified in saying that \"St. Mark's Coptic Orthodox Cathedral was created by Pope Cyril.\"? Yes, no, or maybe?", "doc_id": 586, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3333, 37417, 23823, 27068], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Don Sinclair Davis, PhD (August 4, 1942 \u2013 June 29, 2008) was an American character actor best-known for playing General Hammond in the television series \"Stargate SG-1\" (1997\u20132007), and earlier for playing Major Garland Briggs on the television series \"Twin Peaks\" (1990\u20131991). He was also a theater professor, painter, and United States Army captain. Are we justified in saying that \"Don Sinclair Davis was on Twin Peaks before he turned 50.\"? Yes, no, or maybe? Yes\n###\nThe Legendary Majik Mijits is an album that was recorded by Steve Marriott and Ronnie Lane when they reformed under the name of \"Majik Mijits\" in 1981 and gave a one-off concert at the Bridgehouse pub in East London. The lineup included Jim Leverton, Mick Green, Mick Weaver, Dave Hynes and Sam Brown. Are we justified in saying that \"The band had three members.\"? Yes, no, or maybe? No\n###\nThe New Ulm Oil Company Service Station is a historic gas station in New Ulm, Minnesota. The private, commercial structure was placed on the National Register of Historic Places (NRHP) on December 31, 1979. Its strong, fanciful visual images exemplify independent gas station designs of the 1920s. Are we justified in saying that \"The private, commercial structure was placed on the National Register of Historic Places more than 1970 days ago.\"? Yes, no, or maybe? Yes\n###\nBarry Redden (born July 21, 1960) is a former American football running back who played for the Los Angeles Rams, the San Diego Chargers, and the Cleveland Browns of the National Football League (NFL). He spent much of his career playing in the shadow of Pro Football Hall of Fame running back Eric Dickerson. Are we justified in saying that \" Barry Redden is a very clever man\"? Yes, no, or maybe? 
Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"the Six-Day war took place near Christmas\"? Yes, no, or maybe?", "doc_id": 344, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21322, 29566, 7680, 34539], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League. Are we justified in saying that \"Marques Ackerman donated to charity in 2017.\"? Yes, no, or maybe? Maybe\n###\n\"We're an American Band\" (from the album of the same name) became Grand Funk Railroad's first #1 single on September 29, 1973, Mark Farner's 25th birthday. Written by Don Brewer and produced by Todd Rundgren, its huge chart success broadened Grand Funk's appeal. It was sung by Brewer rather than Farner, who usually took lead vocals. Are we justified in saying that \"We're an American Band has no As.\"? Yes, no, or maybe? No\n###\nThe European Democrat Union (EDU) is one of the three European wings of the International Democrat Union, along with the European People's Party (EPP) and the Alliance of European Conservatives and Reformists (AECR). Its members include Christian democratic, liberal conservative, and conservative political parties. It is only a nominal sub-entity of the IDU, since it ceased its activities in 2002. Are we justified in saying that \"The EDU, the EPP, and the AECR all have acronyms that contain the letter E. \"? Yes, no, or maybe? Yes\n###\nLiberal Citizens Action (in Spanish: \"Acci\u00f3n Ciudadana Liberal\") was a political party in Spain at the time of the transition to democracy. ACL emerged from the Liberal Federation (\"Federaci\u00f3n Liberal\"), an alliance of five parties, in 1977. The president of the party was Jos\u00e9 Mar\u00eda de Areilza, Minister of Foreign Affairs 1975-1976. Areilza had left Adolfo Suarez's Democratic Center Union (UCD). Are we justified in saying that \"The UCD was the acronym for the Democratic Center Union. \"? Yes, no, or maybe? Yes\n###\nBabar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\". 
Are we justified in saying that \"The film released in a country that starts with a C\"? Yes, no, or maybe?", "doc_id": 144, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30238, 26546, 32292, 14399], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Something Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date. Are we justified in saying that \"I created Something Like Human that made the Hemorrhage of the songs because they were popular on the record label\"? Yes, no, or maybe? Maybe\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Smithereens was released om Tilbrook's own Quixotic label.\"? Yes, no, or maybe? Yes\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98. Are we justified in saying that \"Death Race is pie\"? Yes, no, or maybe? No\n###\nBen Barzman (October 12, 1910 \u2013 December 15, 1989) was a Canadian journalist, screenwriter, and novelist, blacklisted during the McCarthy Era and best known for his screenplays for the films \"Back to Bataan\" (1945), \"El Cid\" (1961), and \"The Blue Max\" (1966). Are we justified in saying that \"Ben Barzman was born less than 10000 days ago.\"? Yes, no, or maybe? No\n###\nLouis S. Peterson (June 17, 1922 \u2013 April 27, 1998) was a playwright, actor, screenwriter, and professor. He was an American playwright and the first African-American playwright to have a dramatic play produced on Broadway. He was also one of the first African-American writers to be nominated for an Emmy Award. Are we justified in saying that \"Louis S. Peterson was an adult when he wrote his first play.\"? Yes, no, or maybe?", "doc_id": 3, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21423, 25794, 44823, 2626], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jerick Deshun McKinnon (born May 3, 1992) is an American football running back for the Minnesota Vikings of the National Football League (NFL). He was drafted by the Vikings in the third round of the 2014 NFL Draft. He played college football at Georgia Southern. Are we justified in saying that \"McKinnon has only played ball for Georgia Southern and the Minnesota Vikings.\"? Yes, no, or maybe? Maybe\n###\nFrank Vincent Ferrante (born April 26, 1963) is an American stage actor, comedian and director known for his stage portrayals of legendary American comedian Groucho Marx in the Arthur Marx/Robert Fisher play \"\" and in \"An Evening With Groucho\", which tours internationally. Are we justified in saying that \"Sometimes Frank feels haunted by his past.\"? Yes, no, or maybe? Maybe\n###\nThe Appalachian IMG Sports Network was founded in 2007 as Appalachian ISP Sports Network. It is a group of 17 radio stations that carry Appalachian State University sports. The flagship station is WKBC-FM 97.3 in North Wilkesboro, North Carolina. When ISP Sports was bought by IMG Worldwide subsidiary, IMG College, in 2010, the network switched to its current name. Are we justified in saying that \"The Appalachian ISP Sports Network had its name changed 5 years after it was founded.\"? Yes, no, or maybe? No\n###\nSamson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation. Are we justified in saying that \"The first performance was on 4 October 1877\"? Yes, no, or maybe? No\n###\nLittle Casterton is a small village and civil parish in Rutland, England. The population of the civil parish at the 2001 census was 148, increasing to 218 at the 2011 census. It is about two miles (3 km) north of Stamford on a minor road that runs to the south of the River Gwash between Great Casterton and Ryhall. Are we justified in saying that \"Little Casterton is near Rutland, England\"? Yes, no, or maybe?", "doc_id": 831, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39409, 31651, 20690, 42227], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "102 Squadron \"\"Panchos\"\" (\"Esquadra 102\") was an elementary flight training squadron of the Portuguese Air Force disbanded in 1992. Formed in 1962, the squadron administered air force training and performed at air shows throughout Portugal. Between 1963 and its disbandment in 1992, the squadron lost nine pilots. Are we justified in saying that \"Panchos was a flight training squadron in the Portuguese Air Force.\"? Yes, no, or maybe? 
Yes\n###\nJohn Gilbert (born John Cecil Pringle; July 10, 1899 \u2013 January 9, 1936) was an American actor, screenwriter and director. He rose to fame during the silent film era and became a popular leading man known as \"The Great Lover\". At the height of his career, Gilbert rivaled Rudolph Valentino, another silent film era leading man, as a box office draw. Are we justified in saying that \"He acted in mostly scifi movies\"? Yes, no, or maybe? Maybe\n###\n\"Kiss and Tell\" is a song by Bryan Ferry, the erstwhile lead vocalist for Roxy Music. It was released as the second single from his seventh album \"B\u00eate Noire\" in early 1988, being Ferry's twenty-sixth single. The song peaked at number 41 on the UK Singles Chart and at number 31 on the US Billboard 100. It also appears in the film \"Bright Lights, Big City\", adapted from the Jay McInerney novel. Are we justified in saying that \"This song charted on the US billboard at 30\"? Yes, no, or maybe? No\n###\nRepublic New York Corporation was the holding company for the Republic National Bank of New York and the Safra Republic Bank. The company was controlled by billionaire Edmond Safra, who was killed in a fire in his Monte Carlo penthouse apartment by his nurse Ted Maher. Republic New York Corporation was sold shortly after its chairman's death to HSBC Bank USA, the US subsidiary of HSBC of the UK. Are we justified in saying that \"Republic New Yorke was sold to HSBC\"? Yes, no, or maybe? Yes\n###\nThomas Tull (born 1970) is an American businessman and film producer. He is the former chairman of the Board and chief executive officer (CEO) of Legendary Entertainment. His firm has produced and/or financed several major motion pictures, including \"The Dark Knight Trilogy\", \"The Hangover\" and its sequels, \"300\", \"Man of Steel\" and others. Are we justified in saying that \"Thomas Tull has directed 78 films.\"? Yes, no, or maybe?", "doc_id": 821, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41673, 16007, 26584, 4815], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brennan Hesser (born 1980) is an American television actress, best known for co-starring in Tori Spelling's VH1 sitcom, \"So NoTORIous\". She also starred in Fox's drama, \"Jonny Zero\". She also guest starred in an episode of the CBS television show, \"The Guardian\". As a youngster, she attended the prestigious Interlochen Arts Camp in Northern Michigan. Are we justified in saying that \"She has starred in shows on Fox, CBS, VH1 and ABC.\"? Yes, no, or maybe? No\n###\nBallymena United Football Club is a semi-professional football club from Northern Ireland. Based in Ballymena, County Antrim, the team competes in the NIFL Premiership and plays home matches at the Ballymena Showgrounds.The club is managed by iconic Irish League player/manager David Jeffrey. Are we justified in saying that \"Ballymena United Football Club pays its players more than the average salary for football players\"? Yes, no, or maybe? Maybe\n###\nGirilal Jain (1924 \u2013 19 July 1993), was an Indian journalist. 
He served as the editor of The Times of India from 1978 till 1988. He was sympathetic to Hindu nationalism and authored books on the subject, the best known of which, \"The Hindu Phenomenon\", was published posthumously. The Government of India awarded him the civilian honour of the Padma Bhushan in 1989. Are we justified in saying that \"Girilal Jain is Indian.\"? Yes, no, or maybe? Yes\n###\n\"Paradise\" is the only single release from Styx's 1997 live double album \"Return to Paradise\". The song was originally written and recorded by Dennis DeYoung for his musical \"The Hunchback of Notre Dame\". The song was re-recorded by Styx for inclusion as one of three new studio tracks on the live album. Are we justified in saying that \"Paradise was released in 1991\"? Yes, no, or maybe? No\n###\nFrank Vincent Ferrante (born April 26, 1963) is an American stage actor, comedian and director known for his stage portrayals of legendary American comedian Groucho Marx in the Arthur Marx/Robert Fisher play \"\" and in \"An Evening With Groucho\", which tours internationally. Are we justified in saying that \"Frank Vincent Ferrante died young\"? Yes, no, or maybe?", "doc_id": 407, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29947, 42129, 20248, 19573], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Murder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017. Are we justified in saying that \"Murder of the Universe has been covered by slayer\"? Yes, no, or maybe? Maybe\n###\nThe March of Ancona (Italian: \"Marca Anconitana\" ) (also Anconetana) was a frontier march centred on the city of Ancona and, then, Macerata in the Middle Ages. Its name is preserved as an Italian region today, the Marches, and it corresponds to almost the entire modern region and not just the Province of Ancona. Are we justified in saying that \"Italy has many cities with names as ancient as the first world war.\"? Yes, no, or maybe? Maybe\n###\nMarques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League. Are we justified in saying that \"Marques Ackerman is a world class footballer.\"? Yes, no, or maybe? No\n###\nThe National Rehabilitation Hospital (NRH) in Dun Laoghaire, Dublin, is an Irish publicly funded hospital that provides rehabilitation treatment for patients who have a physical or cognitive disability due to illness or injury. Although it is funded by the state the hospital is owned by a Catholic religious order, the Sisters of Mercy. Are we justified in saying that \"The National Rehabilitation Hospital is a very bad hospital\"? Yes, no, or maybe? 
Maybe\n###\nThe Sandlot is a 1993 American coming-of-age baseball film co-written and directed by David M. Evans, which tells the story of a group of young baseball players during the summer of 1962. It stars Tom Guiry, Mike Vitar, Karen Allen, Denis Leary and James Earl Jones. The filming locations were in Glendale, Midvale, Salt Lake City, and Ogden, Utah. Are we justified in saying that \"The Sandlot takes place in Odgen, Utah. \"? Yes, no, or maybe?", "doc_id": 922, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40402, 174, 12404, 27904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cherry, Harry & Raquel! is a 1970 softcore exploitation film produced and directed by American film director Russ Meyer. Following the success of \"Vixen!\" (1968), the film is notable for the first appearance of actor (and Meyer regular) Charles Napier playing Harry Thompson, a California border sheriff and marijuana smuggler who makes a reappearance in 1975's \"Supervixens\". Are we justified in saying that \"The film \"Vixen!\" was the first film that Charles Napir appeared in.\"? Yes, no, or maybe? No\n###\nHis Excellency: George Washington is a 2004 biography of the first President of the United States, General George Washington. It was written by Joseph Ellis, a professor of History at Mount Holyoke College, who specializes in the founding fathers and the revolutionary and federalist periods. Are we justified in saying that \"Mount Holyoke college has a well respected history department.\"? Yes, no, or maybe? Maybe\n###\nSophie Lang Goes West is a 1937 American crime film directed by Charles Reisner and written by Frederick Irving Anderson, Doris Anderson, Brian Marlow and Robert Wyler. The film stars Gertrude Michael, Lee Bowman, Sandra Storme, Buster Crabbe, Barlowe Borland, C. Henry Gordon and Jed Prouty. The film was released on September 10, 1937, by Paramount Pictures. Are we justified in saying that \"Charles Reisner directed the movie Sophie Lang Goes West.\"? Yes, no, or maybe? Yes\n###\nBela George Lugosi (born January 5, 1938 in Los Angeles, California), also known as Bela Lugosi Jr., is an American attorney and the son of actor B\u00e9la Lugosi. His legal actions in \"Lugosi v. Universal Pictures\" led to the creation of the California Celebrities Rights Act. Are we justified in saying that \"The child of B\u00e9la Lugosi participated in the legal actions that led to California Celebrities Rights Act.\"? Yes, no, or maybe? Yes\n###\nLee Scott Wolosky (born July 17, 1968) is the former U.S. Special Envoy for Guantanamo Closure. He served under the last three U.S. Presidents in significant national security positions, and was on leave as a Partner at Boies, Schiller & Flexner LLP. On July 14 2016, President Obama accorded Wolosky the personal rank of Ambassador. Are we justified in saying that \"Lee Scott Wolosky was born the year Martin Luther King Jr. was assassinated.\"? 
Yes, no, or maybe?", "doc_id": 233, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34292, 40212, 26541, 6722], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Drivin' Around Song\" is a song recorded by American country rap singer Colt Ford and country music singer Jason Aldean. It is the third single from his fourth studio album, \"Declaration of Independence\". The song was written by Chris Tompkins and Craig Wiseman. Are we justified in saying that \"He did at least 4 studio albums\"? Yes, no, or maybe? Yes\n###\nCleethorpes Town Football Club is a football club based in Grimsby in North East Lincolnshire, England. The club are currently members of the Northern Premier League Division One South and play at Grimsby Borough's Bradley Football Development Centre. Are we justified in saying that \"The Cleethorpes play in England which is near Scotland.\"? Yes, no, or maybe? Maybe\n###\nHard Landing is a 2004 thriller novel by British author Stephen Leather. Published in 2004 by Hodder & Stoughton, it is the first book in the Dan \u2018Spider\u2019 Shepherd series. \"Hard Landing\" is an international bestseller and is available in ebook and paperback. Are we justified in saying that \" Hard Landing is the first book in the Dan 'Spider' Shepherd series that was released before the second decade of the twenty-first century.\"? Yes, no, or maybe? Yes\n###\nThe Hanover Hound is a breed of dog sometimes referred to as a Hanoverian Hound. It is a hunting and tracking dog descended from bloodhounds of medieval times. It was first introduced into France in the 1980s and is still a very rare breed. It was cross-bred with the Bavarian Hound which gave rise to the Bavarian Mountain Hound. Are we justified in saying that \"The Bavarian Mountain Hound will decrease in popularity in 2019 because the Hanover Mountain Hound and Bavarian Hound are purer breeds.\"? Yes, no, or maybe? Maybe\n###\nMads Wiel Nygaard's Endowment is an annually awarded literary prize from the publishing house Aschehoug. The prize is a recognition of superior literary work. The publisher's editorial management makes the award based on their collective judgement of merit. Applications are not accepted. Are we justified in saying that \"There is a chance that applications are accepted.\"? Yes, no, or maybe?", "doc_id": 652, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44102, 23383, 14277, 10767], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. 
It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places. Are we justified in saying that \"The William Martin Armistead House wasn't considered Historic until 2009.\"? Yes, no, or maybe? Maybe\n###\nThe 2012 Sun Life Financial Players' Championship was held from April 17 to 22 at the Consolidated Credit Union Place in Summerside, Prince Edward Island. It was the last Grand Slam event of the 2011\u201312 curling season and the twentieth time the tournament has been held. The purse is CAD$100,000 for both the men's and women's tournaments, and the winner of each tournament will receive CAD$18,000. Are we justified in saying that \"The 2013 Sun Life Financial Players' Championship had a purse of CAD$120,000 for both the men's and women's tournaments.\n\"? Yes, no, or maybe? Maybe\n###\nPeter Billingsley (born April 16, 1971), also known as Peter Michaelsen and Peter Billingsley-Michaelsen, is an American actor, director, and producer, known for his role as Ralphie in the 1983 movie \"A Christmas Story\" and as \"Messy Marvin\" in the Hershey's Chocolate Syrup commercials during the 1970s. He began his career as an infant in television commercials. Are we justified in saying that \"Peter Billingsley never acted in a film\"? Yes, no, or maybe? No\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg. Are we justified in saying that \"Steven Spielberg's first film to direct was Duel. \"? Yes, no, or maybe? Yes\n###\nThe Wire is an American crime drama television series set and produced in Baltimore, Maryland. Created and primarily written by author and former police reporter David Simon, the series was broadcast by the cable network HBO in the United States. \"The Wire\" premiered on June 2, 2002, and ended on March 9, 2008, comprising 60 episodes over five seasons. Are we justified in saying that \"There was a long lapse of time during that range when episodes were not released.\"? Yes, no, or maybe?", "doc_id": 453, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29080, 6263, 41870, 23943], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Once Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse. Are we justified in saying that \"Once Upon a Time is a drama set in New England.\"? Yes, no, or maybe? Yes\n###\nEugene Gearty is an American sound engineer. He was nominated for an Academy Award in the category Best Sound for the film \"Gangs of New York\". He has worked on over 80 films since 1983. 
At the 84th Academy Awards, Gearty won an Oscar for Best Sound Editing for his work on Martin Scorsese's \"Hugo\". He also won Emmy Award for Boardwalk Empire. Are we justified in saying that \"He also won Emmy Award for Boardwalk Empire. He was also known for his work as an assistant director.\"? Yes, no, or maybe? Maybe\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"The black eyed Susan has been found in every province and state in North America\"? Yes, no, or maybe? No\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"The Beatles Greatest Hits Volume 1 was first exclusive to only 2 countries.\"? Yes, no, or maybe? Yes\n###\nThe 2017 Congolese police decapitation attacks occurred on 23 March 2017 in the DR Congo. About 40 police officers were ambushed then decapitated. Six police officers were released. All of the surviving police spoke the Tshiluba language. The Kamwina Nsapu terrorist group attacked the police convoy. Are we justified in saying that \"doctor congo was a the location of the 2017 Congolese police decapitation attacks\"? Yes, no, or maybe?", "doc_id": 659, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32497, 40681, 19123, 24153], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Southpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company. Are we justified in saying that \"Jake Gyllenhaal has never played a boxer prior to appearing in the 2015 sports dram film Southpaw.\"? Yes, no, or maybe? Maybe\n###\nThe Hill Country Film Festival is a yearly film festival in Fredericksburg, Texas, United States. It was established in 2010. The executive director is Chad Matthews, and it is presented by the Hill Country Film Society, who hold free screenings at the festival and, afterward, monthly. In 2013, \"Texas Monthly\" selected it as a \"quirky, discerning\" pick. Are we justified in saying that \"The Hill Country Film Festival is a yearly film festival in Dallas Texas, United States\"? Yes, no, or maybe? No\n###\nHipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. 
It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011. Are we justified in saying that \"Hipmunk is the best travel company in California.\"? Yes, no, or maybe? Maybe\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"Rudbeckia hirta tastes bitter.\"? Yes, no, or maybe? Maybe\n###\nVincent Edward \"Bo\" Jackson (born November 30, 1962) is a former baseball and American football player. He is one of the few athletes to be named an All-Star in two major sports, and the only one to do so in both baseball and football. He is widely considered one of the greatest athletes of all time. Are we justified in saying that \"Bo Jackson was called Vincent when he played baseball.\"? Yes, no, or maybe?", "doc_id": 960, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36266, 14879, 26450, 16071], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harold E. Ennes was a broadcasting pioneer who authored many textbooks for broadcast and broadcast-related communications training and was a member of the Indianapolis chapter of the Society of Broadcast Engineers. He was a member of SBE's national Certification Committee and made many contributions to the early development of the SBE Certification Program. Are we justified in saying that \"Harold E. Ennes was a broadcasting pioneer who authored and lectured many textbooks for broadcast and broadcast-related communications training.\"? Yes, no, or maybe? Maybe\n###\nThe Friant-Kern Canal is a 152 mi Central Valley Project aqueduct managed by the United States Bureau of Reclamation in Central California to convey water to augment irrigation capacity in Fresno, Tulare, and Kern counties. Construction began in 1949 and the canal was completed in 1951, at a cost of $60.8 million. Are we justified in saying that \"The Friant-Kern Canal is home to marine species native to California. \"? Yes, no, or maybe? Maybe\n###\nSongbook is an acoustic live album by American musician and Soundgarden vocalist Chris Cornell, released on November 21, 2011. The live album features songs recorded during Cornell's Songbook Tour, an acoustic solo tour which took place during March\u2013May 2011 in the US, and is his first live album as a solo artist. Are we justified in saying that \"Songbook is a bad acoustic live album\"? Yes, no, or maybe? Maybe\n###\nThe 1982 Bavarian Tennis Championships was a men's Grand Prix tennis circuit tournament held in Munich, West Germany which was played on outdoor clay courts. It was the 66th edition of the tournament and was held form 17 May through 23 May 1982. 
Gene Mayer won the singles title. Are we justified in saying that \"The 1982 Bavarian Tennis Championships was a women's match\"? Yes, no, or maybe? No\n###\nHarbour Place Shopping Centre (Irish: \"An Chuain Pl\u00e1s Ionad Siopad\u00f3ireachta\" ) is a shopping centre located in Mullingar, Ireland. The centre is anchored by Dunnes Stores, and it is overall the largest store in the shopping centre. It is one of the most well-known shopping centres in Mullingar, and one of the busiest in the town. Are we justified in saying that \"The largest store in Harbour Place is located in the centre of the shopping centre\"? Yes, no, or maybe?", "doc_id": 542, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37257, 32920, 45440, 11851], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Pedro Nuno Gon\u00e7alves Concei\u00e7\u00e3o Silva (born January 21, 1977) is a Portuguese former swimmer, who specialized in sprint freestyle events. He is a two-time Olympian (2000 and 2004) and a former Portuguese record holder in the 50 m freestyle (22.86). Silva is a resident athlete for Sport Alg\u00e9s e Dafundo, and is trained by his long-time coach, director, and mentor M\u00e1rio Madeira. Are we justified in saying that \"Pedro Nuno Gon\u00e7alves Concei\u00e7\u00e3o Silva is a small child\"? Yes, no, or maybe? No\n###\nChristopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio. Are we justified in saying that \"Lawrence has worked for the BBC\"? Yes, no, or maybe? Maybe\n###\nLeonard Pilkington (1527\u20131599) was an English academic and clergyman. A Marian exile, he became Regius Professor of Divinity at Cambridge and Master of St John's College, Cambridge at the start of the reign of Elizabeth I. In his subsequent church career, he followed the way opened when his brother James Pilkington became Bishop of Durham. Are we justified in saying that \"Leonard Pilkington was 45 years old when he passed away\"? Yes, no, or maybe? No\n###\nKhan Kluay 2 is a three-dimensional animated movie from Thailand, directed by Taweelap Srivuthivong and released in 2009. It is the sequel to \"Khan Kluay\" and follows the further adventures of the war elephant of King Naresuan the Great. It is set during the war between Ayutthaya and Bago. Its theme is the need to protect family and country. The movie grossed 79 million baht. Are we justified in saying that \"Khan Kluay 2 is the most successful movie ever.\"? Yes, no, or maybe? Maybe\n###\nBroadway Rose is a 1922 American silent romantic drama film released by Metro Pictures and directed by Robert Z. Leonard. It stars Leonard's then-wife Mae Murray and Monte Blue. The film is based on an original story by Edmund Goulding written for star Murray, and was produced by Leonard's and Murray's production company Tiffany Pictures. Are we justified in saying that \"Robert Z. Leonard is not an American actor.\"? 
Yes, no, or maybe?", "doc_id": 394, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9694, 35561, 9715, 45010], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fake? is a Japanese alternative rock band formed in 2001 by Ken Lloyd and Inoran. Their music has been described as alternative mixed with electronic sounds. Their sound has also been called \"Mixture Rock\" as well as an \"alternative punk rock mix.\" Lyrics are mainly in English and sometimes in Japanese. Are we justified in saying that \"Fake? was created by Ken Inoran\"? Yes, no, or maybe? No\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks. Are we justified in saying that \"Cocaine sells millions annually\"? Yes, no, or maybe? Maybe\n###\nJames Brandon (born 20 September 1980) is a British journalist, who was kidnapped in Iraq during 2004 while on assignment from the \"Sunday Telegraph\" and \"The Scotsman\", covering the occupation and insurgency. He was kidnapped by the Mahdi Army militia and was released after one day. Are we justified in saying that \"James Brandon is a famous British Journalist.\"? Yes, no, or maybe? Maybe\n###\nJonathan Erlich and Andy Ram were the defending champions, but Erlich chose not to participate due to an elbow injury, and only Ram competed that year.Ram partnered with Max Mirnyi, but lost to Feliciano L\u00f3pez and Fernando Verdasco in the second round. Are we justified in saying that \"Ram and Erlich were soundly defeated by Mirnyi, Feliciano, and Verdasco in the second round.\"? Yes, no, or maybe? No\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"Johan Martin Schr\u00f6der was nearly 60 when he founded Martinair\"? Yes, no, or maybe?", "doc_id": 246, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36923, 28214, 9059, 28418], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. 
The University is a member-school of the Thurgood Marshall College Fund. Are we justified in saying that \"University of Maryland Eastern Shore is a great university \"? Yes, no, or maybe? Maybe\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia. Are we justified in saying that \"The Big 12 Conference is the oldest of its kind in the country.\"? Yes, no, or maybe? Maybe\n###\n\"The Daily Show\" is an American late-night satirical television program that airs Monday through Thursday on Comedy Central in the United States. It originally premiered on July 21, 1996, and is currently the longest-running series original program on Comedy Central. Are we justified in saying that \"The Daily Show is 30 minutes long.\"? Yes, no, or maybe? Maybe\n###\nGamalost (also Gammelost, Gammalost), which translates as \"old cheese\", is a pungent traditional Norwegian cheese, which was once a staple of the Norwegian diet. Like many traditional Norwegian foods, such as flat bread, dry salted meats and stockfish, Gamalost could be stored for long periods without refrigeration. Are we justified in saying that \"Gamalost was exported from Norway\"? Yes, no, or maybe? Maybe\n###\nAndrea Albert Pierre Casiraghi (born 8 June 1984) is the elder son of Caroline, Princess of Hanover, and her second husband Stefano Casiraghi. He is the eldest grandchild of Rainier III, Prince of Monaco, and American actress Grace Kelly. Casiraghi is currently fourth in the line of succession to the Monegasque throne, following his twin cousins and mother. Are we justified in saying that \"Casiraghi is the last in the line of succession\"? Yes, no, or maybe?", "doc_id": 735, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1605, 26210, 42423, 35273], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Luton Town Ladies Football Club was founded in 1997 and formed a partnership with its male counterpart, Luton Town F.C. in 2000. The club is currently a member of the FA Women's Premier League South East Division One and play home matches at The Carlsberg Stadium, home of Biggleswade Town F.C. Are we justified in saying that \"It formed a partnership in 1998\"? Yes, no, or maybe? No\n###\nThe Bermuda Broadcasting Company is the largest broadcasting company in Bermuda. Sometimes abbreviated locally as \"BBC\", it is not related to the BBC, a public broadcaster in the United Kingdom. A commercial, for-profit broadcasting company since its beginning in the 1950s, the chairman is Fernance B. Perry. Are we justified in saying that \"The Bermuda Broadcasting Company began in the 1950s.\"? Yes, no, or maybe? 
Yes\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event. Are we justified in saying that \"Martina Hingis was the top seed when she won her first singles title.\"? Yes, no, or maybe? No\n###\nMatthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season. Are we justified in saying that \"Mansfield was a good player but he was kind of slow.\"? Yes, no, or maybe? Maybe\n###\nSyracuse IMG Sports Network is the radio and television name for Syracuse University sports. The radio affiliates broadcast football, as well as men's and women's basketball and men's lacrosse games. Time Warner Cable Sports broadcasts the coaches' show and a weekly program titled \"Syracuse Sidelines\". Are we justified in saying that \"Syracuse University men's lacrosse games are broadcasted on Time Warner Cable Sports.\"? Yes, no, or maybe?", "doc_id": 356, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6776, 19691, 26633, 34221], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"Home Depot has no significant presence in the state of Georgia\"? Yes, no, or maybe? No\n###\nThe Latin American Boom was a flourishing of literature, poetry and criticism in Latin America during the 1960s and 1970s, when writers from this region explored new ideas and came to international renown in a way that had not happened previously. Major figures of the boom include Julio Cort\u00e1zar, Gabriel Garc\u00eda M\u00e1rquez, Carlos Fuentes, Jorge Luis Borges, and Mario Vargas Llosa. Are we justified in saying that \"The Latin American Boom was about African Americans\"? Yes, no, or maybe? No\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"Johan Martin Schr\u00f6der was born before 1970\"? Yes, no, or maybe? Yes\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. 
The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"They regrouped sometime in the 70's\"? Yes, no, or maybe? Maybe\n###\n\"Oh My\" is a song by American hip hop artist DJ Drama, released on May 13, 2011, as the lead single from his third studio album \"Third Power\". The song was produced by frequent collaborator Drumma Boy and features rappers Fabolous, Roscoe Dash and Wiz Khalifa. The song peaked at #18 on the \"Billboard\" and #12 on the Top R&B/Hip-Hop Songs, making it the most successful song for DJ Drama to date. Are we justified in saying that \"Most people have not heard of DJ Drama.\"? Yes, no, or maybe?", "doc_id": 371, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3049, 8555, 35260, 44289], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Blackwater Lightship is a 2004 Hallmark Hall of Fame TV movie adaptation of the novel \"The Blackwater Lightship\" by Colm T\u00f3ib\u00edn. It aired on CBS on February 4, 2004. The movie stars Angela Lansbury, Gina McKee, Sam Robards, Dianne Wiest, and Keith McErlean. Lansbury received an Emmy nomination for it in 2004. Are we justified in saying that \"The Blackwater Lightship only aired on CBS.\"? Yes, no, or maybe? Maybe\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Stillwater Cove Regional Park is accessed by a state route.\"? Yes, no, or maybe? Yes\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"Lourdes Ver\u00f3nica Ar\u00e9valos Elias will start her singing career in 2021\"? Yes, no, or maybe? Maybe\n###\nPeter Himmelman (born November 23, 1959 in St. Louis Park, Minnesota) is an American singer-songwriter and film and television composer from Minnesota, who formerly played in the Minneapolis indie rock band Sussman Lawrence before pursuing an extensive solo career. Himmelman is also the founder of Big Muse, a company which helps individuals and organizations unlock their creative potential. Are we justified in saying that \"Peter Himmelman has never written music for the stage.\"? Yes, no, or maybe? 
Maybe\n###\nThe Monument to Vasil Levski (Bulgarian: \u041f\u0430\u043c\u0435\u0442\u043d\u0438\u043a \u043d\u0430 \u0412\u0430\u0441\u0438\u043b \u041b\u0435\u0432\u0441\u043a\u0438 , \"Pametnik na Vasil Levski\") in the centre of Sofia, the capital of Bulgaria, is one of the first monuments to be built in the then newly liberated Principality of Bulgaria. It commemorates the hanging of Bulgarian national hero and major revolutionary figure Vasil Levski on the same spot on 18 February 1873. Are we justified in saying that \"The Monument to Vasil Levski may or may not have been the first monument in the Principality of Bulgaria.\"? Yes, no, or maybe?", "doc_id": 579, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43264, 25192, 37531, 20818], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cape Vakop ( ) is a headland between Hound Bay and Luisa Bay on the north coast of South Georgia. It was charted by the Second German Antarctic Expedition, 1911\u201312, under Wilhelm Filchner. The name appears on a chart based upon surveys of South Georgia in 1926\u201330 by DI personnel, but may represent an earlier naming. Are we justified in saying that \"Cape Vakop was likely uninhabited during the early 20th century\"? Yes, no, or maybe? Yes\n###\nStuart is a city in, and the seat of, Martin County, Florida, United States. Located on Florida's Treasure Coast, Stuart is the largest of four incorporated municipalities in Martin County. The population was 15,593 in the 2010 census. It is part of the Port St. Lucie, Florida Metropolitan Statistical Area. Are we justified in saying that \"Stuart is extremely far from the sea\"? Yes, no, or maybe? No\n###\nSadat is a 1983 American two-part, four-hour television miniseries based on the life and death of the late 3rd President of Egypt, Anwar Sadat starring Louis Gossett Jr. as Sadat and Madolyn Smith as Sadat's wife, Jehan. It was distributed by Columbia Pictures Television through Operation Prime Time. Gossett's performance earned him a nomination for an Emmy Award and a Golden Globe Award. Are we justified in saying that \"Sadat's wife, Jehan was a bad woman\"? Yes, no, or maybe? Maybe\n###\nMichelle Do (born 1983) is a Vietnamese-American table tennis player from Milpitas, California. At age 17, Do became the youngest ever member of the U.S. Women\u2019s Table Tennis Team, for the 2000 Summer Olympics. She attended Scripps Ranch High School in San Diego, California. Are we justified in saying that \"Do did not attend college.\"? Yes, no, or maybe? Maybe\n###\nIdentification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\". Are we justified in saying that \"Little Hamlet is a 1964 Polish drama film directed by Jerzy Skolimowski. \"? 
Yes, no, or maybe?", "doc_id": 754, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15708, 35752, 18375, 44393], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Deanne Olivia Bell is an American television personality currently hosting CNBC's reality docu-series \"Make Me a Millionaire Inventor.\" She has previously worked on PBS's \"Design Squad\", Discovery Channel's \"Smash Lab\", and National Geographic's \"The Egyptian Job\". She has also co-hosted DIY Network's \"Money Hunters\" and ESPN's \"Rise Up.\" Are we justified in saying that \"Deanne Olivia Bell hosted \"Bash Lab\"\"? Yes, no, or maybe? No\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation. Are we justified in saying that \"The 89th Medium Tank Battalion was the very slowest armored tank unit of the United States Army\"? Yes, no, or maybe? Maybe\n###\nPaul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas. Are we justified in saying that \"Paul Albert Raymond Barlatier de Mas was the grandson of a famous captain\"? Yes, no, or maybe? Yes\n###\nEnd of the Past is a book by Pakistani journalist, cultural critic and satirist Nadeem F. Paracha. Published by Vanguard Publications in 2016, it is a social history of Pakistan in which Paracha charts the religious and cultural evolution of Pakistan through the country's cultural, sporting and ideological histories. Are we justified in saying that \"Nadeem F. Paracha only writes non-fiction.\"? Yes, no, or maybe? Maybe\n###\nJustin Alaric Holiday (born April 5, 1989) is an American professional basketball player for the Chicago Bulls of the National Basketball Association (NBA). He played college basketball for the University of Washington. He won an NBA championship in 2015 as a member of the Golden State Warriors. Are we justified in saying that \"Justin Alaric Holiday is a 30 year old American professional basketball player for the Chicago Bulls.\"? Yes, no, or maybe?", "doc_id": 86, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39007, 27332, 14704, 978], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cinnaholic is a vegan bakery franchise that started in 2010 and currently operates in eight states. The company's owners appeared on the television show Shark Tank in 2014, which ended with them ultimately turning down a $200,000 investment offer from Robert Herjavec. The company has adopted a franchise business model and has plans to open 100 locations by 2020. Are we justified in saying that \"Cinnaholic is a vegan bakery.\"? Yes, no, or maybe? Yes\n###\nO'Donnell High School is a 1A high school located in O'Donnell, Texas (USA). It is part of the O'Donnell Independent School District located in southeast Lynn County. In 2011, the school was rated \"Academically Acceptable\" by the Texas Education Agency. Are we justified in saying that \"Texas is located in the USA.\"? Yes, no, or maybe? Yes\n###\nJon Moulton (born 15 October 1950) is a British venture capitalist. He is the founder and managing partner of the private equity firm Better Capital, and is the former managing partner of the private equity firm Alchemy Partners. Moulton is best known for leading Alchemy Partners' bid to buy MG Rover from BMW in 2000, which ultimately lost out to a rival offer from the Phoenix Consortium. Are we justified in saying that \"Moulton is no longer affiliated with Alchemy Partners.\"? Yes, no, or maybe? Yes\n###\nTrojan War is a 1997 American romantic comedy film directed by George Huang. It stars Will Friedle, Jennifer Love Hewitt, and Marley Shelton. The film was a critical and box office disaster. Produced for $15 million, it made only $309 in ticket sales because it was played in a single movie theater and was pulled after only a week. Are we justified in saying that \"Trojan War is the lowest grossing film of all time.\"? Yes, no, or maybe? Maybe\n###\nThe Blackpool Gazette is an English evening newspaper based in Blackpool, Lancashire. Published every day except Sunday, it covers the towns and communities of the Fylde coast. It was founded as \"The West Lancashire Evening Gazette\" in 1929 before being renamed the \"Evening Gazette\", and then \"Blackpool Gazette\". The paper's history dates back to a weekly publication founded in 1873. Are we justified in saying that \"The Blackpool Gazette is published 7 days a week.\"? Yes, no, or maybe?", "doc_id": 965, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33971, 37511, 12187, 7940], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cranborne Priory was a priory in Cranborne in Dorset, England. The priory church survives as Cranborne's parish church, the Church of St\u00a0Mary and St\u00a0Bartholomew, and is a Grade I listed building, with parts of the building dating back to the 12th century. Are we justified in saying that \"Cranborne Priory church is a grade 1 building\"? 
Yes, no, or maybe? Yes\n###\nSulakshana is an Indian actress born on August 1 ,1965 who has performed in Tamil, Telugu, Kannada and Malayalam films at the age of two and half in the movie Kaaviya Thalaivi as child Krishna in the name of Dolly . After that she acted in Thulabharam as child artist in Tamil,Telugu,Malayalam and Hindi (all version) in the name of Rajani . Are we justified in saying that \"Sulakshana is an Indian actress who is very fat\"? Yes, no, or maybe? Maybe\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968. Are we justified in saying that \"The 1974 New York Mets was led by a woman.\"? Yes, no, or maybe? No\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Destiny was a foreign language film.\"? Yes, no, or maybe? Yes\n###\nThe China Stars was a baseball team established in 2005. It was made up of the best players in the China Baseball League. The team was established in purpose of playing with the winners from the professional baseball league in Japan, Taiwan, and Korea in the annual Konami Cup Asia Series. The China Stars lost all the 9 games in their three participartions. Are we justified in saying that \"The China Stars participated in the series three years in a row.\"? Yes, no, or maybe?", "doc_id": 959, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24572, 38348, 4963, 18461], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 3rd Macau International Movie Festival ceremony, organized by the Macau Film and Television Media Association and China International Cultural Communication Center, honored the best films of 2011 in the Greater China Region and took place on December 7, 2011, at the Venetian Macao, in Macau. Are we justified in saying that \"The 2nd Macau International Movie Festival took place on December 6, 2010\"? Yes, no, or maybe? Maybe\n###\nClub Deportivo Cajamadrid was a professional basketball and handball team in Spain. It was founded in 1979 and the basketball team played in Liga ACB from 1983 to 1986. The club was sponsored by Caja Madrid until 1991, when the bank decided to retire its support and continued as a different club called Juventud Alcal\u00e1. Are we justified in saying that \"Club Deportivo Cajamadrid was sponsored by Caja Madrid until 1991.\"? Yes, no, or maybe? Yes\n###\nNew Hampshire Route 202A (abbreviated NH\u00a0202A) is a 14.639 mi east\u2013west state highway in Strafford and Rockingham counties in southeastern New Hampshire. 
The western terminus is in Northwood at U.S. Route\u00a0202 and New Hampshire\u00a09, near their intersection with U.S. Route\u00a04. Its eastern terminus is in downtown Rochester at New Hampshire Route\u00a0108 and New Hampshire Route\u00a0125. Are we justified in saying that \"NH 202A is the widest route in the US\"? Yes, no, or maybe? Maybe\n###\nThe 2015 Auburn Tigers softball team is an American softball team, representing the Auburn University for the 2015 NCAA softball season. In 2014, the Auburn Tigers softball team went 42-19-1 during Clint Myers first season. The Auburn Tigers play their home games at Jane B. Moore Field. Are we justified in saying that \"The Auburn Tigers play games at Jane B. Moore Field.\n\"? Yes, no, or maybe? Yes\n###\nLaura Elena Z\u00fa\u00f1iga Huizar (born January 3, 1985) is a Mexican model and beauty queen, center of a drug trafficking scandal in December 2008. The critically acclaimed 2011 film \"Miss Bala\" (\"Miss Bullet\") is loosely based on Z\u00fa\u00f1iga and her involvement in the events of December 2008. Are we justified in saying that \"Miss Bala is only about a drug trafficking scandal.\"? Yes, no, or maybe?", "doc_id": 584, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5491, 8513, 24586, 33236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mikhail Nikolayevich Baryshnikov (Russian: \u041c\u0438\u0445\u0430\u0438\u0301\u043b \u041d\u0438\u043a\u043e\u043b\u0430\u0301\u0435\u0432\u0438\u0447 \u0411\u0430\u0440\u044b\u0301\u0448\u043d\u0438\u043a\u043e\u0432 , Latvian: \"Mihails Bari\u0161\u0146ikovs\" ; born January 27, 1948), nicknamed \"Misha\" (Russian diminutive of the name \"Mikhail\"), is a Soviet and American dancer, choreographer, and actor. Are we justified in saying that \"Mikhail Baryshnikov was called Misha\"? Yes, no, or maybe? Yes\n###\nThe Leader of the Opposition of Singapore is usually the leader of the second largest political party represented in the Parliament of Singapore. During the 1955 Legislative Assembly election, the late Lee Kuan Yew was the \"de facto\" Leader of the Opposition, as the People's Action Party was then the second largest political party represented in the Legislative Assembly. Are we justified in saying that \"The Parliament of Singapore was formed in 1900\"? Yes, no, or maybe? Maybe\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph Ernst Friedrich von Forcade de Biaix was always a politician.\"? Yes, no, or maybe? Maybe\n###\nIntervilles was a French comedy game show first broadcast in 1962. The show was aired since July 17, 1962 on RTF, then on ORTF. After stopping for 20 years, it reappeared on July 10, 1985 on FR3, then from July 4, 1986 to September 6, 1999 on TF1. 
France 3 aired the show since July 5, 2004, then France 3 from June 23, 2006 to August 26, 2009. Are we justified in saying that \"The game returned in 1985.\"? Yes, no, or maybe? Yes\n###\nThe Featured Artists Coalition (FAC) is a nonprofit organisation set up to protect the rights of featured musical artists, particularly in the new digital age. It encourages a greater connection between fans and artists and aims to promote transparency in the music industry specifically to the benefit of the artists themselves. Are we justified in saying that \"The Featured Artists Coalition wants to benefit fans. \"? Yes, no, or maybe?", "doc_id": 242, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26517, 11448, 9453, 41295], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Babar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\". Are we justified in saying that \"The film was the debut film for the character of Babar.\"? Yes, no, or maybe? No\n###\nThe 39th People's Choice Awards ceremony, honoring the best in popular culture for 2012, was held January 9, 2013 at the Nokia Theatre in Los Angeles, and was broadcast live on CBS and simulcast over Xbox Live at 9:00 pm EST. The ceremony was hosted by Kaley Cuoco. The nominees were announced in November, 2012. Are we justified in saying that \"The People's Choice Awards always happen in January\"? Yes, no, or maybe? Maybe\n###\nThe Girdler sulfide (GS) process, also known as the GeibSpevack (GS) process, is an industrial production method for filtering out of natural water the heavy water (deuterium oxide = DO) which is used in particle research, in Deuterium NMR spectroscopy, deuterated solvents for proton NMR spectroscopy, in heavy water nuclear reactors (as a coolant and moderator) and in deuterated drugs. Are we justified in saying that \"The value of natural water will decrease because heavy water has more scientific uses.\"? Yes, no, or maybe? Maybe\n###\nGeneo Grissom (born June 4, 1992) is an American football defensive end for the New England Patriots. He played college football at Oklahoma. He was drafted by the New England Patriots in the third round with the 97th overall pick of the 2015 NFL Draft. Are we justified in saying that \"Geneo Grissom was born in England.\"? Yes, no, or maybe? No\n###\nTripoli Municipal Stadium is a 22,000 capacity multi-use stadium in Tripoli, Lebanon. It is located near the city center. It was recently rehabilitated to welcome Arabic competitions as well as Asian and International ones. It is also the home ground of Tripoli SC. Are we justified in saying that \"Tripoli Municipal stadium gained capacity when it was recently rehabilitated.\"? 
Yes, no, or maybe?", "doc_id": 928, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6403, 29952, 31908, 6445], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Oceanus ( ; Greek: \u1f68\u03ba\u03b5\u03b1\u03bd\u03cc\u03c2 \"\u014ckean\u00f3s\", ] ), also known as Ogenus (\"Ogenos\", \u03a9\u03b3\u03b7\u03bd\u03bf\u03c2) or Ogen (\u03a9\u03b3\u03b7\u03bd), was a divine figure in classical antiquity, believed by the ancient Greeks and Romans to be the divine personification of the sea, an enormous river encircling the world. Are we justified in saying that \"Greeks and Romans had a lot of conflict to its origin \"? Yes, no, or maybe? Maybe\n###\nIdichapuli Selvaraj (c. 1939 \u2013 30 January 2012) was a veteran Tamil comedy actor. He acted in more than hundreds of films. He acted along with lot of actors. His brother Pandu is also a comedy actor. He also worked as an assistant director for the M. G. Ramachandran films like \"Idhayakkani\" and \"Ulagam Sutrum Valiban\". Are we justified in saying that \"Selvaraj worked with a lot of actors.\"? Yes, no, or maybe? Yes\n###\nThe Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949. Are we justified in saying that \"CUHK is located in Shatin\"? Yes, no, or maybe? Yes\n###\nIn economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg. Are we justified in saying that \"Australia was only one of several countries with the currency peg\"? Yes, no, or maybe? Maybe\n###\nWest Town Mall is an upscale shopping mall located in Knoxville, Tennessee, United States. Opened in August 1972, this one-level mall is located in the western portion of Knoxville in the West Hills community. West Town Mall is located along Interstates 40/75 and Kingston Pike. The mall has over 1300000 sqft of Gross leasable area, making it the largest of any enclosed shopping mall in Tennessee. Are we justified in saying that \"\"Upscale\" does not contain the word \"up\".\"? Yes, no, or maybe?", "doc_id": 88, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39932, 36173, 15490, 17767], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cleethorpes Town Football Club is a football club based in Grimsby in North East Lincolnshire, England. The club are currently members of the Northern Premier League Division One South and play at Grimsby Borough's Bradley Football Development Centre. Are we justified in saying that \"cleethorpes is an american football club\"? Yes, no, or maybe? No\n###\nThe Master of Revenge () is a 2016 South Korean television series starring Chun Jung-myung, Jo Jae-hyun, Jeong Yoo-mi, Lee Sang-yeob and Gong Seung-yeon. It aired on KBS2 from April 27, 2016 to June 30, 2016 on Wednesdays and Thursdays at 21:55 for 20 episodes. Are we justified in saying that \"The Master of Revenge stars Jo Jae-hyun\"? Yes, no, or maybe? Yes\n###\nVersailles is a television series, set during the construction of the Palace of Versailles during the reign of Louis XIV, that premiered on 16 November 2015 on Canal+ in France and on Super Channel in Canada, in May 2016 on BBC2 in Britain, and on 1 October 2016 on Ovation in the U.S. Are we justified in saying that \"Versailles premiered in the U.S. on October 1 2016 in the evening.\"? Yes, no, or maybe? Maybe\n###\nAsana ( ) is a web and mobile application designed to help teams track their work. It was founded in 2008 by Facebook co-founder Dustin Moskovitz and ex-engineer Justin Rosenstein, who both worked on improving the productivity of employees at Facebook. Are we justified in saying that \"Asana was founded by Mark Zukerberg\"? Yes, no, or maybe? No\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Kirkas Palestina was not accepted as a nominee into the 71st Academy Awards because it won 5 Israeli Film Academy Awards in 1998.\"? Yes, no, or maybe?", "doc_id": 823, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41281, 1416, 11342, 8319], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6. Are we justified in saying that \"Nydala Abbey was a monastery for nuns.\"? Yes, no, or maybe? 
Maybe\n###\nMake It Big () is a 2002 South Korean comedy film. Song Seung-heon, Kim Young-jun and Kwon Sang-woo play three high school students who are startled when a bagful of money and a dead man fall on top of their car. Once they realize just how much money is in the bag, they give up any thought of calling the police. Are we justified in saying that \"Make it Big is a 2002 South Korean romantic film that involves two high school students. \"? Yes, no, or maybe? No\n###\nGrotto Geyser is a fountain-type geyser located in the Upper Geyser Basin in Yellowstone National Park in the United States. Grotto Geyser is the namesake for the group of geysers that includes Grotto Fountain Geyser, South Grotto Fountain Geyser, Indicator Spring, Spa Geyser, and Rocket Geyser. Are we justified in saying that \"Grotto Geyser is a fountain type volcano located at a national park\"? Yes, no, or maybe? No\n###\nAvani Modi is an Indian model and film actress, a well-known face in Indian movies and theatre plays in Gujarati theatre She made her Bollywood debut in Madhur Bhandarkar's drama film \"Calendar Girls\", which is scheduled to release on 25 September 2015. The movie is based upon the story of five girls and their journey as an annual calendar girl. Are we justified in saying that \"Madhur Bhandarkar's drama film \"Calendar Girls\" was successful because it starred Avani Modi.\"? Yes, no, or maybe? Maybe\n###\nThe \"Charleston\"-class amphibious cargo ships were a class of amphibious cargo ships in service with the United States Navy. These ships served in Amphibious Readiness Groups between 1968 and 1994. The ships were the last amphibious cargo ships built for the U.S. Navy, their role having been taken over by amphibious transport docks. Are we justified in saying that \"The \"Charleston\"-class amphibious cargo ships were not able to float\"? Yes, no, or maybe?", "doc_id": 511, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13894, 32727, 30563, 21263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o has a father.\"? Yes, no, or maybe? Yes\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers. Are we justified in saying that \"Claire Huffaker wrote the script for Seven Ways from Sundown\"? Yes, no, or maybe? Yes\n###\nFor Screening Purposes Only is the debut album by UK dance-punk trio Test Icicles. 
After being released in 2005, the album was critically praised for being unique and compelling in an increasingly homogenous indie music scene. Following the group's split in February 2006, the album remains Test Icicles' only LP. Are we justified in saying that \"The album was praised as unique\"? Yes, no, or maybe? Yes\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"St Clement's street is 8 miles long\"? Yes, no, or maybe? Maybe\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season. Are we justified in saying that \"Utah Jazz of the National Basketball Association (NBA) has been around the world. \"? Yes, no, or maybe?", "doc_id": 829, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13601, 13005, 1584, 25981], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Atiha Sen Gupta (born 1988) is a British playwright and screenwriter. She is writer-in-residence for 2016-2017 at Theatre Royal Stratford East in London, where her play \"Counting Stars\" was produced in 2016. In the same year she won the International Achievement Recognition Awards (IARA) Award for Best Playwright. Are we justified in saying that \"Atiha Sen Gupta is from outside Europe.\"? Yes, no, or maybe? No\n###\nMIT Technology Review is a magazine published by the Massachusetts Institute of Technology. It was founded in 1899 as The Technology Review, and was re-launched without \"The\" in its name on April 23, 1998 under then publisher R. Bruce Journey. In September 2005, it underwent another transition under its then editor-in-chief and publisher, Jason Pontin, to a form resembling the historical magazine. Are we justified in saying that \"MIT Technology Review is ready mostly by teenagers\"? Yes, no, or maybe? Maybe\n###\nThere Was a Crooked Man... is a 1970 western starring Kirk Douglas and Henry Fonda and directed by Joseph L. Mankiewicz. This was the only western made by Mankiewicz, director of such notable films as \"All About Eve\", \"Guys and Dolls\" and \"Cleopatra\". It was written by David Newman and Robert Benton, their first script after \"Bonnie and Clyde\". Are we justified in saying that \"Mankiewicz made over 20 films.\"? Yes, no, or maybe? Maybe\n###\nPeter Franco (November 27, 1973) is an audio engineer and music producer. Franco was first recognized for his work in 2007 on Daft Punk's \"Alive 2007\" album. 
He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2013, winning the 2014 Grammy Award for Best Engineered Album, Non-Classical. He has been one of four engineers for Daft Punk studio recordings since 2008. Are we justified in saying that \"He recorded, and assisted in mixing \"Random Access Memories\" by Daft Punk in 2017.\"? Yes, no, or maybe? No\n###\nRobert Louis (Robert) Paquette (born 1951) is an American historian, Publius Virgilius Rogers Professor of American History at Hamilton College, and co-founder of the Alexander Hamilton Institute for the Study of Western Civilization. He is particularly known for his work on the history of slavery in Cuba. Are we justified in saying that \"Robert Paquette was a slave in Cuba.\"? Yes, no, or maybe?", "doc_id": 292, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23211, 30958, 2904, 8339], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Domingo Per\u00f3n (] ; 8 October 1895 \u2013 1 July 1974) was an Argentine lieutenant general and politician. After serving in several government positions, including Minister of Labour and Vice President, he was thrice elected President of Argentina, serving from June 1946 to September 1955, when he was overthrown in a \"coup d'\u00e9tat\", and then from October 1973 until his death in July 1974. Are we justified in saying that \"Juan Domingo Per\u00f3n was always popular.\"? Yes, no, or maybe? Maybe\n###\nYough Senior High School is a high school located in the southeastern region of Westmoreland County, Pennsylvania, USA (Parents of Students/Staff/Educators). The school is operated by the Yough School District. Students attend from the townships of Sewickley Township, Westmoreland County, Pennsylvania and South Huntingdon. Yough High School has graduating class sizes from 180-200. Are we justified in saying that \"Yough Senior High School is operated by its eponymous district.\"? Yes, no, or maybe? Yes\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg. Are we justified in saying that \"Snoop Dogg does not like Stan Lane.\"? Yes, no, or maybe? Maybe\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound. Are we justified in saying that \"There are many other breed dogs from Germany other than the Bavarian Mountain Hound.\"? Yes, no, or maybe? Maybe\n###\nAloe ( or ), also written \"Alo\u00eb\", is a genus containing over 500 species of flowering succulent plants. The most widely known species is \"Aloe vera\", or \"true aloe\", so called because it is cultivated as the standard source of so-called \"aloe vera\" for assorted pharmaceutical purposes. 
Other species, such as \"Aloe ferox\", also are cultivated or harvested from the wild for similar applications. Are we justified in saying that \"Aloe has between 400 and 490 total species.\"? Yes, no, or maybe?", "doc_id": 290, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8489, 28736, 37448, 10101], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Port Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521. Are we justified in saying that \"Port Melbourne is split between two areas.\"? Yes, no, or maybe? Yes\n###\nThorley Wash or Thorley Flood Pound is a 17.3 hectare biological Site of Special Scientific Interest in Thorley, south of Bishop's Stortford in Hertfordshire. It was formerly a flood pound for the Stort Navigation, which was decommissioned in 2004 and converted to a more natural state. It was purchased by the Herts and Middlesex Wildlife Trust from the Environment Agency in 2011. Are we justified in saying that \"Thorley Wash is more than 7.3 hectares.\"? Yes, no, or maybe? Yes\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single. Are we justified in saying that \"\"Come Back in One Piece\" is the sixth official single \"? Yes, no, or maybe? No\n###\nSaid bin Salim Al Shaksy (Arabic: \u0633\u0639\u064a\u062f \u0628\u0646 \u0633\u0627\u0644\u0645 \u0627\u0644\u0634\u0642\u0635\u064a) (born Zanzibar in 1934 - 2015) was the founder and chairman of The Shaksy Group. Al Shaksy has been a member and Managing Director of several Joint-Stock Companies, including Al Bank Al Ahli Al Omani SAOG, Oman Fisheries Co. SAOG and Oman Hotels Co. SAOG. Are we justified in saying that \"Al Shaksy has been a Director.\"? Yes, no, or maybe? Yes\n###\nHudepohl Brewing Company is a brewery established in Cincinnati, Ohio in 1885 by founder Ludwig Hudepohl II. Hudepohl was the son of Bavarian immigrants and had worked in the surgical tool business before starting his brewery. Hudepohl combined with Schoenling Brewing Company in 1986. Today, the Hudepohl-Schoenling Brewing Company is a wholly owned subsidiary of Christian Moerlein Brewing Co.. Are we justified in saying that \"Hudepohl Brewing Company was founded by Ludpig\"? 
Yes, no, or maybe?", "doc_id": 570, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3, 4557, 37819, 11490], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Nantwich Town Football Club is a semi-professional football club based in Nantwich, Cheshire, England. The club was founded in 1884 and is nicknamed \"The Dabbers\", a reference to the town's tanning industry. The club is currently a member of the Northern Premier League Premier Division, the seventh tier in the English football league system, with home matches played at the Weaver Stadium. Are we justified in saying that \"Nantwich Town Football Club is a professional football club.\"? Yes, no, or maybe? No\n###\nAn Act for naturalizing Louis Sekeyhaye, George Frederick Handel, and others (13 Geo. I), later given the short title of Handel's Naturalisation Act 1727, was a 1727 Act of the Parliament of Great Britain with the intent of naturalising and granting British citizenship to German-born composer George Frideric Handel and other foreign citizens. Are we justified in saying that \"Handel's Naturalisation Act 1727 was initially rejected by Parliament \"? Yes, no, or maybe? Maybe\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries. Are we justified in saying that \"People really like flowers\"? Yes, no, or maybe? Maybe\n###\nKim Won-sik (Hangul:\u00a0\uae40\uc6d0\uc2dd , born February 15, 1993) better known by his stage name Ravi (Hangul:\u00a0\ub77c\ube44 ), is a South Korean rapper, singer-songwriter, producer, signed under Jellyfish Entertainment. He is a member of the South Korean boy group VIXX and VIXX sub-unit VIXX LR. He debuted as a solo artist on January 9, 2017, with the release of his debut mini album \"R.EAL1ZE\". Are we justified in saying that \"VIXX was more popular in China \"? Yes, no, or maybe? Maybe\n###\nLik\u00ebng\u00eb are pork sausages flavored with salt, pepper and seed of Fennel (far\u00eb mbrai), made in Piana degli Albanesi and Santa Cristina Gela. \"Lik\u00ebng\u00eb\" is the Undefinite Singular, \"Lik\u00ebnga\" is the Definite Singular and is cognate with the Italian Lucanica and the Greek Loukaniko. Are we justified in saying that \"A former vegetarian can eat Lik\u00ebng\u00eb.\"? Yes, no, or maybe?", "doc_id": 285, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28298, 21486, 11969, 3581], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Songbook is an acoustic live album by American musician and Soundgarden vocalist Chris Cornell, released on November 21, 2011. The live album features songs recorded during Cornell's Songbook Tour, an acoustic solo tour which took place during March\u2013May 2011 in the US, and is his first live album as a solo artist. Are we justified in saying that \"Chris Cornell released his live album with his band.\"? Yes, no, or maybe? No\n###\nDerailed is a 2005 American crime thriller film based on the novel of the same name by James Siegel. The film is directed by Mikael H\u00e5fstr\u00f6m and stars Clive Owen, Jennifer Aniston, Vincent Cassel, Giancarlo Esposito, David Morrissey, RZA and Xzibit. This was also the first film to be released by The Weinstein Company in the United States. The film is set in Chicago. Are we justified in saying that \"Derailed sold billions.\"? Yes, no, or maybe? Maybe\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC. Are we justified in saying that \"SNOBOL was used by Putin.\"? Yes, no, or maybe? Maybe\n###\nSplice is a 2009 Canadian-French science fiction horror film directed by Vincenzo Natali and starring Adrien Brody, Sarah Polley, and Delphine Chan\u00e9ac. The story concerns experiments in genetic engineering being done by a young scientific couple, who attempt to introduce human DNA into their work of splicing animal genes. Guillermo del Toro, Don Murphy, and Joel Silver executive produced. Are we justified in saying that \"Dark Castle Entertainment enjoys producing movies that are weird and out of the norm. \"? Yes, no, or maybe? Maybe\n###\nNathan Never is a black-and-white, science fiction Italian comic book, published monthly in Italy since 1991 by Sergio Bonelli Editore. It is written by Michele Medda, Antonio Serra and Bepi Vigna. Artists who worked to series include Claudio Castellini, Roberto De Angelis, Dante Bastianoni, Nicola Mari, Pino Rinaldi, Giancarlo Olivares and Onofrio Catacchio. Are we justified in saying that \"Nathan Never is published 12 times a year.\"? Yes, no, or maybe?", "doc_id": 604, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28790, 36614, 21971, 16140], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Reid Report is an hour-long weekday U.S. and world political commentary program on MSNBC. Hosted by Joy-Ann Reid, it premiered on February 24, 2014, in the time slot formerly occupied by \"NewsNation with Tamron Hall\". 
The show ended on February 27, 2015 due to low ratings. Are we justified in saying that \"The Reid Report was cancelled in 2015 because of low ratings.\"? Yes, no, or maybe? Yes\n###\nLausche (Czech: \"Lu\u017e\" ), is, at 793 m , the highest peak of the Lusatian Mountains and the highest mountain in the German part of the Upper Lusatia region. The conical mountain is part of the Zittau Mountains range, situated on the border of the German state of Saxony with the Bohemian region of the Czech Republic. Are we justified in saying that \"Lausche is located in the French border state of Saxony.\"? Yes, no, or maybe? No\n###\nThe Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It is a concept album with a storyline telling the tale of a young couple who go to watch a puppet show in Budapest in the 1700s, and end up being turned into undead puppets by the Puppet Master and his wife. Are we justified in saying that \"The Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It was hard to understand.\"? Yes, no, or maybe? Maybe\n###\nJulian William Kennedy Burnside AO QC (born 9 June 1949) is an Australian barrister, human rights and refugee advocate, and author. He practises principally in commercial litigation, trade practices and administrative law. He is known for his staunch opposition to the mandatory detention of asylum seekers, and has provided legal counsel in a wide variety of high-profile cases. Are we justified in saying that \"Julian specializes in three principal areas of law.\"? Yes, no, or maybe? Yes\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing. Are we justified in saying that \"The Charter Township of Lansing was produced in a state bordering Canada.\"? Yes, no, or maybe?", "doc_id": 939, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2545, 30947, 27425, 33915], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The End Tour was the farewell tour of the heavy metal band Black Sabbath, featuring founding members Ozzy Osbourne, Tony Iommi and Geezer Butler. The tour concluded Sabbath's 40+ year career. The final show was February 4, 2017, in their home city of Birmingham, UK. Are we justified in saying that \"Ozzy Osbourne, Tony Iommi and Geezer Butler were founding members of Black Sabbath.\"? Yes, no, or maybe? Yes\n###\nWuqiang County () is county of southeastern Hebei province, China. It is under the administration of Hengshui City, with a population of 210,000 residing in an area of 442 km2 . Both China National Highway 307 and G1811 Huanghua\u2013Shijiazhuang Expressway pass through the county. 
Are we justified in saying that \"The county is the second largest in the country\"? Yes, no, or maybe? Maybe\n###\n41 Commando or No. 41 (Royal Marine) Commando was a unit of the Royal Marines trained as Commandos during the Second World War. They were part of the all Royal Marine 4th Special Service Brigade that took part in the Normandy landings in June 1944 and later that served in World War II, the Korean War, and in Northern Ireland. They were disbanded in 1981. Are we justified in saying that \"41 Commando is a special service brigade.\"? Yes, no, or maybe? Yes\n###\n\"Paint It Black\" (originally released as \"Paint It, Black\") is a song by the English rock band The Rolling Stones, written by the songwriting partnership of Mick Jagger and Keith Richards, and first released as a single on 6 May 1966. It was later included as the opening track to the US version of their 1966 album, \"Aftermath\". Are we justified in saying that \"Paint It Black was released by The Rolling Stones in 1966\"? Yes, no, or maybe? Yes\n###\nJeon Do-yeon (born February 11, 1973) is a South Korean actress. She has won many awards in her career, including best actress at the 2007 Cannes Film Festival for her portrayal of a broken woman who has lost everything in Lee Chang-dong's \"Secret Sunshine\". Are we justified in saying that \"Jeon started in the film Secret Sunshine.\"? Yes, no, or maybe?", "doc_id": 315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2527, 25053, 26382, 2723], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kim Hyang-gi (born August 9, 2000) is a South Korean actress. Kim began her career as a child actress, and has starred in films and television series such as \"Wedding Dress\" (2010), \"The Queen's Classroom\" (2013), \"Thread of Lies\" (2014) and \"Snowy Road\" (2017). Are we justified in saying that \"The film Snowy Road was well received\"? Yes, no, or maybe? Maybe\n###\nUnlike a charitable foundation, a private foundation does not generally solicit funds from the public. And a private foundation does not have the legal requirements and reporting responsibilities of a registered, non-profit or charitable foundation. Not all foundations engage in philanthropy: some private foundations are used for estate planning purposes. Are we justified in saying that \"There are private foundations involved in estate planning.\"? Yes, no, or maybe? Yes\n###\nKhatarnaak is a 1990 Hindi-language Indian feature film directed by Bharat Rangachary, starring Sanjay Dutt, Farha Naaz and Anita Raj in lead roles, upon release the film was a box office hit. Film's track \" Aasmaan pe baithi Chandani\" music is uncredited copy of B.J. Thomas's track \"Raindrops keep falling on my head .\" Are we justified in saying that \"Bharat Rangachary forgot to credit B. J. Thomas.\"? Yes, no, or maybe? Maybe\n###\nJason Ian Drucker (born \u20092005 ) is an American child actor. He starred as Greg Heffley in the 2017 film \"\". He also played Tommy Miller, the youngest of the Miller Family, in Nickelodeon's \"Every Witch Way\". 
In 2018, he will co-star in the \"Transformers\" spin-off \"Bumblebee\". Are we justified in saying that \"Jason Ian Drucker acted is a series produced by Nickelodeon.\"? Yes, no, or maybe? Yes\n###\nSugar & Spice is a 2001 American teen crime comedy film directed by Francine McDougall, and starring Marley Shelton, Marla Sokoloff, Mena Suvari, James Marsden, and Melissa George. The plot follows a group of high school cheerleaders who conspire and commit armed robbery when one of them becomes pregnant and desperate for income. Are we justified in saying that \"Melissa George was pregnant in the movie about cheerleaders from 2001.\"? Yes, no, or maybe?", "doc_id": 189, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16721, 17338, 23017, 30058], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "On 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The man was short.\"? Yes, no, or maybe? Maybe\n###\nSemonkong is a community council located in the Maseru District of Lesotho. Semonkong, meaning \"Place of Smoke\", was established in the 1880s as a refuge for Basotho displaced by the Gun War. It is located close to several major natural features, including the Maletsunyane Falls and the 3096-metre peak of Thaba Putsoa. The population in 2006 was 7,781. Are we justified in saying that \"Semonkong can be translated to Place of Smoke\"? Yes, no, or maybe? Yes\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"The school doesn't have many students\"? Yes, no, or maybe? Maybe\n###\nGiovanni Visconti \u2014 according to Lorenzo Cardella nephew of Pope Gregory X. He was ostensibly created cardinal-bishop of Sabina by his uncle in 1275 and in 1276 was named judge in the case concerning the translation of bishop Giovanni of Potenza to the archbishopric of Monreale, postulated by the cathedral chapter of Monreale. He died in 1277 or 1278. Are we justified in saying that \"Giovanni Visconti left the church\"? Yes, no, or maybe? Maybe\n###\nInnyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169. Are we justified in saying that \"The administrative center of the rural okrug has a population below 200.\"? 
Yes, no, or maybe?", "doc_id": 585, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26485, 5196, 25464, 26835], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ahmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"Ahmad Kemal Idris was born on February 10, 1923.\"? Yes, no, or maybe? Yes\n###\nLance King (born November 23, 1962) is an American heavy metal vocalist specializing in melodic rock progressive and power metal. Lance has sung with many groups over the last 35 years and started the record label Nightmare in 1990 to release his own music and is presently still at the helm of the label. Are we justified in saying that \"Lance King was also known for his art displayed in various museums.\"? Yes, no, or maybe? Maybe\n###\nThe William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places. Are we justified in saying that \"The William Martin Armistead House was added to the U.S. National Register of Historic Places over 5 years ago\"? Yes, no, or maybe? Yes\n###\nJefferson County is a county located in the U.S. state of Kentucky. As of the 2010 census, the population was 741,096. It is the most populous county in Kentucky and is more than twice as large as the second most populous, Fayette. The county was formed in 1780 and named for future President Thomas Jefferson, who was Governor of Virginia at the time. Are we justified in saying that \"Fayette County was not named after Thomas Jefferson.\"? Yes, no, or maybe? Yes\n###\nMiriam Auhea Kalani Kui Kawakiu o Kek\u0101uluohi Keali\u02bbiuhiwaihanau o Kalani Makahonua Ahilapalapa Kai Wikapu o Kaleilei a Kalakua also known as Ka\u02bb ahumanu III (July 27, 1794 \u2013 June 7, 1845), was Kuhina Nui of the Kingdom of Hawaii, a queen consort of both King Kamehameha I and Kamehameha II, and mother of another king. Are we justified in saying that \"Ka\u02bb ahumanu III was born on the seventh day of the month.\"? Yes, no, or maybe?", "doc_id": 513, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5034, 11107, 44669, 27444], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Morley College Choir was founded by Gustav Holst, during the period he was teaching music at Morley College. The choir was led for many years by Michael Tippett, who conducted the ensemble for the first-ever recording of Thomas Tallis' Spem in Alium, and premiered a number of Tippett's works, including A Child of Our Time in March 1944. Are we justified in saying that \"Morley College Choir has won many awards\"? Yes, no, or maybe? Maybe\n###\nThe Merkur XR4Ti is a high-performance 3-door hatchback sold in North America from 1985 to 1989. A product of the Ford Motor Company, the car was based on a version of the European Ford Sierra XR4i adapted to US regulations. The XR4Ti and the Merkur brand name were both projects sponsored by Ford vice president Bob Lutz. Are we justified in saying that \"The XR4Ti and the Merkur brand name were not sponsored and it was based on a version of the European Ford Sierra XR4i adapted to US regulations.\"? Yes, no, or maybe? No\n###\nOverBlood is a science fiction video game developed by Riverhillsoft and published by Electronic Arts for the PlayStation. It is considered one of the first survival horror games to make use of a fully three-dimensional virtual environment, second only to Riverhillsoft's own \"Doctor Hauzer\", a fully 3D survival horror game released for the 3DO in 1994. Are we justified in saying that \"OverBlood is a science fiction video game developed by Riverhillsoft and published by Electronic Arts for the PlayStation and stars the character Doctor Hauzer.\"? Yes, no, or maybe? No\n###\nIppadikku Rose (Tamil: \u0b87\u0baa\u0bcd\u0baa\u0b9f\u0bbf\u0b95\u0bcd\u0b95\u0bc1 \u0bb0\u0bcb\u0bb8\u0bcd ; English: Yours truly, Rose ) is a Tamil talk show aired on Vijay TV. The show hosted by Rose. The talk show deals with current affairs touching a wide variety of social issues including traditions, taboos, rebels and culture. This is the first TV show in India hosted by a transgender person. The show is telecast at every Thursday at 11:PM IST. Are we justified in saying that \"Ippadikku Rose is named after the flower.\"? Yes, no, or maybe? No\n###\nHonest Ed's was a landmark discount store located in Toronto, Ontario, Canada. It was named for its proprietor, Ed Mirvish, who opened the store in 1948 and oversaw its operations for almost 60 years, until his death in 2007. The store continued in operation until it was permanently closed on December 31, 2016. Are we justified in saying that \"Honest Ed's closed on New Year's Eve 2016.\"? Yes, no, or maybe?", "doc_id": 663, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3374, 39593, 29773, 23085], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Daniel James Shellabarger (known as Daniel Suelo, or simply Suelo, and The Man Who Quit Money, born 1961) is an American simple living adherent who stopped using money in the autumn of 2000. He was born in Arvada, Colorado, a suburb of Denver, and currently lives part-time in a cave near Moab, Utah when he is not wandering the country. Are we justified in saying that \"The context explicitly states that Daniel James Shellabarger is known to roam the country, it follows that he has visited more than one state. Also, he was born in Colorado and now lives in Utah\"? Yes, no, or maybe? Yes\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"Aragone attended the University of Virginia but never graduated.\"? Yes, no, or maybe? Maybe\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The 1960 Gator Bowl had a winner.\"? Yes, no, or maybe? Yes\n###\nJustin Smith (born 9 August 1977 in Narromine, New South Wales) is an Australian former rugby league footballer. A utility player, Smith played for the St. George Illawarra Dragons, South Sydney Rabbitohs and the North Queensland Cowboys in the National Rugby League (NRL). Are we justified in saying that \"He was a popular player\"? Yes, no, or maybe? Maybe\n###\nCarmen Lebbos (Arabic: \u0643\u0627\u0631\u0645\u0646 \u0644\u0628\u0651\u0633\u200e \u200e ; born 1963) is a Lebanese actress who has been working in film, television and the theater since 1981. She has been in several television series and movies including Ziad Doueiri\u2019s \"West Beyrouth\" and Josef Fares\u2019s \"Zozo\". Are we justified in saying that \"Carmen Lebbos was born in nineteen hundred sixty three.\"? Yes, no, or maybe?", "doc_id": 568, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15965, 38782, 40015, 15156], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ruddy Joraider Lugo (born May 22, 1980) is a former Dominican-American Major League Baseball right-handed relief pitcher. Lugo is the younger brother of shortstop Julio Lugo. He attended Xaverian High School (famous alumni include Chris Mullin and Rich Aurilia) in Brooklyn, New York. 
Are we justified in saying that \"Ruddy Luge used to be a Dominican-American major League Baseball right-handed relief pitcher.\"? Yes, no, or maybe? Yes\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"There were two players named Most Valuable Player but they didn't deserve the award.\"? Yes, no, or maybe? Maybe\n###\nLori-Jane Powell (born November 8, 1971) is a retired Canadian racquetball player from Prince Albert, Saskatchewan. Powell was Canadian Champion five times: thrice in singles and twice in doubles. She was forced to retire from competition in 2006 due to a right knee injury. Are we justified in saying that \"Lori-Jane Powell retired from racquetball thrice.\"? Yes, no, or maybe? No\n###\nDoomsday Device is a term used in professional wrestling to reference a tandem move in which one wrestler hoists the opponent on their shoulders so that they are facing in the same direction in what is known as the electric chair position, while another wrestler climbs the ring post to the top turnbuckle and delivers a flying attack on that prone opponent. Are we justified in saying that \"Doomsday Device was a popular term.\"? Yes, no, or maybe? Maybe\n###\nA sugar-baker was the owner of a sugar house, a factory for the refining of raw sugar from the Barbados. Sugar refining would normally be combined with sugar trading, which was a lucrative business. The architectural historian Kerry Downes gives an example of one sugar baker's house in Liverpool being estimated to bring in \u00a340,000 a year in trade from the Barbados. Are we justified in saying that \"Sugar refining is a lucrative business\"? Yes, no, or maybe?", "doc_id": 882, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41007, 956, 23903, 33545], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Antonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\". Are we justified in saying that \"Antonio Lewis is an American rapper from Brooklyn who has directed a music video and also produces all of his music. \"? Yes, no, or maybe? Maybe\n###\nMarvin Ivan \"Buck\" Barrow (March 14, 1903 \u2013 July 29, 1933) was a member of the Barrow Gang. He was the older brother of the gang's leader, Clyde Barrow. He and his wife Blanche were wounded in a gun battle with police four months after they joined up with Bonnie and Clyde. Marvin died of his wounds. Are we justified in saying that \"marvin was very loyal to the gang\"? Yes, no, or maybe? 
Maybe\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television. Are we justified in saying that \"Sebastian Philip Bierk was born before April 3, 1969.\"? Yes, no, or maybe? Yes\n###\nInnyaly (Russian: \u0418\u043d\u043d\u044f\u043b\u044b ) is a rural locality (a \"selo\") in Tolonsky Rural Okrug of Lensky District in the Sakha Republic, Russia, located 336 km from Lensk, the administrative center of the district and 30 km from Tolon, the administrative center of the rural okrug. Its population as of the 2002 Census was\u00a0169. Are we justified in saying that \"Innyaly is located 336 kms from Lensk.\"? Yes, no, or maybe? Yes\n###\nYngwie Johan Malmsteen ( ; born Lars Johan Yngve Lannerb\u00e4ck; 30 June 1963) is a Swedish guitarist and bandleader. Malmsteen first became known in the 1980s for his neoclassical metal playing style in heavy metal. In 2009, \"Time\" magazine rated Malmsteen as among the 10 greatest electric guitar players of all time. Are we justified in saying that \"Yngwie Johan Malmsteen is currently living\"? Yes, no, or maybe?", "doc_id": 382, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17934, 17314, 33725, 19583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "FS Kozani is a football club from Kozani, Greece. The club was founded in April 1964 in Kozani, Greece following the merger between Makedonikos and Olympiakos Kozanis. In the emblem the team kept the date both teams were founded in 1928 and took the colours of Olympiakos, the red and Makedonikos, the white. Kozani FC presently plays at the Kozani Stadium in Kozani. Are we justified in saying that \"FS Kozani players live in Greece all year round\"? Yes, no, or maybe? Maybe\n###\nHabib (Habibollah) Elghanian (Persian: \u062d\u0628\u06cc\u0628 (\u062d\u0628\u06cc\u0628\u200c\u0627\u0644\u0644\u0647) \u0627\u0644\u0642\u0627\u0646\u06cc\u0627\u0646\u200e \u200e , 5 April 1912 \u2013 9 May 1979) was a prominent Iranian Jewish businessman and philanthropist who served as the president of the Tehran Jewish Society and acted as the symbolic head of the Iranian Jewish community in the 1970s. Are we justified in saying that \"Israel will put up a stature of Habib in 2020 for his achievements\"? Yes, no, or maybe? Maybe\n###\nWenham Parva is a village and a civil parish in Suffolk, England. It covers the village of Little Wenham (whose ancient name it takes) and the hamlet of Wenham Grange. Located in Babergh district, it had a population of 20 in 2005, making it the joint-least populated parish in Suffolk alongside South Cove, Wangford and Wordwell. At the 2011 Census the population had increased to 185. Are we justified in saying that \"In 2005, Suffolk had more than one least populated parish,\"? Yes, no, or maybe? 
Yes\n###\nLate Spring () is a 2014 South Korean romance melodrama starring Park Yong-woo, Kim Seo-hyung and Lee Yoo-young. It portrays the true beauty and the platonic love discovered between a genius sculptor and his final model. It made its world premiere at the Santa Barbara International Film Festival in January 2014. Are we justified in saying that \"The film is a drama/\"? Yes, no, or maybe? Yes\n###\nMikhail Nikolayevich Baryshnikov (Russian: \u041c\u0438\u0445\u0430\u0438\u0301\u043b \u041d\u0438\u043a\u043e\u043b\u0430\u0301\u0435\u0432\u0438\u0447 \u0411\u0430\u0440\u044b\u0301\u0448\u043d\u0438\u043a\u043e\u0432 , Latvian: \"Mihails Bari\u0161\u0146ikovs\" ; born January 27, 1948), nicknamed \"Misha\" (Russian diminutive of the name \"Mikhail\"), is a Soviet and American dancer, choreographer, and actor. Are we justified in saying that \"Mikhail was also nicknamed MickaGho\"? Yes, no, or maybe?", "doc_id": 643, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6911, 9623, 20452, 43323], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Johnson College Prep is a public four-year charter high school located in the Englewood neighborhood on the south side of Chicago, Illinois, United States. It is a part of the Noble Network of Charter Schools. The school is named for African-American businessman and founder of the Chicago-based Johnson Publishing Company John H. Johnson and his wife Eunice Johnson. Are we justified in saying that \"Johnson College Prep was named after a African American publishing company owner.\"? Yes, no, or maybe? Yes\n###\nTom Clancy's Splinter Cell is a 2002 stealth video game developed by Ubi Soft Montreal and built on the Unreal Engine 2. It is the first \"Splinter Cell\" game in the series. Endorsed by author Tom Clancy, it follows the activities of NSA black ops agent Sam Fisher. The character of Fisher is voiced by actor Michael Ironside. Are we justified in saying that \"The character of Fisher is voiced by Tom Clancy.\"? Yes, no, or maybe? No\n###\nJeffrey Orlando Hunter (born April 12, 1966) is a former American football defensive lineman. In a career lasting almost a decade, he played five seasons for four different teams in the National Football League, as well as in the Canadian Football League and the World League of American Football. Hunter played college football at Albany State University in Albany, Georgia. Are we justified in saying that \"Jeffrey Orlando Hunter entered the Canadian Football League after leaving the National Football League\"? Yes, no, or maybe? Maybe\n###\nThe Linkou Power Plant () is a coal-fired power plant in Linkou District, New Taipei, Taiwan. With the previous total installed capacity of 600 MW, the power plant used to be the smallest coal-fired power plant in Taiwan. The power plant is currently undergoing retrofitting to increase its installed generation capacity to 2.4 GW. Are we justified in saying that \"The power plant's capacity is being quadrupled.\"? Yes, no, or maybe? 
Yes\n###\nGeorge Edward Foreman (born January 10, 1949) is an American former professional boxer who competed from 1969 to 1977, and from 1987 to 1997. Nicknamed \"Big George\", he is a two-time world heavyweight champion and an Olympic gold medalist. Outside the sport he is an ordained minister, author, and entrepreneur. Are we justified in saying that \"George Foreman returned to boxing after a 10 year hiatus \"? Yes, no, or maybe?", "doc_id": 495, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35963, 44894, 31098, 33145], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Fight or Flight\" is the fifth episode of the second season of the NBC science fiction drama series \"Heroes\". It aired on October 22, 2007. This episode marks the first appearance of Elle, played by Kristen Bell, credited as a series regular. David Anders (Takezo Kensei) is also credited as a series regular from this episode. Are we justified in saying that \"David Anders no longer features in the series.\"? Yes, no, or maybe? Maybe\n###\nJohn M. W. Moorlach (born December 21, 1955 in the Netherlands) is a Republican California State Senator representing 37th Senate district, which includes portions of Orange County, since March 22, 2015. He previously served on the Orange County Board of Supervisors from December 5, 2006 \u2013 January 5, 2015 and as Orange County Treasurer-Tax Collector from March 17, 1995 \u2013 December 5, 2006. Are we justified in saying that \"John M. W. Moorlach lost every election.\"? Yes, no, or maybe? No\n###\nRelient K is the debut studio album by American rock band Relient K. Many of the tracks are newer versions of those found on their 1998 demo \"All Work & No Play\". Typical of early Relient K albums, the lyrics use pop culture references for teaching and to illustrate Biblical principles. As of late 2006/early 2007, this album has sold around 400,000 copies. Are we justified in saying that \"It would be impossible to find pop culture references embedded in the lyrics of Relient K tracks.\"? Yes, no, or maybe? No\n###\nHaliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015. Are we justified in saying that \"Dantoro was from africa\"? Yes, no, or maybe? Yes\n###\nHenry Nelson Pope (April 23, 1859 - June 13, 1956) was president of the Texas Farmers Union and president of the Association of State Presidents of the Farmers' Education and Cooperative Union of America, and president of the American Federation of Organized Producers and Consumers. Are we justified in saying that \"Henry Nelson Pope is not currently living.\"? 
Yes, no, or maybe?", "doc_id": 844, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19160, 2736, 41344, 33168], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hipmunk, stylized as hipmunk, is a consumer-oriented online travel company headquartered in San Francisco, California. It was co-founded by Adam Goldstein and Reddit co-founder Steve Huffman in 2010. The company focuses on the visual organization of flight search results. It received increased media attention when Google announced its flight search service in September 2011. Are we justified in saying that \"Goldstein was older than Huffman in 2010.\"? Yes, no, or maybe? Maybe\n###\nSarah Beth Noriega (born April 24, 1976) is a former indoor volleyball player. She played for Loyola Marymount University from 1994 to 1997 and was named the 1997 West Coast Conference Player of the Year. She also played for the United States national team at the 2000 Summer Olympics. Are we justified in saying that \"sarah win golden medal in the 2000 summer olympics\"? Yes, no, or maybe? Maybe\n###\nMaria Ho (born March 6, 1983 in Taipei, Taiwan) is a Taiwanese American poker player, television personality and host. She is known as one of the top ranked female poker players in the world; a 3-time Bluff Reader's Choice Awards nominee for Favorite Female Poker Player and a World Series of Poker record-breaker, and for competing on the 15th season of \"The Amazing Race\". Are we justified in saying that \"Maria Ho can read faces.\"? Yes, no, or maybe? Maybe\n###\nThe 2004 IIFA Awards, officially known as the 5th International Indian Film Academy Awards ceremony, presented by the International Indian Film Academy honoured the best films of 2003 and took place between May 20\u201322, 2004. This year, the city of Singapore played host to the Indian Film Industry. The tag line of this year's IIFA Awards was \"Uniquely IIFA, Uniquely Singapore ...\". Are we justified in saying that \"The ceremony took place for approximately 24 hours\"? Yes, no, or maybe? Yes\n###\nThe Friant-Kern Canal is a 152 mi Central Valley Project aqueduct managed by the United States Bureau of Reclamation in Central California to convey water to augment irrigation capacity in Fresno, Tulare, and Kern counties. Construction began in 1949 and the canal was completed in 1951, at a cost of $60.8 million. Are we justified in saying that \"The Friant-Kern Canal is more than 200 kilometers long.\"? Yes, no, or maybe?", "doc_id": 531, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11614, 28, 61, 29363], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Eglinton Castle estate was situated at Irvine, on the outskirts of Kilwinning, North Ayrshire, Scotland (map reference NS 3227 42200) in the former district of Cunninghame. Eglinton Castle, was once home to the Montgomeries, Earls of Eglinton and chiefs of the Clan Montgomery. Eglinton Country Park now occupies part of the site. Are we justified in saying that \"Eglinton Castle was once home to three Earls.\"? Yes, no, or maybe? Maybe\n###\nCherry Tomato () is a 2008 South Korean film starring Shin Goo and Kim Hyang-gi. The family drama, a directorial debut by Jung Young-bae, depicts the poverty-stricken life of an old man and his granddaughter that evokes a strong sense of sympathy and helplessness. It was screened at the Busan Children\u2019s Film Festival in 2008. Are we justified in saying that \"Jung Young-bae directed one film in 2008\"? Yes, no, or maybe? Maybe\n###\nThirteen Ghosts (also known as 13 Ghosts and stylized as THIR13EN Ghosts) is a 2001 Canadian-American supernatural horror film directed by Steve Beck. It is a remake of the 1960 film \"13 Ghosts\" by William Castle. It follows the remake of another one of Castle's films, \"House on Haunted Hill\", and was shot entirely around Lower Mainland, British Columbia. Are we justified in saying that \"Thirteen Ghosts has had more than one film adaptation\"? Yes, no, or maybe? Yes\n###\n\"It's the Little Things\" is a 1967 single by Sonny James. \"It's the Little Things\" was Sonny James' twenty-fifth release on the country chart, the song went to number one on the country chart for five weeks and spent a total of fourteen weeks on the charts. Are we justified in saying that \"Only women liked the song\"? Yes, no, or maybe? Maybe\n###\nThe Angel and the Soldier Boy is the 13th album by Irish folk group Clannad, released in 1989. It is the soundtrack to the animated movie of the same name and contains both the music to the movie and the narration of the story by Tom Conti. The animation is based on the award-winning children's picture book by Peter Collington. Are we justified in saying that \"The Angel and the Soldier Boy was created within the past 30 years\"? Yes, no, or maybe?", "doc_id": 56, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21707, 1070, 26433, 40731], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Misty Knight is a fictional character appearing in American comic books published by Marvel Comics. Created by Tony Isabella and Arvell Jones, Knight was first mentioned (by name) in \"Marvel Premiere\" #20 (January 1975) and appeared in the next issue. Are we justified in saying that \"Misty Knight sold billions.\"? Yes, no, or maybe? 
Maybe\n###\nOverBlood is a science fiction video game developed by Riverhillsoft and published by Electronic Arts for the PlayStation. It is considered one of the first survival horror games to make use of a fully three-dimensional virtual environment, second only to Riverhillsoft's own \"Doctor Hauzer\", a fully 3D survival horror game released for the 3DO in 1994. Are we justified in saying that \"OverBlood was a very popular game.\"? Yes, no, or maybe? Maybe\n###\nThe Master of Revenge () is a 2016 South Korean television series starring Chun Jung-myung, Jo Jae-hyun, Jeong Yoo-mi, Lee Sang-yeob and Gong Seung-yeon. It aired on KBS2 from April 27, 2016 to June 30, 2016 on Wednesdays and Thursdays at 21:55 for 20 episodes. Are we justified in saying that \"Master of Revenge first aired in April 2016\"? Yes, no, or maybe? Yes\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology. Are we justified in saying that \"Pavel Sergeyevich Alexandrox made significant contributions to set theory and topology by writing three hundred papers.\"? Yes, no, or maybe? Yes\n###\nSoul Ballet is a musical project of actor, producer, arranger, programmer, and multi-instrumentalist Rick Kelly \"RK.\" Soul Ballet\u2019s music is smooth contemporary jazz/electronica, characterized as pulsating electronic beats entwined with a dark, moody atmosphere. Are we justified in saying that \"Soul Ballet works with Rick Kelly \"KR\"\"? Yes, no, or maybe?", "doc_id": 598, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31133, 28500, 38534, 38414], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "University of Maryland Eastern Shore (UMES), located on 745 acres (3.01 km) in Princess Anne, Maryland, United States, is part of the University System of Maryland. UMES is a historically black university, as well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund. Are we justified in saying that \"University of Maryland Eastern Shore helps a lot of students with financial aid\"? Yes, no, or maybe? Maybe\n###\nWalcha Shire is a local government area located in the New England region of New South Wales, Australia. The Shire is situated adjacent to the junction of the Oxley Highway and Thunderbolts Way and is 20 km east of the Main North railway line passing through Walcha Road. Are we justified in saying that \"Oxley Highway and Thunderbolts Way is east of the Main North railway line.\"? Yes, no, or maybe? Yes\n###\nChristopher Lawrence (born 24 December 1956) is a classical musician, author, and conductor. 
He is most notable for his work as a broadcaster on Australian Broadcasting Corporation radio station ABC Classic FM, and previously on ABC Radio National and ABC Local Radio. Are we justified in saying that \"Christopher is a blues musician. \"? Yes, no, or maybe? No\n###\nBouck's Island is a farm near Breakabeen, New York within the town of Fulton, Schoharie County, New York near Fultonham, New York. Bouck's Island was the home of former New York governor William C. Bouck. Congressman Joseph Bouck was born on Bouck's Island and Wisconsin Congressman Gabriel Bouck once lived there. Are we justified in saying that \"Congressman Joseph Bouck was born in Breakabeen, New York.\"? Yes, no, or maybe? No\n###\n\"The DArkest Knight\" is tenth episode of the seventh season of the American mystery\u2013thriller television series \"Pretty Little Liars\". The installment was directed by Arlene Sanford and written by showrunner I. Marlene King and executive producer Maya Goldsmith. It premiered on August 23, 2016, on the cable network Freeform. Are we justified in saying that \"Season seven had ten episodes before The Darkest Knight.\"? Yes, no, or maybe?", "doc_id": 148, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29722, 10927, 17997, 39526], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bela George Lugosi (born January 5, 1938 in Los Angeles, California), also known as Bela Lugosi Jr., is an American attorney and the son of actor B\u00e9la Lugosi. His legal actions in \"Lugosi v. Universal Pictures\" led to the creation of the California Celebrities Rights Act. Are we justified in saying that \"Bela was involved in multiple lawsuits.\"? Yes, no, or maybe? Maybe\n###\nRise of the Dragon is a graphic adventure game released in 1990 for DOS and Macintosh, and later remade for the Sega CD (1993) as well as the Amiga. It was one of the few adventure game titles developed by Dynamix, a company that was better known as an action and flight simulator game developer. Are we justified in saying that \"The genre of Rise of the Dragon is not the most familiar type for Dynamix.\"? Yes, no, or maybe? Yes\n###\nGun Bow (1960 \u2013 December 1979) was an American Thoroughbred racehorse. He was one of America's leading older male racehorses in 1964 and 1965 and was later inducted into the Hall of Fame. Gun Bow was noted for his rivalry with five-time American Horse of the Year Kelso. Are we justified in saying that \"Gun Bow was not a very fast dog.\"? Yes, no, or maybe? No\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums). Are we justified in saying that \"All of the members of Norwegian black metal band Emperor came back for their reunion in 2005.\"? Yes, no, or maybe? Maybe\n###\n\"Hang On\" is a song by the band Weezer. 
The song impacted radio on February 15, 2011. It is the seventh track and second single from their eighth studio album, \"Hurley\". The album version of \"Hang On\" is co-written by Rick Nowels and features Canadian actor Michael Cera on backing vocals and pseudo-mandolin. The single version features no mandolin, and contains a harder sound. Are we justified in saying that \"If you listen carefully you can hear Michael Cera in on Weezers song\"? Yes, no, or maybe?", "doc_id": 483, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40669, 7562, 33839, 31825], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Club Deportivo Aguilar is a football team based in Aguilar de Campoo in the autonomous community of Castile and Le\u00f3n. Founded in 1947, it plays in the Primera Provincial. Its stadium is \"Ciudad Deportiva Alberto Fern\u00e1ndez\" with a capacity of 6,000 seats. Are we justified in saying that \"Aguilar's is located in Spain (Leon)\"? Yes, no, or maybe? Maybe\n###\nDaraar is a 1996 Indian thriller film directed by Abbas-Mustan starring Juhi Chawla, Rishi Kapoor and Arbaaz Khan in his film debut. Khan received the Filmfare Best Villain Award for his portrayal as a possessive husband. The film became average at the box office and was inspired by the Julia Roberts thriller \"Sleeping with the Enemy\". Are we justified in saying that \"Daraar exceeded expectations at the box office and became a hit.\"? Yes, no, or maybe? No\n###\nThe William Martin Armistead House is a historic building at 1510 Hyde Park Street in Sarasota, Florida, United States. It was owned by William Martin Armistead who was prominent in the advertising industry. On March 30, 2009, it was added to the U.S. National Register of Historic Places. Are we justified in saying that \"Sarasota, Florida has many places on the National Registry\"? Yes, no, or maybe? Maybe\n###\nBianca Gascoigne (born 28 October 1986) is a British glamour model and television personality. She is the daughter of Sheryl Gascoigne, and adopted daughter of Paul Gascoigne, a former footballer. She has a brother Mason and a half-brother Regan Gascoigne. She came sixth in the nineteenth series of Channel 5 reality show \"Celebrity Big Brother\". Are we justified in saying that \"Bianca Gascoigne is 25 years old today\"? Yes, no, or maybe? No\n###\nBaker College Preparatory High School (also known as Baker College Prep) is a public four-year charter high school located in the South Chicago neighborhood on the far south side of Chicago, Illinois. It is operated by the Noble Network of Charter Schools. It shares its campus with Bowen High School. Baker is named for civil and human rights activist Ella Baker. Are we justified in saying that \"The school is going to be built in 2028\"? 
Yes, no, or maybe?", "doc_id": 932, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10762, 38617, 16116, 41960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Introduction to Finality\" is the 22nd episode of the third season of the American television series \"Community\" and the third season finale. It originally aired on May 17, 2012 on NBC. This was the last episode to air with series creator Dan Harmon as showrunner before he was fired, though Harmon would later return as showrunner for the 5th season. Are we justified in saying that \"\"Introduction to Finality\" is the third episode of Community\"? Yes, no, or maybe? No\n###\nHistory of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. Are we justified in saying that \"Lucy Mack Smith was familiar with her son's life and activities\"? Yes, no, or maybe? Yes\n###\nJulian William Kennedy Burnside AO QC (born 9 June 1949) is an Australian barrister, human rights and refugee advocate, and author. He practises principally in commercial litigation, trade practices and administrative law. He is known for his staunch opposition to the mandatory detention of asylum seekers, and has provided legal counsel in a wide variety of high-profile cases. Are we justified in saying that \"When added together, the numerals in the year Burnside was born equal 23.\"? Yes, no, or maybe? Yes\n###\nPedro Nuno Gon\u00e7alves Concei\u00e7\u00e3o Silva (born January 21, 1977) is a Portuguese former swimmer, who specialized in sprint freestyle events. He is a two-time Olympian (2000 and 2004) and a former Portuguese record holder in the 50 m freestyle (22.86). Silva is a resident athlete for Sport Alg\u00e9s e Dafundo, and is trained by his long-time coach, director, and mentor M\u00e1rio Madeira. Are we justified in saying that \"Pedro was by his long time coach and mentor M\u00e1rio Madeira\"? Yes, no, or maybe? Yes\n###\nThe Tasmanian Legislative Council is the upper house of the Parliament of Tasmania in Australia. It is one of the two chambers of the Parliament, the other being the House of Assembly. Both houses sit in Parliament House in the state capital, Hobart. Members of the Legislative Council are often referred to as MLCs. Are we justified in saying that \"Tasmania is the state capital. \"? Yes, no, or maybe?", "doc_id": 432, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10677, 32441, 7627, 8114], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christopher David Mole (born 16 March 1958) is a British Labour Party politician, who was the Member of Parliament (MP) for Ipswich from a by-election in 2001, after the death of Jamie Cann, and was re-elected in 2005. He was Parliamentary Under Secretary of State at the Department for Transport, until his defeat in the 2010 general election by Ben Gummer, son of former MP John Gummer. Are we justified in saying that \"John Gummer has a kid\"? Yes, no, or maybe? Yes\n###\nNASA John H. Glenn Research Center at Lewis Field is a NASA center, located within the cities of Brook Park and Cleveland between Cleveland Hopkins International Airport and the Cleveland Metroparks's Rocky River Reservation, with a subsidiary facility in Sandusky, Ohio. Its director is Janet L. Kavandi. Are we justified in saying that \"The center is named off a person who's name started with a J\"? Yes, no, or maybe? Yes\n###\nKeystone is an unincorporated community and census-designated place in central Keith County, Nebraska, United States. It lies along local roads near the North Platte River, northeast of the city of Ogallala, the county seat of Keith County. Its elevation is 3,100\u00a0feet (945\u00a0m). Although Keystone is unincorporated, it has a post office, with the ZIP code of 69144. Are we justified in saying that \"Keystone is the largest unincorporated community in central Keith County, Nebraska with a post office.\"? Yes, no, or maybe? Maybe\n###\nThe Ghost and Mrs. Muir (1947) is a romantic-fantasy film starring Gene Tierney and Rex Harrison. It was directed by Joseph L. Mankiewicz, and is based on a 1945 novel written by Josephine Leslie under the pseudonym of R. A. Dick. In 1945, 20th Century Fox bought the film rights to the novel, which had been published only in the United Kingdom at that time. It was shot entirely in California. Are we justified in saying that \" Josephine Leslie wrote the novel that inspired the movie The Ghost and Mrs. Muir \"? Yes, no, or maybe? Yes\n###\nThe Empire Icon Award is an honorary Empire Award presented by the British film magazine \"Empire\". The Empire Icon Award was first introduced at the 11th Empire Awards ceremony in 2006 with Brian Cox receiving the award. The award was absent from the 12th, 17th and 18th Empire Awards ceremonies. Hugh Jackman is the most recent winner in this category. Are we justified in saying that \"Brian Cox was honored at the 18th Empire Awards ceremony\"? Yes, no, or maybe?", "doc_id": 732, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14930, 11801, 20148, 583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hubertus \"Huib\" Wilton (12 March 1921 in Rotterdam \u2013 29 October 1959) was a Dutch tennis player. 
He was on the 1953 Netherlands Davis Cup team which also included among others Hans van Swol (his partner in the men's doubles), Boebi van Meegeren and Ivo Rinkel. In 1950 Wilton reached the second round at Wimbledon, losing to Henry Billington of Great Britain 6\u20131 7\u20135 11\u20139. Are we justified in saying that \"Billington was also on the 1952 Davis Cup team.\"? Yes, no, or maybe? Maybe\n###\nLA1:TV (strictly the Lancaster University Student Television Station, often simply LA1) is a non-profit student television station at Lancaster University. It is a constitutional part of the Lancaster University Students' Union (LUSU) but is run as an independent student society. Some of LA1\u2019s current programmes include \"Good Morning Lancaster\" (GML), \"Sugar TV\", and \"Sound Booth\". Are we justified in saying that \"LA1:TV is for profit\"? Yes, no, or maybe? No\n###\nBarbatodon is a mammal genus from the Upper Cretaceous period. It lived in Transylvania at the same time as some of the last dinosaurs and was a member of the extinct order of Multituberculata. It is within the suborder of Cimolodonta, and the family Kogaionidae. The genus \"Barbatodon\" was named by R\u00e3dulescu R. and Samson P. in 1986. Are we justified in saying that \"Barbatodon lived for millions of years.\"? Yes, no, or maybe? Maybe\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others. Are we justified in saying that \"Gay Sex in the 70s is one of the best documentaries ever.\"? Yes, no, or maybe? Maybe\n###\n\"(Baby) You Don't Have to Tell Me\" (often written \"You Don't Have to Tell Me\") is a song by New York songwriter Pete Antell (formerly of the American pop group The Chants) and first recorded bysinger Bobby Coleman. The obscure song was later recorded and released by the American pop group the Walker Brothers as their sixth single in 1966. The accompaniment was directed by Reg Guest. Are we justified in saying that \"The Walker Brothers released six singles before You Don't Have to Tell me.\"? Yes, no, or maybe?", "doc_id": 173, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11194, 25432, 34668, 12376], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968. Are we justified in saying that \"New York Mets had a losing record because they lost their star players\"? Yes, no, or maybe? 
Maybe\n###\nThe Tsavo Man-Eaters were a pair of man-eating Tsavo lions responsible for the deaths of a number of construction workers on the Kenya-Uganda Railway from March through December 1898. The significance of this pair of lions was their unusual behavior, such as the number of men killed and the manner of the attacks. Are we justified in saying that \"The Tsavo Man-Eaters were killed multiple people.\"? Yes, no, or maybe? Yes\n###\nThe Feed icon is for indicating that a web feed is available on a web page. It was originally invented for the use of RSS, but it is also common for Atom and other web feeds now. The icon is normally orange, with hex code #FA9B39. The original icon was created by Stephen Horlander, a designer at Mozilla. Are we justified in saying that \"The Feed icon is widely used by internet users\"? Yes, no, or maybe? Maybe\n###\nGhost Notes is the fifth full-length studio album by American rock band Veruca Salt, released on July 10, 2015, through El Camino Records. Produced by Brad Wood, who also produced the band's debut \"American Thighs\", it is the first to feature the band's original lineup since their second album, \"Eight Arms to Hold You\" (1997). Are we justified in saying that \"The members of Veruca Salt have remained the same since 1997.\"? Yes, no, or maybe? No\n###\nK\u00e1roly P\u00e1ncz\u00e9l (born April 3, 1961) is a Hungarian teacher and politician, member of the National Assembly (MP) for R\u00e1ckeve (Pest County Constituency XIII) from 2002 to 2006 and from 2010 to 2014. He was also Member of Parliament from the Pest County Regional List of Fidesz between 1998\u20132002 and 2006\u20132010. He was elected MP for Dabas (Pest County Constituency XI) in 2014. Are we justified in saying that \"K\u00e1roly P\u00e1ncz\u00e9l was the first female member of the National Assembly (MP) for R\u00e1ckeve.\"? Yes, no, or maybe?", "doc_id": 403, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23690, 23007, 40822, 16139], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The final of the 1983 Prudential Cup was played between India and West Indies at the Lord's Cricket Ground on 25 June 1983. This was the third consecutive World Cup final appearance for West Indies. India playing their first finals defeated the West Indies to claim the title. It was the first world cup win for India. Are we justified in saying that \"West Indies were mad at India.\"? Yes, no, or maybe? Maybe\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"The university has more men than women\"? Yes, no, or maybe? Maybe\n###\nThe 1821 Norfolk and Long Island Hurricane was one of four known tropical cyclones that have made landfall in New York City. 
Another, even more intense hurricane in pre-Columbian times (sometime between 1278 and 1438) left evidence that was detected in southern New Jersey by paleotempestological research. The third was the 1893 New York hurricane, and the fourth was Hurricane Irene in 2011. Are we justified in saying that \"There have been five other cyclones in New York City.\"? Yes, no, or maybe? No\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Kant served as VP during spring of 2002.\"? Yes, no, or maybe? Yes\n###\nA Daughter of the Wolf is a 1919 American silent drama film directed by Irvin Willat and written by Marion Fairfax and Hugh Pendexter. The film stars Lila Lee, Elliott Dexter, Clarence Geldart, Raymond Hatton, Richard Wayne, and Minnie Devereaux. The film was released on June 22, 1919, by Paramount Pictures. Are we justified in saying that \"A Daughter of the Wolf features acting\"? Yes, no, or maybe?", "doc_id": 893, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40017, 28038, 44093, 29953], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Samson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation. Are we justified in saying that \"Samson and Delilah, Op. 47 is performed in a German translation.\"? Yes, no, or maybe? Yes\n###\n\"We Really Shouldn't Be Doing This\" is a song written by Jim Lauderdale, and recorded by American country music artist George Strait. It was released in September 1998 as the third and final single from his album \"One Step at a Time\". It peaked at number 4 in the United States, and number 2 in Canada. Are we justified in saying that \"\"We Really Shouldn't Be Doing This\" was the last single from \"One Step at a Time\"\"? Yes, no, or maybe? Yes\n###\n\"Professional Rapper\" is a song by American rapper Lil Dicky from his debut studio album \"Professional Rapper\". It was released on July 31, 2015 as the album's second single. It was produced by Stan Lane and features a guest appearance by West Coast hip hop artist Snoop Dogg. Are we justified in saying that \"Snoop Dog appears on more than one song on the album Professional Rapper.\"? Yes, no, or maybe? Maybe\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017. 
Are we justified in saying that \"Murder of the Universe has been covered by black sabbath\"? Yes, no, or maybe? Maybe\n###\nDr. Jeckyll & Mr. Hyde was an American 1980s hip-hop group consisting of Andre \"Dr. Jeckyll\" Harrell and Alonzo \"Mr. Hyde\" Brown. The group was known for its corporate business image, wearing designer suits and ties while they rapped. The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records in 1980. Are we justified in saying that \"The group first performed under the name Harlem World Crew and recorded on Tayster and Rojac Records more than 1980 years ago.\"? Yes, no, or maybe?", "doc_id": 540, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18754, 22218, 37918, 41124], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Linyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"LYU has many students.\"? Yes, no, or maybe? Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"The war created chaos in the surrounding communities.\"? Yes, no, or maybe? Maybe\n###\nGreg Lazarus is the pen name of South African husband-and-wife writing duo Greg Fried and Lisa Lazarus. Greg Lazarus is the author of the novels \"Paradise\" and \"When in Broad Daylight I Open my Eyes\", and the couple have also published the memoir \"The Book of Jacob: A Journey into Parenthood\". Are we justified in saying that \" \"When in Broad Daylight I Open my Eyes\" was a song written by Elton John.\"? Yes, no, or maybe? No\n###\nThe Tesla Science Center at Wardenclyffe is a nonprofit organization established to develop a regional science and technology center at the site of Nikola Tesla's former Wardenclyffe laboratory on Long Island, New York. The center raised money through crowdfunding to purchase the property. Are we justified in saying that \"Nikola Tesla still resides at Wardenclyffe\"? Yes, no, or maybe? No\n###\nUSNS \"Lone Jack\" (T-AO-161) was a type Type T2-SE-A1 tanker laid down under Maritime Commission contract (USMC number 1783) by the Sun Shipbuilding & Dry Dock Co. of Chester, Pennsylvania (hull number 450) on 11 July 1944. The ship was launched on 21 October 1944, sponsored by Mrs. Julia W. Bruwier, and delivered to Cities Service Oil Co. 
of New York City on 31 October 1944. Are we justified in saying that \"USNS \"Lone Jack\" was launched before the end of WW2\"? Yes, no, or maybe?", "doc_id": 934, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23266, 28007, 21330, 6233], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"I'd Be Lost\" and \"Only One\" are two songs recorded by Australian singer-songwriter Sarah Blasko for her fifth studio album \"Eternal Return\". Both songs premiered on 13 September 2015 during Richard Kingsmill's new music segment on Triple J and were released as a double A-side on 18 September 2015. Are we justified in saying that \"Sarah Blasko writes all of her own music.\"? Yes, no, or maybe? Maybe\n###\nTrainspotting is a 1996 British black comedy crime film directed by Danny Boyle, and starring Ewan McGregor, Ewen Bremner, Jonny Lee Miller, Kevin McKidd, Robert Carlyle, and Kelly Macdonald in her acting debut. Based on the novel of the same name by Irvine Welsh, the film was released in the United Kingdom on 23 February 1996. Are we justified in saying that \"Trainspotting is intended to by funny.\"? Yes, no, or maybe? Yes\n###\nRecently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct. Are we justified in saying that \"There have been less than 80 recently extinct species.\"? Yes, no, or maybe? No\n###\nHow Green Was My Valley is a BBC Television serial based on the novel by Richard Llewellyn, and features one of the last performances by Stanley Baker. It was first shown in the UK from 29 December 1975 in six weekly parts, while producer Martin Lisemore also cast Si\u00e2n Phillips in his next production, \"I Claudius\" (1976). Are we justified in saying that \"How Green Was My Valley was based on a BBC Television serial. \"? Yes, no, or maybe? No\n###\nMan in a Hurry (French: \"L'Homme press\u00e9\" , Italian: \"L'ultimo giorno d'amore\" , released in UK as The Hurried Man) is a 1977 French-Italian drama film directed by \u00c9douard Molinaro and starring Alain Delon and Mireille Darc. It is based on the novel \"The Man in a Hurry\" by Paul Morand. It recorded admissions of 730,581 in France. Are we justified in saying that \"Man in a Hurry had over 1 million admissions in France.\"? Yes, no, or maybe?", "doc_id": 112, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17764, 30147, 35248, 41959], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Mast\u00edn Espa\u00f1ol or Spanish Mastiff, is a giant breed of dog, originating in Spain, originally bred to be a guard dog whose specialized purpose is to be a Livestock guardian dog protecting flocks (and occasionally herds) from wolves and other predators. The Mountain dog type has a heavier coat. Are we justified in saying that \"Spanish Mastiff is a good guard dog\"? Yes, no, or maybe? Maybe\n###\nThe Reunion (also titled as The Reunion: Live At The Hyatt Regency 9.11.2010) is a live album released on January 11, 2015 by the Washington, D.C.-based go-go band Rare Essence. The album was recorded live at the Hyatt Regency in Crystal City, Virginia on September 11, 2010. Are we justified in saying that \"The Reunion is a four member band\"? Yes, no, or maybe? Maybe\n###\nLourdes Ver\u00f3nica Ar\u00e9valos Elias (born January 13, 1984, in San Lorenzo) is a Paraguayan model and beauty pageant titleholder who represented her country in Miss Universe 2006 held in Los Angeles, California, USA on July 23, 2006. She won the title of \"3rd Runner-up\". In the same year she represented Paraguay in the Reina Sudamericana 2006 beauty pageant and won the title of \"2nd Runner-up\". Are we justified in saying that \"Lourdes Ver\u00f3nica Ar\u00e9valos Elias will play a role in the next Marvel film\"? Yes, no, or maybe? Maybe\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits. Are we justified in saying that \"This box set contains music by the band not commonly heard or released.\"? Yes, no, or maybe? Yes\n###\nPeter L. N. Padfield (born 1932) is a British author, biographer, historian, and journalist who specializes in naval history and in the Second World War period. His early journalism appeared under the name P. L. N. Padfield. As well as his non-fiction work, he has also published four novels. Are we justified in saying that \"Peter Padfield is a European novelist.\"? Yes, no, or maybe?", "doc_id": 120, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11222, 18738, 5948, 25654], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Peter Murray Kapetan was an American Broadway actor, singer and dancer notable for playing numerous roles during a thirty-year career. He was notable for performing in the musical \"The Wedding Singer\" as a Ronald Reagan impersonator. He appeared in \"Titanic\", \"Sunset Boulevard\", \"Joseph and the Amazing Technicolor Dreamcoat\", and \"Got Tu Go Disco\". Are we justified in saying that \"Peter Murray Kapetan appeared in a minimum of five different productions over his career. \"? 
Yes, no, or maybe? Yes\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"LYU has affordable tuition.\"? Yes, no, or maybe? Maybe\n###\nThe Middlewich Folk and Boat Festival takes place in June in Middlewich, Cheshire, England. The festival builds on the town's industrial heritage in which canal boats were used to move coal and other raw materials in the town for the production of salt, and then move the salt out of town, either for use directly, or as a raw material in the manufacture of chemicals such as chlorine and soda ash. Are we justified in saying that \"The Middlewich Folk and Boat Festival is the only festival that Middlewich, Cheshire, England celebrates during the month of June.\"? Yes, no, or maybe? Maybe\n###\nProject Gasbuggy was an underground nuclear detonation carried out by the United States Atomic Energy Commission on December 10, 1967 in rural northern New Mexico. It was part of Operation Plowshare, a program designed to find peaceful uses for nuclear explosions. Are we justified in saying that \"Project Gasbuggy caused radiation poisoning.\"? Yes, no, or maybe? Maybe\n###\nSomething Like Human is the second album by the band Fuel released in 2000 on Epic Records. \"Something Like Human\" reached #17 on the U.S. Billboard Top 200, and featured their first U.S. Top 40 hit with \"Hemorrhage (In My Hands)\" which reached #30 on the \"Billboard\" Hot 100 charts. It remains one of their most popular songs to date. Are we justified in saying that \"Something Like Human is an awesome album that was released in 2000, it reached nr. 2 on the U.S. Billboard Top 200 \"? Yes, no, or maybe?", "doc_id": 198, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [132, 14924, 20056, 7833], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "I Love Hong Kong is a 2011 Hong Kong comedy film produced and directed by Eric Tsang. Film stars Tsang, Tony Leung Ka-fai, Sandra Ng and a star-studded cast of Hong Kong stars. It was released in Chinese New Year Day. The sequel movies are I Love Hong Kong 2012 and I Love Hong Kong 2013. Are we justified in saying that \"I Love Hong Kong has three sequels.\"? Yes, no, or maybe? Yes\n###\nMIT Technology Review is a magazine published by the Massachusetts Institute of Technology. It was founded in 1899 as The Technology Review, and was re-launched without \"The\" in its name on April 23, 1998 under then publisher R. Bruce Journey. In September 2005, it underwent another transition under its then editor-in-chief and publisher, Jason Pontin, to a form resembling the historical magazine. Are we justified in saying that \"MIT Technology Review has been updated or re-branded twice.\"? Yes, no, or maybe? Yes\n###\nStanley Frederick Steele (born 5 January 1937) is an English former footballer. 
A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele. Are we justified in saying that \"Stanley Fredrick Steele scored 97 goals from 1955 until 1968.\"? Yes, no, or maybe? Yes\n###\nThe Nariphon (Thai: \u0e19\u0e32\u0e23\u0e35\u0e1c\u0e25 ), also known as Makkaliphon (Thai: \u0e21\u0e31\u0e01\u0e01\u0e30\u0e25\u0e35\u0e1c\u0e25 , from Pali \"makkaliphala\"), is a tree in Buddhist mythology which bears fruit in the shape of young female creatures. The maidens grow attached by their head from the tree branches. This tree grows at the Himaphan, a mythical forest where the female fruits are enjoyed by the Gandharvas who cut the fruits and take them away. Are we justified in saying that \"The Nariphon is also called Makkaliphon \"? Yes, no, or maybe? Yes\n###\nAllen S. Weiner, former Stanford Professor of International Law, is a senior lecturer in International Law at Stanford Law School, and co-director of the Stanford Program in International and Comparative Law and the Stanford Center on International Conflict and Negotiation. Are we justified in saying that \"Weiner is not smart. \"? Yes, no, or maybe?", "doc_id": 603, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34611, 1040, 14379, 8343], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015. Are we justified in saying that \"He is professional skateboarder who lived in his van on the west side of LA\"? Yes, no, or maybe? Yes\n###\nThe Key is a 1958 British war film set in 1941 during the Battle of the Atlantic. It was based on the 1951 novel \"Stella\" by Jan de Hartog (later republished as \"The Distant Shore\" and \"The Key\") and was directed by Sir Carol Reed. William Holden, Sophia Loren and Trevor Howard starred in the production. Are we justified in saying that \"Stella was a novel that took place in 1941 about the Battle of the Atlantic.\"? Yes, no, or maybe? Yes\n###\nRyman Auditorium (formerly Grand Ole Opry House and Union Gospel Tabernacle) is a 2,362-seat live performance venue, located at 116 5th Avenue North, in Nashville, Tennessee and is best known as the home of the \"Grand Ole Opry\" from 1943 to 1974. It is owned and operated by Ryman Hospitality Properties, Inc. Are we justified in saying that \"The Grand Ole Opry was moved to Memphis Tennessee after 1974\"? Yes, no, or maybe? Maybe\n###\nZina Lynna Garrison (born November 16, 1963) is a former professional tennis player from the United States. During her career, she was a women's singles runner-up at Wimbledon in 1990, a three-time Grand Slam mixed doubles champion, and a women's doubles gold medalist and singles bronze medalist at the 1988 Olympic Games. 
She is currently coaching Taylor Townsend. Are we justified in saying that \"Garrison played doubles with Graf.\"? Yes, no, or maybe? Maybe\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017. Are we justified in saying that \"In 2017, the final tournament in the 2017 Overwatch World Cup will be held during the last week of November.\"? Yes, no, or maybe?", "doc_id": 556, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17373, 20062, 15721, 34574], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Anna Sun\" is a song by American rock band Walk the Moon, originally for their 2010 album \"I Want! I Want!\". The song was written by band members Adrian Galvin, Nick Lerangis, Nicholas Petricca and Adam Reifsnyder about Adrian's ex-girlfriend, Anna Ceravolo. It was included on the band's 2012 major-label debut album, \"Walk the Moon\". It was released as a commercial single on February 7, 2012. Are we justified in saying that \"Anna Sun was the band's most profitable song\"? Yes, no, or maybe? Maybe\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant. Are we justified in saying that \"Gwyneth Paltrow and Kelly Preston know each other very well\"? Yes, no, or maybe? Maybe\n###\nTelephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh. Are we justified in saying that \"TSS exports to America\"? Yes, no, or maybe? Maybe\n###\nFrederick Wiseman (born January 1, 1930) is an American filmmaker, documentarian, and theatre director. His work is \"devoted primarily to exploring American institutions\". He has been called \"one of the most important and original filmmakers working today\". Are we justified in saying that \"Frederick Wiseman was considered very easy to work with\"? Yes, no, or maybe? Maybe\n###\nSusan Lynch (born 5 June 1971) is a Northern Irish actress. A three-time IFTA Award winner, she also won the British Independent Film Award for Best Supporting Actress for the 2003 film, \"16 Years of Alcohol\". Her other film appearances include \"Waking Ned\" (1998), \"Nora\" (2000), \"Beautiful Creatures\" (2000), and \"From Hell\" (2001). Are we justified in saying that \"Susan Lynch acted in \"16 Years of Alcohol\" which was released in the 2010s.\"? 
Yes, no, or maybe?", "doc_id": 809, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [43067, 37175, 44392, 16173], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records. Are we justified in saying that \"The album was released in 2015 + 1\"? Yes, no, or maybe? Yes\n###\nDance India Dance is an Indian dance competition show that airs on Zee TV, produced by Essel Vision Productions. The show is the national TV version of the Zee Bangla dance reality show \"Dance Bangla Dance\" which also has Mithun Chakraborty as the Grandmaster. Are we justified in saying that \"Dance India Dance is shown on national television.\"? Yes, no, or maybe? Yes\n###\nForest Friends (French: Les Copains de la For\u00eat ) is an animated television series initially broadcast on TF1, and later rerun on French children's network TiJi. Along with the CGI series \"The Odd Family\", this was one of the first shows produced by Timoon Animation. a company created by Philippe Mounier. Are we justified in saying that \"Timoon Animation was the sole company created by Philippe Mounier\"? Yes, no, or maybe? Maybe\n###\nDavid Thomas Bush (born November 9, 1979) is an American former professional baseball pitcher. He played in Major League Baseball (MLB) for the Toronto Blue Jays, Milwaukee Brewers, Texas Rangers, and Philadelphia Phillies. Bush also played for the SK Wyverns of the KBO League. Are we justified in saying that \"David Thomas Bush played in more than 3 different MLB teams\"? Yes, no, or maybe? Yes\n###\nYi Bangja, Crown Princess Uimin of Korea (also \"Euimin\", Japanese: \u674e\u65b9\u5b50 \"Ri Masako\") (4 November 1901 \u2013 30 April 1989) was the consort of Crown Prince Euimin of Korea. She and her husband would have been the emperor and empress of the Empire of Korea if Korea had not been annexed to the Empire of Japan in 1910. Are we justified in saying that \"Yi Bangja died when she was 50 years old.\"? Yes, no, or maybe?", "doc_id": 341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28714, 1437, 6384, 12146], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Englandsfarere (English: We Leave for England ) is a 1946 Norwegian war film directed by Toralf Sand\u00f8, starring Knut Wigert and J\u00f8rn Ording. 
The film follows the Norwegian resistance fighters Harald (Wigert) and Arild (Ording) in their flight from the Gestapo. Are we justified in saying that \"Knut Wigert was in Englandfarere film. \"? Yes, no, or maybe? Yes\n###\n\"Live Free or Die\" is the 71st episode of the HBO original series \"The Sopranos\" and the sixth of the show's sixth season. Written by David Chase, Terence Winter, Robin Green, and Mitchell Burgess, and directed by Tim Van Patten, it originally aired on April 16, 2006. Are we justified in saying that \"Tim Van Patten directed the entire sixth season of The Sopranos.\"? Yes, no, or maybe? Maybe\n###\nThe Love for Three Oranges, Op. 33, also known by its French language title L'amour des trois oranges (Russian: \u041b\u044e\u0431\u043e\u0432\u044c \u043a \u0442\u0440\u0451\u043c \u0430\u043f\u0435\u043b\u044c\u0441\u0438\u043d\u0430\u043c , \"Lyubov' k tryom apel'sinam\"), is a satirical opera by Sergei Prokofiev. Its French libretto was based on the Italian play \"L'amore delle tre melarance\" by Carlo Gozzi. The opera premiered at the Auditorium Theatre in Chicago, Illinois, on 30 December 1921. Are we justified in saying that \"The opera debuted two days before the new year.\"? Yes, no, or maybe? Yes\n###\nJoseph Maurice Francis Connaughton (15 August 1918 \u2013 12 February 1944) was an English first-class cricketer active 1939 who played for Middlesex. He was born in Paddington. During World War II he was commissioned in the Royal Artillery. He was drowned off the Maldives after SS \"Khedive Ismail\" was torpedoed; and officially declared dead one year later. Are we justified in saying that \"Joseph Maurice Francis Connaughton is alive.\"? Yes, no, or maybe? No\n###\nPLU Crew is the varsity rowing program for Pacific Lutheran University in Tacoma, Washington. The team was founded in 1964 as a joint program with University of Puget Sound. Today the team consists of Men's and Women's programs for both Varsity and Novice rowers, and competes as a member of the Northwest Collegiate Rowing Conference (NCRC) and Western Intercollegiate Rowing Association (WIRA). Are we justified in saying that \"PLU Crew never won a match\"? Yes, no, or maybe?", "doc_id": 530, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4686, 22129, 23287, 8184], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brandon Hughes (born September 25, 1980), better known by his stage name 6 Tre G is an American hip hop recording artist, record producer, and CEO from Fayette, Alabama. He is also the founder and CEO of Mazerati Records. 6 Tre G has released many studio albums Don Mazerati, Boss Muzik, El Trapo and many more. Are we justified in saying that \"Brandon Hughes was born in 1999\"? Yes, no, or maybe? No\n###\nCarmen Lebbos (Arabic: \u0643\u0627\u0631\u0645\u0646 \u0644\u0628\u0651\u0633\u200e \u200e ; born 1963) is a Lebanese actress who has been working in film, television and the theater since 1981. 
She has been in several television series and movies including Ziad Doueiri\u2019s \"West Beyrouth\" and Josef Fares\u2019s \"Zozo\". Are we justified in saying that \"Carmen Lebbos is from a small town in Portugal.\"? Yes, no, or maybe? No\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia. Are we justified in saying that \"He became a master of music at age 52\"? Yes, no, or maybe? Maybe\n###\nThe Mayor of Youngstown is the chief executive of the government of the city of Youngstown, Ohio. The term of office for the mayor and members of Youngstown City Council is four years. Youngstown, Ohio has had a total of 50 recorded mayoral administrations, including the city's current mayor. Youngstown has traditionally been led by Democratic mayors. Are we justified in saying that \"There has been 50 recorded people that took the position of The Mayor of Youngstown\"? Yes, no, or maybe? Yes\n###\nTango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center. Are we justified in saying that \"George Balanchine to Stravinsky's \"Tango\" made and premiered in 1982.\"? Yes, no, or maybe?", "doc_id": 618, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37762, 194, 36545, 2051], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wuqiang County () is county of southeastern Hebei province, China. It is under the administration of Hengshui City, with a population of 210,000 residing in an area of 442 km2 . Both China National Highway 307 and G1811 Huanghua\u2013Shijiazhuang Expressway pass through the county. Are we justified in saying that \"Wuqiang County has a population of over 200 thousand\"? Yes, no, or maybe? Yes\n###\n\"Paul Revere\" is a song by American hip hop group Beastie Boys, released as the third single from their debut album \"Licensed to Ill\" (1986). It was written by Adam Horovitz, Joseph Simmons, Darryl McDaniels, and Rick Rubin. It was produced by Rick Rubin and the Beastie Boys. The song tells a fictional story of how the Beastie Boys met. Are we justified in saying that \"\"Paul Revere\" took inspirations from Tupac and Biggie Smalls\"? Yes, no, or maybe? Maybe\n###\nOnce Upon a Time is an American fantasy drama television series that premiered on October 23, 2011, on ABC. The show takes place in the fictional seaside town of Storybrooke, Maine, whose residents are characters from various fairy tales transported to the \"real world\" town and robbed of their original memories by a powerful curse. 
Are we justified in saying that \"Once Upon a Time premiered over 3 years ago\"? Yes, no, or maybe? Yes\n###\nThe Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\". Are we justified in saying that \"Naughty Dog Developed the sequel to The Last of US\"? Yes, no, or maybe? Yes\n###\nMegan Malia Leilani McClung (April 14, 1972\u2013December 6, 2006) was the first female United States Marine Corps officer killed in combat during the Iraq War. Major McClung was serving as a public affairs officer in Al Anbar Province, Iraq when she was killed. Are we justified in saying that \"McClung was born in the fourth month.\"? Yes, no, or maybe?", "doc_id": 209, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21381, 33385, 22328, 35835], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Drake Hotel is a hospitality venue on Queen Street West in Toronto, Ontario, Canada near Parkdale. In addition to a nineteen-room boutique hotel, there is a restaurant lounge, corner caf\u00e9 with street-level patio, Sky Bar roof top patio, and the Drake Underground basement nightclub and live performance venue. Are we justified in saying that \"the drake hotel is a venue\"? Yes, no, or maybe? Yes\n###\nAucuba chinensis is a shrub or small tree, native to southern China, Taiwan, Burma and northern Vietnam. Typically it grows to 6 meters tall, though it can be larger. The leaves are thick, dark green above and light green below, sometimes with teeth along the margins. Are we justified in saying that \"Aucuba chenensis has dark green leaves and are smooth along the edges. \"? Yes, no, or maybe? No\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The man was not very well in his mental capabilities.\"? Yes, no, or maybe? Maybe\n###\n\"Day In, Day Out\" is a song written by Marv Green and Thom McHugh, and recorded by American country music artist David Kersh. It was released in May 1997 as the fourth single from his album \"Goodnight Sweetheart\". The song reached number 11 on the \"Billboard\" Hot Country Singles & Tracks chart in September 1997. Are we justified in saying that \"The song is more than 10 years old\"? Yes, no, or maybe? Yes\n###\nA cardigan is a type of knitted garment that has an open front. Commonly cardigans have buttons: a garment that is tied is instead considered a robe. A more modern version of the garment has no buttons and hangs open by design. By contrast, a pullover does not open in front but must be \"pulled over\" the head to be worn. It may be machine- or hand-knitted. 
Are we justified in saying that \"A cardigan has buttons.\"? Yes, no, or maybe?", "doc_id": 230, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11982, 1093, 9285, 44541], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Amboy Dukes were an American rock band formed in 1964 in Detroit, Michigan, best known for their one hit single \"Journey to the Center of the Mind\". The band's name comes from the title of a novel by Irving Shulman. In the UK the group's records were released under the name of The American Amboy Dukes because of the existence of a British group with the same name. Are we justified in saying that \"The Amboy Dukes has been to Russia.\"? Yes, no, or maybe? Maybe\n###\nDhanish Karthik (born 24 July 1989) is an Indian actor. He made his debut as Sanjeev Menon in the Malayalam film \"Ivide\" (2015) directed by Shyamaprasad. He recently finished filming for the Bollywood film Chef (2017 film) with Saif Ali Khan. The film, directed by Raja Krishna Menon, is slated to release in July 2017. This will be Karthik's debut in Bollywood. Are we justified in saying that \"Dhanish Karthik was born in the 70s.\"? Yes, no, or maybe? No\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound. Are we justified in saying that \"The Bavarian Mountain Hound is going to be used for police work.\"? Yes, no, or maybe? Maybe\n###\nMasquerade (Russian: \u041c\u0430\u0441\u043a\u0430\u0440\u0430\u0434 ) is a verse play written in 1835 by the Russian Romantic writer Mikhail Lermontov. The four-act play, set in 1830's St. Petersburg aristocratic society, highlights the rebellious spirit and noble mind of the protagonist, Yevgeny Arbenin. It is often compared with Shakespeare's \"Othello\" in its essential story line. Are we justified in saying that \"Mikhail Lermontov was born in 1805.\"? Yes, no, or maybe? Maybe\n###\nTaki's Magazine, called \"Takimag\" for short, is an online magazine of politics and culture published by the Greek paleoconservative journalist and socialite Taki Theodoracopulos and edited by his daughter Mandolyna Theodoracopulos. Initially called Taki's Top Drawer, the site was redesigned and relaunched under its current title in March 2008 with a subsequent redesign in 2010. Are we justified in saying that \"Taki's Magazine is the long version of Takimag\"? Yes, no, or maybe?", "doc_id": 33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7915, 36605, 23820, 11965], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Walkin' is the debut mini-album by South Korean singer Suran. It was released on June 2, 2017, by Million Market and distribuited by LOEN Entertainment. It consists of five songs, including \"Wine\" featuring rapper Changmo, previously released as a digital single, and the title track \"1+1=0\" featuring singer Dean. Are we justified in saying that \"Walkin' was released within the first six months of 2017\"? Yes, no, or maybe? Yes\n###\nSheree Victoria Murphy (born 22 August 1975) is an English actress and television presenter, best known for her roles as Tricia Dingle in the ITV soap opera \"Emmerdale\", Eva Strong in the Channel 4 soap opera \"Hollyoaks\" and Dakota Davies in the Australian soap opera \"Neighbours\". Are we justified in saying that \"Sheree Victoria Murphy is a not a very popular actress in the UK\"? Yes, no, or maybe? Maybe\n###\nGeoffrey Zakarian (born July 25, 1959) is an American Iron Chef, restaurateur, television personality and author. He is the executive chef of several restaurants in New York City, Atlantic City and Miami. He is featured on several television programs on the Food Network, including \"Chopped\" and in 2011, \"The Next Iron Chef\", where he won the right to join \"Iron Chef America\". Are we justified in saying that \"The executive chef of several restaurants in New York City, Zakarian won the right to join \"Iron Chef America\" in 2011 after performing well in \"Chopped.\"\"? Yes, no, or maybe? Maybe\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC. Are we justified in saying that \"SNOBOL was used by Bush.\"? Yes, no, or maybe? Maybe\n###\nLate Spring () is a 2014 South Korean romance melodrama starring Park Yong-woo, Kim Seo-hyung and Lee Yoo-young. It portrays the true beauty and the platonic love discovered between a genius sculptor and his final model. It made its world premiere at the Santa Barbara International Film Festival in January 2014. Are we justified in saying that \"The film is exactly about 5 years old\"? Yes, no, or maybe?", "doc_id": 752, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10164, 449, 3479, 13048], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "First World problem is a slang term used to refer to issues in First World nations that are complained about only because of the absence of more pressing concerns. The term was added to the \"Oxford Dictionary Online\" in November 2012, and to the \"Macquarie Dictionary Online\" in December 2012. 
Are we justified in saying that \"Jerzy Skolimowski debuted his shorts at the Cannes Film Festival.\"? Yes, no, or maybe? Maybe\n###\n\"Pop That\" is a song by American rapper French Montana. Released as the first single from his debut studio album \"Excuse My French\" (2013), it features guest appearances from fellow rappers Rick Ross, Drake and Lil Wayne. The song's backing track was composed by Lee On the Beats, who have also helped to write the song along with the four rappers. Are we justified in saying that \"Pop That was released 16 years ago.\"? Yes, no, or maybe? No\n###\nRonald Mayorga S\u00e1nchez (born June 26, 1984, Yumbo, Valle del Cauca, Colombia) is an award-winning Colombian journalist and TV anchor of \"Red\" in Caracol Television in Colombia. As a radio journalist who works with \"Blue Radio\" one of the radio station's imported from Latin America as a host in \"Vox Populi\". Are we justified in saying that \"Ronald Mayorga S\u00e1nchez won an award for journalism before 1980\"? Yes, no, or maybe? No\n###\nWireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues. Are we justified in saying that \"Ethereal ran into legal issues with its trademark so it was renamed.\"? Yes, no, or maybe? Yes\n###\nMahalakshmi (Tamil: \u0bae\u0b95\u0bbe\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf ) is an 2017 Indian-Tamil Language Family soap opera starring Kavya Shastry, Vallab, Anjali Rav and Lokesh. It replaced Nijangal and it broadcast on Sun TV on Monday to Saturday from 6 March 2017 at 12:30PM (IST). It was produced by Vision Time India Pvt Ltd and directed by Shan Karthik and M.K.Arunthavaraja. Are we justified in saying that \"in 2017 Mahalakshmi was broadcast for the first time when it replaced Nijangal\"? Yes, no, or maybe?", "doc_id": 64, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19942, 32593, 914, 8430], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Eli ben Yehudah ha Nazir ben Zechariah, Arabic Abu Kathir Yahya al Katib (\"Father of Kathir, Yahya the Scribe\" fl.Tiberias, 910s) was a grammarian and philologist of the Hebrew, Arabic and Aramaic languages. He may have been among the teachers in Tiberias of Saadia. Are we justified in saying that \"Eli ben Yehudah ha Nazir ben Zechariah was a philologist \"? Yes, no, or maybe? Yes\n###\nGrenzschutzgruppe 9 (GSG 9) (English: Border Protection Group 9 ) is the elite Police Tactical Unit of the German Federal Police (German: \"Bundespolizei\" ). GSG 9 counterparts on the state level are the Special Deployment Commandos (German: \"Spezialeinsatzkommandos (SEK)\" ). Are we justified in saying that \"GSG 9 consists of an ethnically diverse team of police members\"? Yes, no, or maybe? 
Maybe\n###\nRevelation of the Last Hero is a 1992 Hong Kong \"wuxia\" romance television series produced by TVB and starring Aaron Kwok, Noel Leung, Ada Choi, , Frankie Lam and Bryan Leung. The theme song of the series, titled \"Breeze in the Frost\" (\u9727\u88e1\u6e05\u98a8) by was sung by Kwok. Are we justified in saying that \"Revelation of the Last Hero was shown outside of Hong Kong.\"? Yes, no, or maybe? Maybe\n###\nJohn Howe (October 14, 1754 \u2013 December 27, 1835) was a loyalist printer during the American Revolution, a printer and Postmaster in Halifax, the father of the famous Joseph Howe, a spy prior to the War of 1812, and eventually a Magistrate of the Colony of Nova Scotia. He was born in Boston, Massachusetts Bay colony, the son of Joseph Howe, a tin plate worker of Puritan ancestry, and Rebeccah Hart. Are we justified in saying that \"Joseph Howe was of Native American ancestry.\"? Yes, no, or maybe? No\n###\nNabokov's Congeries was a collection of work by Vladimir Nabokov published in 1968 and reprinted in 1971 as \"The Portable Nabokov\". Because Nabokov supervised its production less than a decade before he died, it is useful in attempting to identify which works Nabokov considered to be his best, especially among his short stories. Are we justified in saying that \"Vladimir Nabokov published his \"best-of\" collection in the late 60s.\"? Yes, no, or maybe?", "doc_id": 740, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38891, 24524, 10268, 5192], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017. Are we justified in saying that \"The Overwatch World Cup 2017 is an \"Overwatch\" eCook tournament\"? Yes, no, or maybe? No\n###\nPetasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Due to its large rhubarb-like leaves during the growing season it is mistaken for Rhubarb plant\"? Yes, no, or maybe? Maybe\n###\nZuikaku (Japanese: \u745e\u9db4 \"Auspicious Crane\") was a \"Sh\u014dkaku\"-class aircraft carrier of the Imperial Japanese Navy. Her complement of aircraft took part in the attack on Pearl Harbor that formally brought the United States into the Pacific War, and she fought in several of the most important naval battles of the war, before being sunk during the Battle of Leyte Gulf. Are we justified in saying that \"The Auspicious Crane was part of the reason that the United States joined the Pacific War in World War 2.\"? Yes, no, or maybe? Yes\n###\nWayne Coles-Janess is an Australian producer, writer and director of drama and documentary film and TV programs. 
Based in Melbourne, Australia, he has produced documentaries about frontier places in the country. He has also made some documentaries in several international locations, including during times of war. Are we justified in saying that \"wayne cole-janess is an american banker.\"? Yes, no, or maybe? No\n###\nNiels Bagge Hansen better known by the stage name Vinnie Who (born on 1 November 1987) is a Danish indie pop and disco singer and songwriter who released two albums, whose the debut \"Then I Met You\" in 2010 and \"Midnight Special\" and is signed to EMI Denmark. An androgynous male singer, he sings in a distinctive high-pitched feminine voice. Are we justified in saying that \"Vinnie Who is from Denmark and sings in a feminine voice because he is androgynous.\"? Yes, no, or maybe?", "doc_id": 77, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13688, 3390, 560, 706], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "A startup company (startup or start-up) is an entrepreneurial venture which is typically a newly emerged, fast-growing business that aims to meet a marketplace need by developing a viable business model around an innovative product, service, process or a platform. A startup is usually a company designed to effectively develop and validate a scalable business model. Are we justified in saying that \"Startup companies do not develop business models.\"? Yes, no, or maybe? No\n###\nGame Plan was a pinball manufacturer that produced pinball tables from 1978 to 1985. Game Plan was a subsidiary of AES Technology Systems and was located in Elk Grove Village, Illinois. Game Plan's president was former Chicago Coin table designer Wendell McAdams. Are we justified in saying that \"Game Plan had a lot of employees\"? Yes, no, or maybe? Maybe\n###\nSlam Creepers were a rock band from Vansbro, Sweden formed by Bj\u00f6rn Skifs in 1962 and broke up in 1969. Bj\u00f6rn Skifs then went on a solo career. They had some moderate hits including a cover of a Foundations song \"We Are Happy People\". Skifs would later find success with a number 1 hit \"Hooked on a Feeling as a member of Blue Swede. Are we justified in saying that \"Slam Creepers broke up in 1968.\"? Yes, no, or maybe? No\n###\nCynthia Mort (born June 18, 1956) is an American director, screenwriter, and producer. Mort has worked primarily in television since beginning her career in 1994, writing for the sitcom \"Roseanne\". Her notable works include the HBO series \"Tell Me You Love Me\" as a creator and executive producer, the revenge film \"The Brave One\" (2007) as a screenwriter, and the biopic \"Nina\" (2016) as a director. Are we justified in saying that \"Mort dabbled a bit in television\"? Yes, no, or maybe? No\n###\nRegent Power is a Private Limited company incorporated in 2007 and a concern of Habib Group of Chittagong. The plant is located at Barabkunda in Chittagong under independent power producers (IPP) arrangement of the government. Regent Power is operating on natural gas with the capacity to produce 22 MW per day. 
Are we justified in saying that \"The government organizes independent power producers for Reagent Power company.\"? Yes, no, or maybe?", "doc_id": 390, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35052, 41406, 44696, 42822], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Budapest Gypsy Symphony Orchestra is a Hungarian symphony orchestra of Romani (Gypsy) musicians. It emphasizes works by composers inspired by Hungarian folk music including Johannes Brahms, Vittorio Monti, Piotr Tcha\u00efkovski, Johann Strauss and Johann Strauss II. The orchestra has been performing for Are we justified in saying that \"Budapest Gypsy Symphony Orchestra is a popular orchestra worldwide\"? Yes, no, or maybe? Maybe\n###\nThe 2009 British Speedway Championship was the 2009 edition of the British Speedway Championship. The Final took place on 20 May at Wimborne Road in Poole, England. The Championship was won by Chris Harris, who beat Edward Kennett, Tai Woffinden and Lee Richardson in the final heat. It was the second time Harris had won the title. Are we justified in saying that \"The tournament took place in Poole.\"? Yes, no, or maybe? Yes\n###\nListennn... the Album is the debut studio album by American hip hop disc jockey DJ Khaled. It was released on June 6, 2006. by Terror Squad Entertainment and Koch Records. The album features guest appearances from Young Jeezy, Bun B, Birdman, Juelz Santana, Slim Thug, Krayzie Bone, Chamillionaire, Trina, Twista, Freeway, Jadakiss, Beanie Sigel, Styles P and Lil Scrappy, among others. Are we justified in saying that \"The album was released the year after 2004.\"? Yes, no, or maybe? No\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. Are we justified in saying that \"Few players are ambidextrous and Adriano plays as a defender or midfielder with them.\"? Yes, no, or maybe? Yes\n###\nThe 2013 MBC Drama Awards () is a ceremony honoring the outstanding achievement in television on the Munhwa Broadcasting Corporation (MBC) network for the year of 2013. It was held on December 30, 2013 and hosted by actor Lee Seung-gi and actress Han Ji-hye. Are we justified in saying that \"Munhwa Broadcasting Corporation produces movies exclusively in Asia.\"? Yes, no, or maybe?", "doc_id": 984, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38765, 18243, 15317, 12736], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Johnny Kidd (born Frederick Albert Heath, 23 December 1935 \u2013 7 October 1966) was an English singer and songwriter, best remembered as the lead vocalist for the rock and roll band Johnny Kidd & the Pirates. He was one of the few pre-Beatles British rockers to achieve worldwide fame, mainly for his 1960 hit, \"Shakin' All Over\". Are we justified in saying that \"Johnny Kidd made millions of dollars during his career.\"? Yes, no, or maybe? Maybe\n###\nThe Girl from Jones Beach is a 1949 American comedy film directed by Peter Godfrey and written by I. A. L. Diamond. The film stars Ronald Reagan, Virginia Mayo, Eddie Bracken, Dona Drake, Henry Travers and Lois Wilson. The film was released by Warner Bros. on July 16, 1949. Are we justified in saying that \"The Girl from Jones Beach is a film from the 20th century\"? Yes, no, or maybe? Yes\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\". Are we justified in saying that \"Vishwasrao's was the most popular actor in 2011.\"? Yes, no, or maybe? Maybe\n###\nThe 2015 J&T Banka Prague Open was a professional tennis tournaments played on outdoor clay courts. It was the 6th edition of the tournament which was an International tournament on the 2015 WTA Tour. It took place at the Sparta Prague Tennis Club in Prague, Czech Republic, from 27 April to 2 May 2015. This was the event's first edition as a WTA International tournament. Are we justified in saying that \"The tournament concluded in May\"? Yes, no, or maybe? Yes\n###\nABS is the Australian Broadcasting Corporation's television station in Adelaide, South Australia. It began broadcasting on 11 March 1960 from studios in the suburb of Collinswood. The station's transmitter is located at Mount Lofty, and is one of a series of relay transmitters located throughout the state. Are we justified in saying that \"The station's transmitter has been used for over 50 years\"? Yes, no, or maybe?", "doc_id": 616, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26502, 10382, 22163, 10619], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single. Are we justified in saying that \"\"Come Back in One Piece\" was a major urban contemporary hit.\"? 
Yes, no, or maybe? No\n###\nA semi-automatic pistol is a type of pistol that is semiautomatic, meaning it uses the energy of the fired cartridge to cycle the action of the firearm and advance the next available cartridge into position for firing. One cartridge is fired each time the trigger of a semi-automatic pistol is pulled; the pistol's \"disconnector\" ensures this behavior. Are we justified in saying that \"A semi-automatic pistol is becoming more popular lately because of its use in movies \"? Yes, no, or maybe? Maybe\n###\nThe Cashman Center or the Cashman Field Center is a 483000 sqft complex on a 55 acre site in Las Vegas, Nevada. Operated by the Las Vegas Convention and Visitors Authority it includes Cashman Field and a 98100 sqft convention center. The center is mostly used for locals events, but does host national events like the second and the 2008-09 United States Bowling Congress Open Championships. Are we justified in saying that \"The Cashman Center only operates in Las Vegas.\"? Yes, no, or maybe? Maybe\n###\nThe 35th Annual GMA Music Awards (the show had a name change in 2004-05) were held on April 28, 2004 recognizing accomplishments of musicians for the year 2003. The show was held at the Municipal Auditorium in Nashville, Tennessee, and was hosted by Deion Sanders and Yolanda Adams. Are we justified in saying that \"In Nashville, Tennessee the GMA Music Awards were held for the thirty-fifth time in 2004.\"? Yes, no, or maybe? Yes\n###\nHillcrest School District is a school district based in Strawberry, Arkansas, United States. The district encompasses 266.08 mi2 of land in Lawrence, Independence, and Sharp counties and serves portions of Strawberry, Evening Shade, Poughkeepsie, Smithville, Williford, Cave City, Ravenden, Black Rock, Imboden, Saffell, Lynn, and Powhatan. Are we justified in saying that \"Hillcrest School District has a small geographical area\"? Yes, no, or maybe?", "doc_id": 527, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21654, 13128, 42267, 7804], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Goodbye on a Bad Day\" is a debut song co-written and recorded by American country music artist Shannon Lawson. It was released in February 2002 as the first single from the album \"Chase the Sun\". The song reached #28 on the \"Billboard\" Hot Country Singles & Tracks chart. The song was written by Lawson and Mark A Peters. Are we justified in saying that \"Goodbye on a Bad Day was covered by Bill Joe.\"? Yes, no, or maybe? Maybe\n###\nThe New York Blade was a free weekly newspaper focusing on lesbian, gay, bisexual and transgender (LGBT) issues in New York City, New York. The \"Blade\" was a member of the National Gay Newspaper Guild, and contained news, entertainment, classified ads, and free personals for men and women. Are we justified in saying that \"The New York Blade was published weekly\"? Yes, no, or maybe? Yes\n###\nClearance Giddens is an African American Elvis impersonator from Melfa, Virginia, who has been billed as the \"Black Elvis\". 
He has appeared on the \"The Arsenio Hall Show\" and the \"Geraldo Show\", and in the film \"Honeymoon in Vegas\". In the early 1990s, he also sang on stage in a duet with Jimmy Buffett singing \"Jailhouse Rock\". He is listed in the book \"I Am Elvis: A Guide to Elvis Impersonators\". Are we justified in saying that \"I just learned who Clearance Giddens is.\"? Yes, no, or maybe? Maybe\n###\nHudson Valley Community College, a SUNY associated two-year college, is located in Troy in Rensselaer County, New York. Although about eighty percent of the students are from the local area, the remainder are from other parts of New York, other states and from some 30 countries around the world. Are we justified in saying that \"About eighty percent of students at Hudson Valley Community College live within walking distance from the campus\"? Yes, no, or maybe? Maybe\n###\nRecorrupted is a limited edition EP by Whitechapel that was released on November 8, 2011 through Metal Blade Records. It consists of one original song, two of their previously released songs remixed (\"This Is Exile\" and \"Breeding Violence\"), an acoustic version of \"End of Flesh\" and a cover of the Pantera song \"Strength Beyond Strength\". Are we justified in saying that \"Recorrupted consists of 5 songs.\"? Yes, no, or maybe?", "doc_id": 979, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19990, 6387, 16479, 26960], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Brandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013. Are we justified in saying that \"Brandon Tyler McManus is over 4 years old\"? Yes, no, or maybe? Yes\n###\nThe Magic Roundabout in Swindon, England, was constructed in 1972 and consists of five mini-roundabouts arranged around a sixth central, anticlockwise roundabout. Located near the County Ground, home of Swindon Town F.C., its name comes from the popular children's television series \"The Magic Roundabout\". In 2009 it was voted the fourth scariest junction in Britain, in a poll by Britannia Rescue. Are we justified in saying that \"The popular children's television series The Magic Roundabout started in 1972.\"? Yes, no, or maybe? Maybe\n###\nJ. D.'s Revenge is a blaxploitation horror film released in 1976. It starred Glynn Turman and Lou Gossett. The main character becomes an unwilling host for the restless spirit of J.D. Walker, a hustler killed 30 years earlier when he was wrongfully accused of killing his sister. Are we justified in saying that \"The tale was based on a true story.\"? Yes, no, or maybe? Maybe\n###\nNight of Terror is a 1933 American Pre-Code horror film directed by Benjamin Stoloff, and starring Bela Lugosi, Sally Blane, Wallace Ford, and Tully Marshall. Despite receiving top billing, Bela Lugosi has a relatively small part. 
The film is also known as He Lived to Kill and Terror in the Night. Are we justified in saying that \"Night of Terror won an Academy Award.\"? Yes, no, or maybe? Maybe\n###\nRatatouille is a video game based on the Pixar film of the same name. It was developed at Heavy Iron Studios and released by THQ, on June 26, 2007. \"Ratatouille\" was released on thirteen systems\u2014Wii, Nintendo DS, PlayStation 3, PlayStation 2, PSP, Xbox 360, Xbox, Nintendo GameCube, Game Boy Advance, Microsoft Windows, OS X, J2ME, and mobile phone. Are we justified in saying that \"Ratatouille was released on the Wii U\"? Yes, no, or maybe?", "doc_id": 278, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40929, 19318, 16044, 5413], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ceres\u2013Negros Football Club, commonly referred to as Ceres\u2013Negros or just Ceres, is a Filipino football club based in the city of Bacolod, Negros Occidental that plays in the Philippines Football League. The club is a member of the Negros Occidental Football Association. It was previously known as the Ceres\u2013La Salle Football Club. Are we justified in saying that \"The Negros Occidental Football Association was previously known as the Ceres-La Salle Football Club.\"? Yes, no, or maybe? No\n###\nRoderick Dwayne \"Rod\" Higgins (born January 31, 1960) is an American retired professional basketball player who formerly served as president of basketball operations for the National Basketball Association's Charlotte Hornets. He is also the father of former Charlotte Hornets point guard Cory Higgins. Are we justified in saying that \"Higgins was born the last day of the month.\"? Yes, no, or maybe? Yes\n###\nMarques Ackerman (born 1 March 1996) is a South African first-class cricketer. He was included in the North West squad for the 2016 Africa T20 Cup. In August 2017, he was named in Durban Qalandars' squad for the first season of the T20 Global League. Are we justified in saying that \"Marques Ackerman moved to South Africa at a young age.\"? Yes, no, or maybe? Maybe\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521. Are we justified in saying that \"South of Port Melbourne is Port Phillip.\"? Yes, no, or maybe? Yes\n###\nLeonard \"Boogie\" Weinglass (born 1941) is a charismatic American businessman who founded retailer Merry-Go-Round, a chain of restaurants named Boogie\u2019s Diner, and whose early life was portrayed by actor Mickey Rourke in the 1982 classic American film \"Diner\". Are we justified in saying that \"Boogie's Diner was created in the 1980s.\"? 
Yes, no, or maybe?", "doc_id": 448, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10487, 21020, 10278, 8092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Live from the Gaiety is a live album by The Dubliners. It was recorded during the Irish leg of their tour celebrating forty years on the road. The double album was recorded at the Gaiety Theatre in Dublin in June 2002. All surviving members took part. A companion double DVD of the concert in its entirety was also released. Are we justified in saying that \"The tour was celebrating the 40th year on the road. All the dead members took part in the tour.\"? Yes, no, or maybe? No\n###\nSamuel Eto'o Fils (] ; born 10 March 1981) is a Cameroonian professional footballer who plays as a striker for Turkish club Antalyaspor. He is the most decorated African player of all time, having won the African Player of the Year award a record four times: in 2003, 2004, 2005 and 2010. He was third in the FIFA World Player of the Year award in 2005. Are we justified in saying that \"Samuel Eto'o Fils plays as a striker for the Cameroon club Antalyaspor.\"? Yes, no, or maybe? No\n###\nSNOBOL (\"StriNg Oriented and symBOlic Language\") is a series of computer programming languages developed between 1962 and 1967 at AT&T Bell Laboratories by David J. Farber, Ralph E. Griswold and Ivan P. Polonsky, culminating in SNOBOL4. It was one of a number of text-string-oriented languages developed during the 1950s and 1960s; others included COMIT and TRAC. Are we justified in saying that \"SNOBOL stands for \"StriNg Oriental and symBOlic Language\".\"? Yes, no, or maybe? No\n###\nHappy Mother's Day, Love George (also known Run Stranger, Run) is a 1973 American mystery film produced and directed by Darren McGavin. The film stars Patricia Neal, Cloris Leachman, Bobby Darin, Tessa Dahl, Ron Howard, Kathie Browne, Joe Mascolo, Simon Oakland, and Thayer David. Are we justified in saying that \"Run Stranger, Run was released in the later half of the 1980s\"? Yes, no, or maybe? No\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia. Are we justified in saying that \"Wallace Michael Ross was not into art.\"? Yes, no, or maybe?", "doc_id": 66, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27400, 1164, 17077, 40903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael Cassio, or simply Cassio, is a fictional character in William Shakespeare's \"Othello\". The source of the character is the 1565 tale \"Un Capitano Moro\" by Cinthio; Cassio is unnamed in Cinthio but referred to as \"the squadron leader\". In the play, Cassio is a young and handsome lieutenant under Othello's command who becomes one of Iago's several victims in a plot to ruin Othello. Are we justified in saying that \"Cassio was young in real life\"? Yes, no, or maybe? Maybe\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration. Are we justified in saying that \"The High Bridge Branch line followed the North Branch of the Raritan River for much of its duration.\"? Yes, no, or maybe? No\n###\nMount Willey is a mountain located in Grafton County, New Hampshire. The mountain is named after Samuel Willey, Jr. (1766\u20131826) and his family, who in 1825 moved into a house in Crawford Notch. The family was killed a year later in August 1826 during a landslide. Are we justified in saying that \"The family was killed in Mount Grafton.\"? Yes, no, or maybe? No\n###\n\"Big Jet Plane\" is a song written by Australian singer/songwriter Angus Stone, and originally recorded by Stone, under the name \"Lady of the Sunshine\", on his 2009 album \"Smoking Gun\". It was then re-recorded by Stone and his sister Julia, as the duo Angus & Julia Stone, and released on their 2010 album \"Down the Way\", as well as their 2010 EP \"Big Jet Plane\". Are we justified in saying that \"Big Jet plane was written by Aerosmith.\"? Yes, no, or maybe? No\n###\nJames Carlos Agravante Yap Sr. (born February 15, 1982) is a Filipino professional basketball player for the Rain or Shine Elasto Painters of the Philippine Basketball Association (PBA). Known by his nickname Big Game James, he had played for the Star Hotshots for twelve seasons winning seven PBA championships before being traded on 2016. He is also a twelve-time PBA All-Star through 2004 to 2015. Are we justified in saying that \"James Carlos ended his carreer after playing for twelse sevens in a row winning seven PBS championships\"? Yes, no, or maybe?", "doc_id": 623, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32764, 43119, 24411, 1763], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Adriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. 
One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. Are we justified in saying that \"Adriano Correia Claro is married\"? Yes, no, or maybe? Maybe\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke. Are we justified in saying that \"Lost averaged more than 19 episodes per season by the 6th season\"? Yes, no, or maybe? Yes\n###\n\"I'm So Sorry\" is a song by American rock band Imagine Dragons. The song serves as the second promotional single and fourth track from the band's second studio album \"Smoke + Mirrors\". Along with the songs \"Hopeless Opus\" and \"Gold\" on \"Smoke + Mirrors\", the song touches upon lead-singer Dan Reynolds' depression struggles. The song has peaked at number 14 on the \"Billboard\" Hot Rock Songs chart. Are we justified in saying that \"Hopeless Opus is a catchy song.\"? Yes, no, or maybe? Maybe\n###\nK Theory is an electronic hip-hop act by Dylan Lewman, which formerly included Dustin Musser and Malcolm Anthony. The group was founded by Dylan Lewman and Dustin Musser in 2011. They have done official remixes for Flo Rida's \"GDFR\", Rich Homie Quan's \"Flex\", Fetty Wap's \"Trap Queen\", and many more songs. Their remixes and originals have over to 100 millions plays across all platforms. Are we justified in saying that \"K Theory's originals are not as popular as their remixes.\"? Yes, no, or maybe? Maybe\n###\nFuhrmann & Schmidt Brewing Company was formed in 1906 and was located at Commerce and Washington Streets in Shamokin, Pennsylvania. Fuhrmann & Schmidt was the successor company to the Eagle Brewing Company (1854 \u2013 1878), the M. Markel & Company (1878 \u2013 1893) and Phillip H Fuhrmann (1893 \u2013 1906). Are we justified in saying that \"Fuhrmann & Schmidt Brewing Company has in its name only one of the three entities it was named after.\"? Yes, no, or maybe?", "doc_id": 870, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30330, 25993, 41429, 34246], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "City Mall is a shopping mall located in Amman, Jordan owned by the Al-Khayr Real Estate Investment Company. It was opened in 2006. The mall extends over an area of 160,000 sq., of which 55,000 sq. are leasable. Anchors include Carrefour, Zara, BeBe, Debenhams, Aizone, GAP, Virgin Megastores, Mango, TGIF, Grand Cinemas, H & M, and Jingo Jungle. Are we justified in saying that \"City Mall is one of 4 malls in amman\"? Yes, no, or maybe? Maybe\n###\nCocaine is a highly caffeinated energy drink distributed by Redux Beverages. It contains more caffeine than rival energy drinks Red Bull and Rockstar, symbolized by three and a half steer heads on the label. 
Aside from caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks. Are we justified in saying that \"Aside from 280mg caffeine, the label claims 750 milligrams of taurine, another common ingredient found in many energy drinks.\"? Yes, no, or maybe? Maybe\n###\nFinniss is an electoral district of the House of Assembly in the Australian state of South Australia. It is named after B. T. Finniss, the first Premier of South Australia. It covers a 5,875 km\u00b2 regional area which includes Kangaroo Island, the Fleurieu Peninsula, and the towns of Kingscote, Middleton, Mount Compass, Penneshaw, Port Elliot, Sellicks Beach, Victor Harbor and Yankalilla. Are we justified in saying that \"B.T. Finniss is the leader of the electoral district of the House of Assembly.\"? Yes, no, or maybe? Maybe\n###\nAngus Scott (16 August 1927 \u2013 16 March 1990) was a British track and field athlete who competed in sprinting events. He represented Great Britain at the 1952 Summer Olympics. He was affiliated with the Achilles Club. He was part of the winning British 4\u00d7400 metres relay team at the 1950 European Athletics Championships. Are we justified in saying that \"Angus Scott was born in Scotland\"? Yes, no, or maybe? Maybe\n###\nAnna Pihl is a Danish police drama produced by TV2. The series stars Charlotte Munck (\"Kongekabale\") as the title character Anna Pihl, Peter Mygind, and Iben Hjejle (\"High Fidelity\" and \"Blinkende Lygter\") as Mikala. Three seasons have been produced between 2006 and 2008, each having 10 episodes. Are we justified in saying that \"Anna Pihl is a short show\"? Yes, no, or maybe?", "doc_id": 958, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29162, 41285, 37556, 551], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In poker the term Triple Crown is used for winning a poker title on the three major poker tours: The World Series of Poker (WSOP), World Poker Tour (WPT) and up to 2016 the European Poker Tour (EPT). Since the EPT has been discontinued and rebranded as the PokerStars Championship, those wins are considered part of the crown. Are we justified in saying that \"One has to win all the tree major poker tours to earn the Triple Crown title.\"? Yes, no, or maybe? Maybe\n###\nThe 1957 Wednesbury by-election was held on 28 February 1957 after the incumbent Labour MP, Stanley Evans, resigned from the House of Commons and the Labour Party after he had refused to vote against the Conservative government on the Suez Crisis. The Labour candidate, John Stonehouse, retained the seat with an increased majority. Are we justified in saying that \"Political problems in Egypt were contributing factors in Stanley Evans' resignation\"? Yes, no, or maybe? Maybe\n###\nPrivate First Class Jose F. 
Valdez (January 3, 1925 - February 17, 1945) was a United States Army soldier who posthumously received the Medal of Honor \u2014 the United States' highest military decoration \u2014 for his actions near Rosenkranz, France, in the Battle of the Colmar Pocket during World War II. Are we justified in saying that \"Private First Class Jose F. Valdez was born over 28 years ago\"? Yes, no, or maybe? Yes\n###\n\"The Orange and the Green\" or \"The Biggest Mix-Up\" is a humorous Irish folk song about a man whose father was a Protestant (\"Orange\") and whose mother was a Catholic (\"Green\"). It describes the man's trials as the product of religious intermarriage and how \"mixed up\" he became as a result of such an upbringing. Are we justified in saying that \"The Biggest Mix-Up is a serious folk song.\"? Yes, no, or maybe? No\n###\nPeter John \"P. J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\". Are we justified in saying that \"Peter John Carlesimo coached basketball for a half century\"? Yes, no, or maybe?", "doc_id": 794, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35735, 34430, 19085, 42345], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Forest Hill Vineyard (also referred to as Forest Hill Wines) is an Australian winery business based in the Great Southern wine region of Western Australia. Its vineyard is west of Mount Barker, and its winery and cellar door are further south, at Denmark. Are we justified in saying that \"Forest Hill Vineyard is the best winery business\"? Yes, no, or maybe? Maybe\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position. Are we justified in saying that \"The Consolidated Tape Association has no association with real-time trade.\"? Yes, no, or maybe? No\n###\nSebastian Philip Bierk (born April 3, 1968), known professionally as Sebastian Bach, is a Canadian heavy metal singer who achieved mainstream success as frontman of Skid Row from 1987 to 1996. He continues a solo career, acted on Broadway, and has made appearances in film and television. Are we justified in saying that \"Sebastian Bach is canadian\"? Yes, no, or maybe? Yes\n###\nThe Sierra Leone Civil War (1991\u20132002) began on 23 March 1991 when the Revolutionary United Front (RUF), with support from the special forces of Charles Taylor\u2019s National Patriotic Front of Liberia (NPFL), intervened in Sierra Leone in an attempt to overthrow the Joseph Momoh government. The resulting civil war lasted 11 years, enveloped the country, and left over 50,000 dead. 
Are we justified in saying that \"The Sierra Leone Civil War, from 2002 to 1991 when it all was getting started, was a war that never should have been necessary, and never existed as far as the majority of the world was concerned although it went on for over eleven years and the casualties were numerous.\"? Yes, no, or maybe? Maybe\n###\n\"It's Not Right but It's Okay\" is a song by American singer Whitney Houston, from her fourth studio album, \"My Love Is Your Love\". It was written by LaShawn Daniels, Rodney Jerkins, Fred Jerkins III, Isaac Phillips, Toni Estes, and produced by Darkchild. The song examines a woman confronting her lover about his infidelity. Are we justified in saying that \"The song was the title track to the album.\"? Yes, no, or maybe?", "doc_id": 443, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4582, 32066, 24600, 40187], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Krishan Kant ran for re-election in 2002\"? Yes, no, or maybe? No\n###\nJiaozhou Bay Bridge (or Qingdao Haiwan Bridge) is a 26.7 km long roadway bridge in eastern China's Shandong province, which is part of the 41.58 km Jiaozhou Bay Connection Project. s of December 2012 , Guinness World Records lists the Jiaozhou Bay Bridge as the world's longest bridge over water (aggregate length) at 41.58 km . Are we justified in saying that \"Jiaozhou Bay Bridge is located in a quiet part of China\"? Yes, no, or maybe? Maybe\n###\nChristoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph Ernst Friedrich von Forcade de Biaix sentenced many people to death.\"? Yes, no, or maybe? Maybe\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration. Are we justified in saying that \"High Bridge, New Jersey is near the Raritan River\"? Yes, no, or maybe? Yes\n###\nMany science fiction works have been set in the 21st century (years 2001 to 2100). With humanity now in the 21st century, many of the predictions of these works have so far been proven obsolete. This page lists only \"predictions\" regarding the 21st century, as opposed to contemporary accounts of the actual 21st century, which would be too numerous to list. 
Are we justified in saying that \"many science fiction works have been set in the 20th century.\"? Yes, no, or maybe?", "doc_id": 251, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38154, 6649, 11758, 37287], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. Are we justified in saying that \"Amy Timberlake likes to read\"? Yes, no, or maybe? Maybe\n###\nDrifters is a British sitcom that stars Jessica Knappett, Lydia Rose Bewley and Lauren O'Rourke as three female friends who live in Leeds following their graduation from university. All three actresses had previously appeared together in \"The Inbetweeners Movie\". Four series were broadcast, between 2013 and 2016. Are we justified in saying that \"The Drifters sitcom ended in 2013.\"? Yes, no, or maybe? No\n###\nWilliam Lang Denholm \"Bill\" McCue OBE (1934\u20131999) was a Scottish singer known for his performances in opera, musical theatre and traditional Scottish folk music. In 1982 he was awarded an OBE for his contribution to Scottish music. In 1999 he died aged 65. Are we justified in saying that \"William Lang Denholm \"Bill\" McCue starred in Cheers.\"? Yes, no, or maybe? Maybe\n###\nThe 18th Critics' Choice Awards were presented on January 10, 2013 at the Barker Hangar at the Santa Monica Airport, honoring the finest achievements of 2012 filmmaking. The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2012. Are we justified in saying that \"The ceremony was broadcast on The CW and hosted by Sam Rubin. The nominees were announced on December 11, 2019.\"? Yes, no, or maybe? No\n###\nDhanish Karthik (born 24 July 1989) is an Indian actor. He made his debut as Sanjeev Menon in the Malayalam film \"Ivide\" (2015) directed by Shyamaprasad. He recently finished filming for the Bollywood film Chef (2017 film) with Saif Ali Khan. The film, directed by Raja Krishna Menon, is slated to release in July 2017. This will be Karthik's debut in Bollywood. Are we justified in saying that \"Raja Krishna Menon will direct Karthik's next movie.\"? Yes, no, or maybe?", "doc_id": 194, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39252, 11985, 38913, 24323], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Taina is an American sitcom that aired on Nickelodeon and distributed by Nelvana Limited. It was one of the last live-action comedy shows taped at Nickelodeon Studios but later moved to the Nickelodeon on Sunset in Hollywood, for its second season. The show aired from January 14, 2001 to May 11, 2002. Are we justified in saying that \"Taina had 2 main characters on the show. \"? Yes, no, or maybe? Maybe\n###\nThe Amboy Dukes were an American rock band formed in 1964 in Detroit, Michigan, best known for their one hit single \"Journey to the Center of the Mind\". The band's name comes from the title of a novel by Irving Shulman. In the UK the group's records were released under the name of The American Amboy Dukes because of the existence of a British group with the same name. Are we justified in saying that \"The Amboy Dukes has been to North Korea.\"? Yes, no, or maybe? Maybe\n###\nThe Protectorate of Bohemia and Moravia (German: \"Protektorat B\u00f6hmen und M\u00e4hren\" ; Czech: \"Protektor\u00e1t \u010cechy a Morava\" ) was a protectorate of Nazi Germany established following the German occupation of Czechoslovakia. Earlier in 1938, with the Munich Agreement, Sudetenland territory of Czech Lands was incorporated into Nazi Germany as a Reichsgau. Are we justified in saying that \"The Protectorate of Bohemia and Moravia was a protectorate of Nazi Germany established following the German occupation of Czechoslovakia. Earlier in 1938, with the Munich Agreement, Sudetenland territory of Czech Lands was incorporated into Nazi Germany as a Reichsgau. People were unhappy.\"? Yes, no, or maybe? Maybe\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif. Are we justified in saying that \"There is a Man in Our House is a bad film.\"? Yes, no, or maybe? Maybe\n###\nHuevos a la mexicana is a popular breakfast dish in Mexican cuisine. Finely chopped tomato, green chili pepper and onion is lightly fried in a hot skillet. Eggs are added and stirred until set. The heat is turned off and the coriander leaves are mixed in the eggs, adding salt. Refried beans is a common accompaniment. Are we justified in saying that \"Huevos a la mexicana is made outside of Mexico.\"? Yes, no, or maybe?", "doc_id": 822, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14553, 20922, 7589, 32976], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Santa Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. 
The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"Santa Lucia is a place of education at present.\"? Yes, no, or maybe? Yes\n###\nLinyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"LYU is located in China. \"? Yes, no, or maybe? Yes\n###\nGordon Hendrick (February 16, 1949) is a former Republican member of the Montana Legislature. He was elected to House District 14 which represents the Superior area. Due to Montana's term limits, he was ineligible to run for re-election in 2012. He was succeeded by Republican candidate Nicholas Schwaderer for the 2013 legislature cycle. Are we justified in saying that \"Hendrick is considering a run for president.\"? Yes, no, or maybe? Maybe\n###\nThe 1992 Ohio State Buckeyes football team represented the Ohio State University in the 1992 NCAA Division I-A football season. The Buckeyes compiled an 8\u20133\u20131 record, including the 1993 Florida Citrus Bowl in Orlando, Florida, where they lost, 21\u201314, to the Georgia Bulldogs. Are we justified in saying that \"The 1992 Ohio State Buckeyes football team represented the Ohio State University in the 1992 NCAA Division I-B football season\"? Yes, no, or maybe? No\n###\nHow to Steal a Million is a 1966 heist comedy film, directed by William Wyler and starring Audrey Hepburn, Peter O'Toole, Eli Wallach and Hugh Griffith. The picture is set and was filmed in France, though the characters speak entirely in English. Audrey Hepburn's clothes were designed by Givenchy. Are we justified in saying that \"How to Steal a Million is funny.\"? Yes, no, or maybe?", "doc_id": 476, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24831, 10257, 4385, 28009], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Humans Need Not Apply is a 2014 short Internet documentary film, directed, produced, written, and edited by CGP Grey. The film focuses on the future of the integration of automation into economics, as well as the impact of this integration to the worldwide workforce. It was released online as a YouTube video. Are we justified in saying that \"Humans Need Not Apply involves automation\"? Yes, no, or maybe? Yes\n###\nLoui Jover (born April 1967) is an Australian painter and artist. He is known for his artwork in ink wash paintings on vintage book pages. Jover started his work on art in his childhood, but did not start public art until 1989, when he joined the Australian army as an illustrator and photographer. Are we justified in saying that \"Loui Jover knows what light aperture is\"? Yes, no, or maybe? Yes\n###\nVictor H. Halligan (November 22, 1892 \u2013 March 10, 1973) was an American football player. 
He played for the University of Nebraska from 1912 to 1914 and was the first All-American football player to be selected from the Nebraska Cornhuskers football team. Are we justified in saying that \"Victor played three season of college football.\"? Yes, no, or maybe? Yes\n###\nThe Australian Football League celebrates the best goal of the season through the annual Goal of the Year competition. In 2011, this is officially known as the Panasonic AFL Goal of the Year. Each round three goals are nominated and fans are able to vote online for their favourite here . Are we justified in saying that \"the Panasonic AFL Goal of the Year is about celebratin the best goal of the season\"? Yes, no, or maybe? Yes\n###\n2 Cool 2 Be 4gotten is a 2016 Filipino coming-of-age drama film directed by Petersen Vargas in his feature-length directorial debut and written by Jason Paul Laxamana. The film stars Khalil Ramos, Ethan Salvador and Jameson Blake. It depicts the mysterious coming-of-age tale of Felix after he met half-American Snyder brothers, Magnus and Maxim. Are we justified in saying that \"2 Cool 2 Be 4gotten was originally meant to be released in 2019.\"? Yes, no, or maybe?", "doc_id": 429, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4587, 29033, 16311, 45117], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Al Overton Jr. is an American sound engineer. He has been nominated for four Academy Awards in the category Best Sound. He has worked on over 40 films between 1969 and 1991. His father, Al Overton, was also nominated for an Academy Award for Best Sound. Are we justified in saying that \"Al Overton Jr. was nominated for an Academy Award between 1969 and 1991.\"? Yes, no, or maybe? Yes\n###\nThis article is a list of seasons completed by the Utah Jazz of the National Basketball Association (NBA). The Jazz joined the NBA as the New Orleans Jazz, an expansion team that began play in the 1974\u201375 season. The Jazz relocated from New Orleans to Salt Lake City, Utah for the 1979\u201380 season. As of April 15, 2015, the Utah Jazz are the last franchise to not record a 60-loss season. Are we justified in saying that \"The team did not work hard enough.\"? Yes, no, or maybe? Maybe\n###\nOur Lady of Confidence, also known as La Madonna della Fiducia or Our Lady of Trust, is a venerated image depicting the Blessed Virgin Mary enshrined at the Lateran Basilica. The feast of Our Lady of Confidence falls on the last Saturday prior to Lent. Are we justified in saying that \"Our Lady of Confidence is enshrined at the Lateran Basilica \"? Yes, no, or maybe? Yes\n###\nTake Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\". 
Are we justified in saying that \"Take Two features the only duet Marvin Gaye has ever performed\"? Yes, no, or maybe? No\n###\nThe S-99 (Russian: \u0421-99 ) experimental submarine was the only ship of the Soviet Project 617 submarine class (NATO reporting name: Whale class) that the Soviet Union built during the early Cold War and the only Soviet submarine which had a Walter engine fuelled by high test peroxide (HTP). Are we justified in saying that \"The Soviet Union used a Walter engine fuelled by high test peroxide (HTP) only once - for the experimental submarine S-99 (Russian: \u0421-99).\"? Yes, no, or maybe?", "doc_id": 814, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [276, 17728, 4914, 11657], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robin Hobb is the pen name of Margaret Astrid Lindholm Ogden (born March 5, 1952), an American writer. She is best known for the books set in the Realm of the Elderlings, which started in 1995 with the publication of \"Assassin's Apprentice\", the first book in the Farseer trilogy. Are we justified in saying that \"Margaret was a french novelist.\"? Yes, no, or maybe? No\n###\nThe Lord of the Rings: The Fellowship of the Ring is a 2001 New Zealand-American epic high fantasy adventure film directed by Peter Jackson based on the first volume of J. R. R. Tolkien's \"The Lord of the Rings\" (1954\u20131955). It is the first installment in \"The Lord of the Rings series\", and was followed by \"\" (2002) and \"\" (2003), based on the second and third volumes of \"The Lord of the Rings\". Are we justified in saying that \"Each film in the trilogy was based on the corresponding volume of J. R. R. Tolkien's \"The Lord of the Rings.\"\"? Yes, no, or maybe? Yes\n###\nKathryn Jane Calder (born June 17, 1982) is a Canadian indie rock musician, who performs as a solo artist, and is a member of the band The New Pornographers. She is a former member of Immaculate Machine. Calder started with The New Pornographers by filling in for Neko Case for live performances and was made a permanent member in 2006. Are we justified in saying that \"Neko's absence left a void in the band that fans feel Kathryn has not adequately filled.\"? Yes, no, or maybe? Maybe\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o emigrated to Brazil when he was younger than 22.\"? Yes, no, or maybe? Yes\n###\nDeath Race is an arcade game released by Exidy in the United States on April 1, 1976. The game is likely inspired by the 1975 cult film \"Death Race 2000\", which was still in some theatres at the time of the game's release; its original working title which appeared on some flyers to distributors was Death Race 98. 
Are we justified in saying that \"Death Race was set in 1998\"? Yes, no, or maybe?", "doc_id": 53, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20461, 34218, 4709, 28794], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Charles Rashad Jamaal Brown (born April 10, 1987) is a former American football offensive tackle in the National Football League (NFL) for the New Orleans Saints, New York Giants and Dallas Cowboys. He played college football at USC, where he won the Morris Trophy, recognizing the best offensive and defensive linemen on the West Coast in 2009. Are we justified in saying that \"The Morris Trophy is made of gold.\"? Yes, no, or maybe? Maybe\n###\nSmithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"The last song on Smithereens doesn't contain electric instruments.\"? Yes, no, or maybe? Yes\n###\nThe History Boys is a 2006 British comedy-drama film adapted by Alan Bennett from his play of the same name, which won the 2005 Olivier Award for Best New Play and the 2006 Tony Award for Best Play. It was directed by Nicholas Hytner, who directed the original production at the Royal National Theatre in London, and features the original cast of the play. Are we justified in saying that \"The history boys is a 2002 film\"? Yes, no, or maybe? No\n###\nThe Reid Report is an hour-long weekday U.S. and world political commentary program on MSNBC. Hosted by Joy-Ann Reid, it premiered on February 24, 2014, in the time slot formerly occupied by \"NewsNation with Tamron Hall\". The show ended on February 27, 2015 due to low ratings. Are we justified in saying that \"When did the Reid Report get cancelled?\"? Yes, no, or maybe? Yes\n###\n\"Chasing Colors\" is a song recorded by electronic DJs Marshmello and Ookay featuring the vocals of American singer Noah Cyrus. It was written by Marshmello, Ookay, Skyler Stonestreet and Chase Duddy and released on 24 February 2017 via Marshmello's label Joytime Collective. Are we justified in saying that \"Chasing Colors was released near the tail of 2017.\"? Yes, no, or maybe?", "doc_id": 421, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17259, 43302, 36504, 4536], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Remember the Daze is a 2007 drama film released in theaters in April 2008. The film was directed by Jess Manafort. 
The plot of the movie has been described as \"a glimpse into the teenage wasteland of suburbia 1999 that takes place over 24-hours, and the teenagers who make their way through the last day of high school in the last year of the past millennium.\" Are we justified in saying that \"Remember the Daze was released in 2002. \"? Yes, no, or maybe? No\n###\nBailey Gatzert (December 29, 1829 \u2013 April 19, 1893) was an American politician and the eighth mayor of Seattle, Washington, serving from 1875 to 1876. He was the first Jewish mayor of Seattle, narrowly missing being the first Jewish mayor of a major American city (Moses Bloom became mayor of Iowa City, Iowa, in 1873), and has been the only Jewish mayor of Seattle to date. Are we justified in saying that \"Moses Bloom was a Jew. \"? Yes, no, or maybe? Yes\n###\nX X X X (pronounced four-ex) is a brand of Australian beer brewed in Milton, Brisbane by Queensland brewers, Castlemaine Perkins (now a division of the Japanese-owned company Lion). It enjoys wide popularity in the state of Queensland, where it is commonly found on-tap in pubs and bars. Are we justified in saying that \"XXXX is not found on tap everywhere in Queensland\"? Yes, no, or maybe? Maybe\n###\nRoger Heman Sr. (February 27, 1898 \u2013 March 14, 1969) was an American sound engineer. He won an Academy Award for Best Special Effects and was nominated for four more in the same category. He worked on more than 350 films during his career. His son was also a sound engineer. Are we justified in saying that \"Roger Heman Sr. only won one Academy Award in his lifetime. \"? Yes, no, or maybe? Maybe\n###\nThe Sisters of Mercy are an English gothic rock band, formed in 1980 in Leeds, United Kingdom (UK). After achieving early underground fame there, the band had their commercial breakthrough in mid-1980s and sustained it until the early 1990s, when they stopped releasing new recorded output in protest against their record company Time Warner. Currently, the band is a touring outfit only. Are we justified in saying that \"The Sisters of Mercy were formed in nineteen hundred eighty two.\"? Yes, no, or maybe?", "doc_id": 636, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5696, 14702, 42078, 29351], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tasmanian Devils is a 2013 television film directed by Zach Lipovsky and starring Danica McKellar and Apolo Ohno. The movie was first released onto the Syfy channel on January 19, 2013 and centers around a group of friends that get attacked by extremely large tasmanian devils. \"Radio Times\" rated the film poorly, giving it two out of 5 stars. Are we justified in saying that \"Danica Lipovsky and Zach Ohno had roles in the movie.\"? Yes, no, or maybe? No\n###\nThe Chatot (also Chacato or Chactoo) were a Native American tribe who lived in the upper Apalachicola River and Chipola River basins in what is now Florida. They spoke a Muskogean language, which may have been the same as that of the Pensacola people. Are we justified in saying that \"The Chatot spoke mainly English\"? 
Yes, no, or maybe? No\n###\nMargaret Lucille Jeanne Parker (born 24 July 1943) is a Member of the European Parliament (MEP) for the East Midlands region for the UK Independence Party. She was elected in 2014. She was born in Grantham and educated at Kesteven and Grantham Girls' School and De Montfort University where she read Law. Are we justified in saying that \"M.L. Jeanne Parker was born in the UK\"? Yes, no, or maybe? Yes\n###\nMarry Him If You Dare (; lit. Mirae's Choice or Future's Choice) is a 2013 South Korean television series starring Yoon Eun-hye, Lee Dong-gun, Jung Yong-hwa, Han Chae-ah, and Choi Myung-gil. It aired on KBS2 from October 14 to December 3, 2013 on Mondays and Tuesdays at 22:00 for 16 episodes. Are we justified in saying that \"Marry Him If You Dare had acting.\"? Yes, no, or maybe? Yes\n###\nAstana ( , ; Kazakh: Astana ] ) is the capital city of Kazakhstan. It is located on the banks of Ishim River in the north portion of Kazakhstan, within the Akmola Region, though administrated separately from the region as a city with special status. The 2017 census reported a population of 1,006,574 within the city limits, making it the second-largest city in Kazakhstan, behind Almaty. Are we justified in saying that \"There were two other cities that had a higher population compared to Astana in 2017.\"? Yes, no, or maybe?", "doc_id": 649, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14020, 9259, 37018, 35757], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Antonio Lewis, better known by his stage name Zombie Juice, is an American rapper from Brooklyn, New York. He is one third of the hip hop trio Flatbush ZOMBiES. Along with his rapping career, Lewis also directed a music video for the Flatbush Zombies song \"Thugnificense\". Are we justified in saying that \"There are not three other members of Flatbush Zombies.\"? Yes, no, or maybe? Yes\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"The film was released 4 years after 1940.\"? Yes, no, or maybe? Yes\n###\nMadava Farms is an 800-acre certified organic maple syrup enterprise located primarily in Dover, New York. The farm is the maker of Crown Maple Syrup, and it is considered to be the largest maple syrup production facility in North America. Are we justified in saying that \"800-acres is a large farm.\"? Yes, no, or maybe? Maybe\n###\n\"Merry Christmas, Charlie Manson!\" is the sixteenth episode in the second season of the American animated television series \"South Park\". The 29th episode of the series overall, it originally aired on Comedy Central in the United States on December 9, 1998. The episode was written by series co-creator Trey Parker, along with Nancy M. Pimental, and directed by Eric Stough. Are we justified in saying that \"Merry Christmas, Charlie Manson! 
aired over 10 years ago\"? Yes, no, or maybe? Yes\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\". Are we justified in saying that \"Hooked on a Feeling's cover was originally performed by B. J. Thomas.\"? Yes, no, or maybe?", "doc_id": 171, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32933, 12167, 38770, 8880], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gulf Air (Arabic: \u0637\u064a\u0631\u0627\u0646 \u0627\u0644\u062e\u0644\u064a\u062c\u200e \u200e \"\u1e6cayar\u0101n al-Khal\u012bj\") is the principal flag carrier of Bahrain. Headquartered in Muharraq, adjacent to Bahrain International Airport, the airline operates scheduled services to 41 destinations in 23 countries across Africa, Asia and Europe. Its main base is Bahrain International Airport. Are we justified in saying that \"Gulf Air flies to Paris. \"? Yes, no, or maybe? Maybe\n###\nThis is a list of hotels in the Caribbean. The Caribbean is a region that consists of the Caribbean Sea, its islands (some surrounded by the Caribbean Sea and some bordering both the Caribbean Sea and the North Atlantic Ocean), and the surrounding coasts. The region is southeast of the Gulf of Mexico and the North American mainland, east of Central America, and north of South America. Are we justified in saying that \"The Caribbean region is located South of Mexico.\"? Yes, no, or maybe? No\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums). Are we justified in saying that \"2014 was the best year for Emperor. \"? Yes, no, or maybe? Maybe\n###\nThe Home Depot, Inc. or Home Depot is an American home improvement supplies retailing company that sells tools, construction products, and services. The company is headquartered at the Atlanta Store Support Center in unincorporated Cobb County, Georgia (with an Atlanta mailing address). Are we justified in saying that \"Home Depot sells American-made tools.\"? Yes, no, or maybe? Maybe\n###\n\"Flatline\" is the ninth episode of the eighth series of the British science fiction television programme \"Doctor Who\", written by Jamie Mathieson, and directed by Douglas Mackinnon. The episode stars Peter Capaldi and Jenna Coleman, with Joivan Wade and Christopher Fairbank guest starring. The episode received critical acclaim, with particular praise directed at Coleman's performance. Are we justified in saying that \"Doctor Who had more than one episode called \"Flatline\"\"? 
Yes, no, or maybe?", "doc_id": 547, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36997, 27218, 39697, 9489], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The Georgia Tech Yellow Jackets are a very badteam\"? Yes, no, or maybe? Maybe\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\". Are we justified in saying that \"Kidsty Pike has in recent years flowed away from the English Lake DIstrict\"? Yes, no, or maybe? No\n###\nBugger or \"buggar\" is a slang word. In the United Kingdom, the term is a general-purpose expletive, used to imply dissatisfaction, or to refer to someone or something whose behaviour is in some way displeasing or perhaps surprising. In the US, particularly in the Midwest and South, it is a slang but not offensive noun meaning \"small critter.\" Are we justified in saying that \"Bugger is offensive to short people.\"? Yes, no, or maybe? Maybe\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley. Are we justified in saying that \"Taylor was born around 10 o' clock in the morning\"? Yes, no, or maybe? Maybe\n###\nDennis Princewell Stehr (born 15 May 1984), better known by his stage name Mr Probz, is a Dutch singer, musician and actor. In 2013, he released the song \"Waves\", which was remixed in 2014 by Robin Schulz, becoming an international hit. He has released one album and featured in the film Bolletjes Blues. Are we justified in saying that \"Stehr wrote Waves in 1988.\"? Yes, no, or maybe?", "doc_id": 351, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32160, 26452, 38910, 23148], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The UK Parliament constituency of County Galway was an historic Irish constituency, comprised the whole of County Galway, except for the Borough of Galway. It replaced the pre-Act of Union Parliament of Ireland constituency. Its representatives sat in the British House of Commons. Are we justified in saying that \"County Galway is historically important.\"? Yes, no, or maybe? Maybe\n###\nAhmad Kemal Idris (born Singaraja, Bali, February 10, 1923 \u2013 died Jakarta, July 28, 2010) was a prominent Indonesian Army general during the 1950s and 1960s. He was an Indonesian guerrilla leader during the Indonesian National Revolution, who in 1949 was involved in continued resistance to the Dutch forces after they occupied Yogyakarta. Are we justified in saying that \"Ahmad Kemal Idris is still alive\"? Yes, no, or maybe? No\n###\nFrank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ. Are we justified in saying that \"Frank Viola is a Chinese chef.\"? Yes, no, or maybe? No\n###\nThe interdimensional hypothesis (IDH or IH), is an idea advanced by Ufologists such as Jacques Vall\u00e9e that says unidentified flying objects (UFOs) and related events involve visitations from other \"realities\" or \"dimensions\" that coexist separately alongside our own. It is an alternative to the extraterrestrial hypothesis (ETH). Are we justified in saying that \"a lot of people think the IDH is bogus\"? Yes, no, or maybe? Maybe\n###\nJohn Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s. Are we justified in saying that \"John Henry Newman died in eighteen hundred fifty six.\"? Yes, no, or maybe?", "doc_id": 727, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41533, 44486, 4434, 1714], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Petasites is a genus of flowering plants in the sunflower family, Asteraceae, that are commonly referred to as butterburs and coltsfoots. They are perennial plants with thick, creeping underground rhizomes and large rhubarb-like leaves during the growing season. Are we justified in saying that \"Petasites flowered with the sun\"? Yes, no, or maybe? Maybe\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. 
The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts. Are we justified in saying that \"\"Breakfast in Birmingham\" was a Top 50 hit.\"? Yes, no, or maybe? Maybe\n###\nThe New Pornographers is a Canadian indie rock band formed in 1997 in Vancouver, British Columbia. Presented as a musical collective of singer-songwriters and musicians from multiple projects, the band has released seven studio albums to critical acclaim for their use of multiple vocalists and elements of power pop incorporated into their music. Are we justified in saying that \"The New Pornographers is a Canadian indie rock band formed in 1997 in the basement of a bar \"? Yes, no, or maybe? Maybe\n###\n\"Paul Revere\" is a song by American hip hop group Beastie Boys, released as the third single from their debut album \"Licensed to Ill\" (1986). It was written by Adam Horovitz, Joseph Simmons, Darryl McDaniels, and Rick Rubin. It was produced by Rick Rubin and the Beastie Boys. The song tells a fictional story of how the Beastie Boys met. Are we justified in saying that \"The song, Paul Revere, was released by an American Hip Hop Group called Boys Beastie. \"? Yes, no, or maybe? No\n###\nI Love Hong Kong is a 2011 Hong Kong comedy film produced and directed by Eric Tsang. Film stars Tsang, Tony Leung Ka-fai, Sandra Ng and a star-studded cast of Hong Kong stars. It was released in Chinese New Year Day. The sequel movies are I Love Hong Kong 2012 and I Love Hong Kong 2013. Are we justified in saying that \"I Love Hong Kong is a 2013 is the second sequel to I love Hong Kong 2011.\"? Yes, no, or maybe?", "doc_id": 452, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13522, 32163, 16179, 35941], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Arthur John Duckworth (born 19 January 1949) is a former Australian rules footballer who played for Fitzroy in the Victorian Football League (VFL), West Perth in the West Australian National Football League (WANFL), and Central District in the South Australian National Football League (SANFL). He is the older brother of former Essendon footballer Billy Duckworth. Are we justified in saying that \"Arthur John Duckworth was born more than 29 years ago.\"? Yes, no, or maybe? Yes\n###\nKim Da-som (born May 6, 1993), better known mononymously as Dasom, is a South Korean singer and actress. She is best known as a former member of South Korean girl group Sistar under Starship Entertainment. She has acted in films and television dramas, including \"Family\" (2012\u20132013), \"Melody of Love\" (2013\u20132014) and \"The Virtual Bride\" (2015). Are we justified in saying that \"Kim Da-Som is a Korean actress and singer who's fame started with her membership in a So. 
Korean girl group, and then she moved on to acting in films and television dramas, as well as writing a memoir before her passing during the making of The Virtual Bride in 2015\"? Yes, no, or maybe? Maybe\n###\nArthur C. Clarke's World of Strange Powers is a popular thirteen-part British television series looking at strange worlds of the paranormal. It was produced by Yorkshire Television for the ITV network and first broadcast in 1985. It was the sequel to the 1980 series \"Arthur C. Clarke's Mysterious World\". Are we justified in saying that \"World of Strange Powers delves into paranormal topics. \"? Yes, no, or maybe? Yes\n###\nAdriano Correia Claro (born 26 October 1984), known simply as Adriano, is a Brazilian professional footballer who plays for Turkish club Be\u015fikta\u015f JK. One of few players in professional football who are genuinely ambidextrous, he is capable of playing as a defender or midfielder, on both sides of the pitch. Are we justified in saying that \"Adriano Correia Claro has a record number of goals for his team\"? Yes, no, or maybe? Maybe\n###\nBonnie Doon stop is a tram stop under construction in the Edmonton Light Rail Transit network in Edmonton, Alberta, Canada. It will serve the Valley Line, and is located on the west side of 83 Street, south of 84 Avenue, between Bonnie Doon and Idylwylde. The stop is scheduled to open in 2020. Are we justified in saying that \"Bonnie Doon will open right after the new year\"? Yes, no, or maybe?", "doc_id": 204, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20128, 23305, 23002, 17738], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Kinsey Millhone is a fictional character who was created by Sue Grafton for her \"alphabet mysteries\" series of novels. Millhone appears in a number of short stories written by Grafton. Grafton's mystery novels featuring Millhone are set in 1980s Santa Teresa, a fictionalized town based on Santa Barbara, California. Are we justified in saying that \"Grafton has written a series of novels.\"? Yes, no, or maybe? Yes\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents. Are we justified in saying that \"The population is less now than 2019\"? Yes, no, or maybe? Maybe\n###\n...In Black and White is the 12th studio album by American country artist Barbara Mandrell. The album was released in April 1982 on MCA Records and was produced by Tom Collins. It was Barbara Mandrell's first studio album in two years since the release of \"Love Is Fair\". Are we justified in saying that \"In Black and White has been covered by Ice T.\"? Yes, no, or maybe? Maybe\n###\n[id] is the third studio album by deathcore band Veil of Maya. It was released through Sumerian Records on April 6, 2010. 
They worked with producer Michael Keene of death metal band The Faceless on this album. Keene previously worked with the band, producing their previous album \"The Common Man's Collapse\". It is the band's only album to feature bassist Matthew C. Pantelis. Are we justified in saying that \"Veil of Maya didn't always work with Pantelis.\"? Yes, no, or maybe? Yes\n###\nWings over America is a triple live album by Wings, released in December 1976. The album was recorded during American leg of the band's acclaimed 1975\u201376 Wings Over the World tour. It peaked at number 8 on the UK Albums Chart and reached number 1 on the US \"Billboard\" Top LPs & Tape chart. Are we justified in saying that \"Critical acclaim doesn't mean anything.\"? Yes, no, or maybe?", "doc_id": 184, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [45220, 14719, 23860, 11903], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Roberto Cammarelle (born 30 July 1980) is an Italian boxer, best known for winning the World Amateur Boxing Championships in 2007 (Chicago) and 2009 (Milan) as a super heavyweight and a gold medal at the 2008 Olympic Games in Beijing. He won a silver medal in 2012 London Olympics Games, losing to Anthony Joshua, by a contested jury's decision that was unsuccessfully appealed. Are we justified in saying that \"Roberto Cammarelle won the World Amateur Boxing Championships 3 years in a row\"? Yes, no, or maybe? No\n###\nScience in History is a four-volume book by scientist and historian John Desmond Bernal, published in 1954. It was the first comprehensive attempt to analyse the reciprocal relations of science and society throughout history. It was originally published in London by Watts. There were three editions up to 1969 an. It was republished by MIT Press in 1971 and is still in print. Are we justified in saying that \"Science in History was published less than 660 months ago.\"? Yes, no, or maybe? No\n###\nGrindhouse Releasing is a Hollywood-based independent cult film distribution company led by film editor Bob Murawski and co-founded by Sage Stallone. Grindhouse digitally remasters, restores, and produces bonus materials and video documentaries for cult film DVDs and Blu-rays which it distributes on the CAV label. Are we justified in saying that \"Grindhouse Releasing released a movie last year\"? Yes, no, or maybe? Maybe\n###\nSanta Lucia is a former ancient Roman Catholic church in central Bologna, located on Via Castiglione 36. The incomplete, nearly semi-ruinous brick facade screens a stunning Baroque interior architecture, now used as a large lecture hall by the University of Bologna. Are we justified in saying that \"The University of Bologna owns the former ancient Roman Catholic church Santa Lucia.\"? Yes, no, or maybe? Maybe\n###\nOliver Bierhoff (] ; born 1 May 1968) is a retired German former footballer who scored the first golden goal in the history of major international football, for Germany in the Euro 96 final, a career-defining performance that vaulted him into the international limelight. 
Are we justified in saying that \"Oliver Bierhoff's career blew up after his golden goal in the Euro 96 final.\"? Yes, no, or maybe?", "doc_id": 903, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35291, 9298, 14751, 754], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Andrea von Habsburg (\"Andrea Maria von Habsburg-Lothringen\") Archduchess of Austria, Hereditary Countess of Neipperg, (born 30 May 1953, in W\u00fcrzburg, Bavaria), is the first child and oldest daughter of Otto von Habsburg and his wife Princess Regina of Saxe-Meiningen. Are we justified in saying that \"Andrea von Habsburg isn't the only daughter of her parents.\"? Yes, no, or maybe? Yes\n###\nMichael George Stroka (May 9, 1938 in Passaic, New Jersey \u2013 April 14, 1997) was an American actor on soap operas like ABC-TV's \"Dark Shadows\", in which he played Aristede, Bruno Hess, and Laszlo Ferrari from 1969 to 1970. In addition, he made a cameo appearance as a pallbearer in the MGM film, \"House of Dark Shadows\", the first of two feature films based on the ABC soap opera. Are we justified in saying that \"Michael George Stroka died in Passaic, New Jersey.\"? Yes, no, or maybe? Maybe\n###\nJake Deckard (born December 30, 1972) is an American pornographic actor and director who appears in gay pornographic films and magazines. In 2008, he started his own production company, \"Screaming Eagle XXX\". Deckard won both Best Actor and Performer of the Year at the 2008 GayVN Awards. Are we justified in saying that \"Jake Deckard started his own production company in 2008, but he had been wanting to do this since the 90's.\"? Yes, no, or maybe? Maybe\n###\nThe Key is a 1958 British war film set in 1941 during the Battle of the Atlantic. It was based on the 1951 novel \"Stella\" by Jan de Hartog (later republished as \"The Distant Shore\" and \"The Key\") and was directed by Sir Carol Reed. William Holden, Sophia Loren and Trevor Howard starred in the production. Are we justified in saying that \"Reed went on to direct many movies in his career.\"? Yes, no, or maybe? Maybe\n###\nWilliam Lewis Moody Jr. (January 25, 1865 \u2013 July 21, 1954) was an American financier and entrepreneur from Galveston, Texas, who founded a private bank, an insurance company, and one of the largest charitable foundations in the United States. Moody was active in the day-to-day operations of his companies until two days before his death. Are we justified in saying that \"William Lewis Moody Jr. was born in the year directly after 1864.\"? Yes, no, or maybe?", "doc_id": 998, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6249, 40914, 28068, 15905], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Darrell Lance Abbott, also known as \"Diamond\" Darrell or \"Dimebag\" Darrell (August 20, 1966 - December 8, 2004), was an American guitarist. Best known as a founding member of the heavy metal bands Pantera and Damageplan. His recordings and film appearances include: Are we justified in saying that \"Darrell Abbot wasn't the only founding member of Damageplan.\"? Yes, no, or maybe? Maybe\n###\nFrederick Ferdinand of Anhalt-K\u00f6then (25 June 1769, Pless \u2013 23 August 1830, K\u00f6then) was a German prince, Ascanian ruler of the principality of Anhalt-Pless and, from 1818, of the duchy of Anhalt-K\u00f6then. He was the second son of Frederick Erdmann, Prince of Anhalt-Pless, and his wife, Louise Ferdinande, daughter of Henry Ernest, Count of Stolberg-Wernigerode. Are we justified in saying that \"Louise Ferdinand died in Pless.\"? Yes, no, or maybe? Maybe\n###\nNewlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005. Are we justified in saying that \"Nick Lachey and Jessica Simpson were both famous for being musicians.\"? Yes, no, or maybe? Maybe\n###\nThe Newcomers is a 2000 American family drama film directed by James Allen Bradley and starring Christopher McCoy, Kate Bosworth, Paul Dano and Chris Evans. Christopher McCoy plays Sam Docherty, a boy who moves to Vermont with his family, hoping to make a fresh start away from the city. It was filmed in Vermont, and released by Artist View Entertainment and MTI Home Video. Are we justified in saying that \"The Newcomers was a box office success.\"? Yes, no, or maybe? Maybe\n###\nThe Real Housewives of Atlanta (abbreviated RHOA) is an American reality television series that premiered on October 7, 2008, on Bravo. Developed as the third installment of \"The Real Housewives\" franchise, following \"The Real Housewives of Orange County\" and \"New York City\", it has aired nine seasons and focuses on the personal and professional lives of several women residing in Atlanta, Georgia. Are we justified in saying that \"The Real Housewives of Atlanta is the third episode of \"The Real Housewives.\"\"? Yes, no, or maybe?", "doc_id": 263, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6103, 22922, 29838, 43592], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Tango is a ballet made by New York City Ballet co-founder and founding choreographer George Balanchine to Stravinsky's \"Tango\" (1940) arranged 1953 by the composer. 
The premiere took place June 10, 1982, as part of City Ballet's Stravinsky Centennial Celebration at the New York State Theater, Lincoln Center. Are we justified in saying that \"Tango is a drama made by George Balanchine.\"? Yes, no, or maybe? No\n###\nBlack Dahlia is a 2006 United States production horror film inspired by the mysterious unsolved murder of the\"Black Dahlia\", Hollywood actress Elizabeth Short. Instead of dramatizing the infamous 1947 murder of Short and the ensuing investigation, writer-director Ulli Lommel follows a series of contemporary L.A.-area homicides patterned after the 1947 slaying. Are we justified in saying that \"It scored 10 out of 10 stars\"? Yes, no, or maybe? Maybe\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\". Are we justified in saying that \"Kidsty Pike has been seen by earl.\"? Yes, no, or maybe? Maybe\n###\nGeoffrey Zakarian (born July 25, 1959) is an American Iron Chef, restaurateur, television personality and author. He is the executive chef of several restaurants in New York City, Atlantic City and Miami. He is featured on several television programs on the Food Network, including \"Chopped\" and in 2011, \"The Next Iron Chef\", where he won the right to join \"Iron Chef America\". Are we justified in saying that \"Geoffrey Zakarian beat the other chefs competing on the episode of \"The Next Iron Chef\" he competed on in 2011.\"? Yes, no, or maybe? Yes\n###\nThe Old Time Gospel Hour Quartet was a Southern Gospel Quartet that was formed by Jerry Falwell (not a member) at Thomas Road Baptist Church in 2000. The group performed weekly on The Old Time Gospel Hour TV program that Falwell hosted, in addition to having a small travel schedule. In addition to selling their merchandise at concerts, they also recruited students for Liberty University. Are we justified in saying that \"Falwell sang with The Old Time Gospel Hour Quartet.\"? Yes, no, or maybe?", "doc_id": 691, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13539, 24372, 12076, 14539], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Harriston (population 1,797) is a community in the Town of Minto in Wellington County, Ontario, Canada. In 1999, Harriston was amalgamated with the communities of Palmerston, Clifford, and Minto Township to form the Town of Minto. Harriston is located at the headwaters of the Maitland River, and has several shops, restaurants, a library, an art gallery and cultural centre. Are we justified in saying that \"Harriston is north of the USA.\"? Yes, no, or maybe? Yes\n###\nThe Pear Tree (\"Derakht-e-Golabi\") is a 1998 Iranian drama movie written and directed by Dariush Mehrjui with Homayoun Ershadi and Golshifteh Farahani in the lead. 
The movie was noted for the exemplary craftsmanship of Dariush Mehrjui on his examination of the Iranian bourgeoisie. This movie is also the debut film of the veteran Iranian actress Golshifteh Farahani. Are we justified in saying that \"Dariush Mehrjui is also known for a comedy movie that he directed before the Pear tree\"? Yes, no, or maybe? Maybe\n###\nJames Duncan Scurlock (born September 15, 1971) is an American director, producer, writer and financial adviser. He is probably best known for his critically acclaimed documentary \"Maxed Out: Hard Times, Easy Credit and the Era of Predatory Lenders\" and his award-winning book, \"Maxed Out: Hard Times in the Age of Easy Credit\". His most recent book, \"\", is a biography of Larry Hillblom. Are we justified in saying that \"James Duncan Scurlock died today\"? Yes, no, or maybe? No\n###\nThe Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \"Chris Gardner has a son named Christopher Jr.\"? Yes, no, or maybe? Yes\n###\nJoona Veteli (born 21 April 1995) is a Finnish football player currently playing for Norwegian OBOS-ligaen side Fredrikstad. Veteli plays in the position of centre midfielder but can also operate as an attacking midfielder, defensive midfielder, right-back and winger. Are we justified in saying that \"Joona Veteli has played professionally in the US\"? Yes, no, or maybe?", "doc_id": 756, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12912, 37644, 12882, 39055], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Chris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Jay Crawford worked for ESPN.\"? Yes, no, or maybe? Yes\n###\nSuper Show 6 - Super Junior World Tour Concert Album is Super Junior's sixth live recorded album, released on 6 November 2015. This album contains two CDs with 36 live recordings from the Super Show 6 concerts held on September 19\u201321, 2014 at the Olympic Gymnastics Arena located in Seoul, South Korea. Are we justified in saying that \"the Albums was recorded during the 2014 olympics\"? Yes, no, or maybe? Maybe\n###\nIdentification Marks: None (Polish: Rysopis ) is a 1964 Polish drama film directed by Jerzy Skolimowski. It was the first feature film directed by Skolimowski, after the shorts \"Erotique\", \"Little Hamlet\", \"The Menacing Eye\", \"Boxing\" and \"Your Money or Your Life\". 
Are we justified in saying that \"Identification Marks: None had a sequel to the film, many years later.\"? Yes, no, or maybe? Maybe\n###\nNewlyweds: Nick and Jessica is an American reality television series that aired on MTV. It followed the marriage of then husband and wife Nick Lachey and Jessica Simpson. The show premiered on August 19, 2003, and ran for three seasons, with 41 episodes in total. The last season started on January 26, 2005, and the show concluded on March 30, 2005. Are we justified in saying that \"MTV shows reality programs instead of music videos.\"? Yes, no, or maybe? Maybe\n###\nAl comp\u00e1s de tu mentira (English: To the Compass of Your Lie ) is a 1950 black-and-white Argentine musical film directed by H\u00e9ctor Canziani. The film was adapted from Oscar Wilde's play \"The Importance of being Earnest\" by Abel Santacruz. The film starred Francisco \u00c1lvarez and Pedro Quartucci. Are we justified in saying that \"The film was re-written from English.\"? Yes, no, or maybe?", "doc_id": 178, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42419, 11613, 20242, 20267], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Time of Your Life is an American television drama series starring Jennifer Love Hewitt that aired for one season on Fox. A spin-off of \"Party of Five\", the series followed Sarah Reeves Merrin as she moved to New York City to learn more about her biological parents. Co-stars included Jennifer Garner, Pauley Perrette and Gina Ravera. Are we justified in saying that \"This wasn't a series because it only aired one season.\"? Yes, no, or maybe? No\n###\nPeter John \"P. J.\" Carlesimo (born May 30, 1949) is an American basketball coach, who coached in both the NBA and college basketball for nearly 40 years. He is also a television broadcaster, having worked with \"ESPN, The NBA on TNT, Westwood One, Fox Sports Southwest\" and \"CSN New England\". Are we justified in saying that \"Carlesimo has had a long career in sports\"? Yes, no, or maybe? Yes\n###\nFernande Olivier (born Am\u00e9lie Lang; 6 June 1881\u201326 January 1966) was a French artist and model known primarily for having been the model of painter Pablo Picasso, and for her written accounts of her relationship with him. Picasso painted over 60 portraits of Olivier. Are we justified in saying that \"There are over 100 paintings of Olivier painted by Picasso.\"? Yes, no, or maybe? No\n###\nMean Girls 2 is a 2011 American teen comedy television film directed by Melanie Mayron. It is a stand-alone sequel to the 2004 film \"Mean Girls\". The film premiered on ABC Family on January 23, 2011. The film stars Meaghan Martin, Jennifer Stone, Maiara Walsh, Nicole Gale Anderson, Claire Holt, and Diego Boneta. Tim Meadows reprises his role as Principal Ron Duvall from the original film. Are we justified in saying that \"Mean Girls 2 inspired many novels.\"? Yes, no, or maybe? Maybe\n###\nKathryn Jane Calder (born June 17, 1982) is a Canadian indie rock musician, who performs as a solo artist, and is a member of the band The New Pornographers. 
She is a former member of Immaculate Machine. Calder started with The New Pornographers by filling in for Neko Case for live performances and was made a permanent member in 2006. Are we justified in saying that \"The Immaculate Machine came before The New Pornographers in Calder's career.\"? Yes, no, or maybe?", "doc_id": 746, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44931, 15975, 9166, 40321], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Introduction to Finality\" is the 22nd episode of the third season of the American television series \"Community\" and the third season finale. It originally aired on May 17, 2012 on NBC. This was the last episode to air with series creator Dan Harmon as showrunner before he was fired, though Harmon would later return as showrunner for the 5th season. Are we justified in saying that \"Community only aired 3 seasons\"? Yes, no, or maybe? No\n###\nIreland ( ; Irish: \"\u00c9ire\" ] ; Ulster-Scots: \"Airlann\" ] ) is an island in the North Atlantic. It is separated from Great Britain to its east by the North Channel, the Irish Sea, and St George's Channel. Ireland is the second-largest island of the British Isles, the third-largest in Europe, and the twentieth-largest on Earth. Are we justified in saying that \"There are two islands larger than Ireland in Europe\"? Yes, no, or maybe? Yes\n###\nThree Little Sisters is a 1944 American comedy film directed by Joseph Santley and written by Olive Cooper. The film stars Mary Lee, Ruth Terry, Cheryl Walker, William Terry, Jackie Moran and Charles Arnt. The film was released on July 31, 1944, by Republic Pictures. Are we justified in saying that \"The film has at least 6 people.\"? Yes, no, or maybe? Yes\n###\nTribute is a ballet made by Christopher d'Amboise to music by Johann Sebastian Bach. The premi\u00e8re took place Saturday, June 4, 2005, at the School of American Ballet workshop performance, Juilliard Theater, Lincoln Center for the Performing Arts. The New York City Ballet premi\u00e8re was Sunday, February 4, 2007, at the New York State Theater, also at Lincoln Center. Are we justified in saying that \"Tribute won an oscar\"? Yes, no, or maybe? Maybe\n###\nJack Christopher Truelove (born 27 December 1995) is an English football player who most recently played for National League North side Hednesford Town on loan from Oldham Athletic. He is currently registered to play for National League North side Curzon Ashton. Are we justified in saying that \"Jack Christopher Truelove never played in a national match.\"? Yes, no, or maybe?", "doc_id": 891, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15949, 18653, 31263, 9514], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shitanshu Hargovindbhai Kotak (born 19 October 1972 in Rajkot) was an Indian first-class cricketer. A left-handed batsman, he has been a prolific run scorer for Saurashtra. Now he is the coach of Saurastra Cricket Team & soon will join Gujarat Lions IPL team as Assistant Coach. Are we justified in saying that \"There are 3 vowels in Shitanshu Hargovindbhai Kotak's first name. \"? Yes, no, or maybe? Yes\n###\nThe Prague Skate (sometimes titled Golden Skate; from 1994: Czech Skate) is an international figure skating competition. It was a senior event from the 1960s to 1997, usually held in November or December in Prague. Medals were awarded in the disciplines of men's singles, ladies' singles, and pair skating. Since 1999, it is organized in some years as part of the ISU Junior Grand Prix series. Are we justified in saying that \"They stopped calling Prague Skate the Golden Skate before 1995\"? Yes, no, or maybe? Yes\n###\nThe Cable Guy is a 1996 American comedy film directed by Ben Stiller, starring Jim Carrey and Matthew Broderick. It was released in the United States on June 14, 1996. The film co-stars Leslie Mann, Jack Black, George Segal, Diane Baker, Eric Roberts, Owen Wilson, Janeane Garofalo, David Cross, Andy Dick, Amy Stiller, and Bob Odenkirk. Are we justified in saying that \" In 1996 a comedy called The Cable Guy was released. Many people went to see it but there are mixed ideas as to whether it was a commercial success. Most sources consider it to have been successful. The actors were paid a large amount of money.\"? Yes, no, or maybe? Maybe\n###\nO'Sullivan Army Heliport (ICAO: KCSL,\u00a0FAA LID: CSL) is a U.S. Army heliport at Camp San Luis Obispo in San Luis Obispo County, California, United States. It is located just off California State Route 1, northwest of the city of San Luis Obispo, about halfway between it and Morro Bay. O'Sullivan AHP has one helipad designated H1 with a 2,430 by 75\u00a0ft (741 by 23\u00a0m) asphalt surface. Are we justified in saying that \"The Army maintains the Heliport everyday.\"? Yes, no, or maybe? Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"Israel and the neighboring states of Egypt signed a peace treaty.\"? Yes, no, or maybe?", "doc_id": 440, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27401, 25659, 45242, 37759], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "InterTV Grande Minas is a Brazilian television station affiliated with Rede Globo coverage in the Northern part of the Noroeste, Central and the Jequitinhonha and Mucuri of Minas Gerais. Operates on VHF channel 4, in its headquarters city, Montes Claros / MG. Belongs to the Rede InterTV. Are we justified in saying that \"InterTV Grande Minas can be viewed all over Brazil.\"? Yes, no, or maybe? Maybe\n###\nEmperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. The group split up in 2001, but reunited from 2005 to 2007 for a few festival dates and brief US tours, and again reunited in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums). Are we justified in saying that \"Emperor is a Norwegian black metal band formed in 1991, regarded as highly influential by critics and emerging black metal bands. After splitting up in 2001, they reunited from 2005 to 2007 for a few festival dates and brief US tours, and again in 2013 to 2014. The group was founded by Ihsahn (guitar/vocal) and Samoth (then, drums).\n\"? Yes, no, or maybe? Yes\n###\nH\u00e9ctor Canziani was an Argentine poet, screenwriter and film director who worked in Argentine cinema in the 1940s and 1950s. Although his work was most abundant in screenwriting and poetry after his brief film career, he is best known for his directorship and production of the 1950 tango dancing film Al Comp\u00e1s de tu Mentira based on a play by Oscar Wilde. Are we justified in saying that \"The film Al Comp\u00e1s de tu Mentira was the highest grossing film in Argentina during the 1940s.\"? Yes, no, or maybe? No\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"the A2 motorway is one of the safest roads to drive on in Europe\"? Yes, no, or maybe? Maybe\n###\nVixen! is a 1968 satiric softcore sexploitation film directed by American motion picture director Russ Meyer. It was the first film to be given an X rating for its sex scenes, and was a breakthrough success for Meyer. The film was developed from a script by Meyer and Anthony James Ryan, and starred Erica Gavin. Are we justified in saying that \"Vixen was written by two men. \"? Yes, no, or maybe?", "doc_id": 555, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [19789, 19402, 32212, 12223], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rufus Lackland Taylor (January 6, 1910 \u2013 September 14, 1978) was an officer in the United States Navy. There he became Director of the Office of Naval Intelligence and a Vice Admiral. In 1966 he was appointed as Deputy Director of the Defense Intelligence Agency (DIA), then shortly thereafter as Deputy Director of the CIA, where he served from 1966 to 1969. Are we justified in saying that \"Rufus Lackland Taylor died older than 50\"? Yes, no, or maybe? Yes\n###\nLinda Ellerbee (born August 15, 1944) is an American journalist who is most known for several jobs at NBC News, including Washington, D.C. correspondent, and also as host of the Nickelodeon network's \"Nick News with Linda Ellerbee\". Her work on \"NBC News Overnight\" was recognized by the jurors of the duPont Columbia Awards as \"possibly the best written and most intelligent news program ever.\" Are we justified in saying that \"Linda Ellerbee is an American journalist who is known for her works on \"NBC News Overnight\" as the best written and most intelligent news program ever.\"? Yes, no, or maybe? Maybe\n###\nWinning America is a documentary television film about the Canadian band Said the Whale. It follows the band on their first US tour down through California, and then to South by Southwest. It premiered on CBC Television on July 23, 2011. The film was directed by Brent Hodge and Thomas Buchan, and was produced by Brent Hodge, Jon Siddall and Sheila Peacock. It was nominated for a Leo Award in 2012. Are we justified in saying that \"5 months after premiering on CBC Television, Winning America was released on DVD.\"? Yes, no, or maybe? Maybe\n###\nYulia Victorovna Makhalina (Russian: \u042e\u043b\u0438\u044f \u0412\u0438\u043a\u0442\u043e\u0440\u043e\u0432\u043d\u0430 \u041c\u0430\u0445\u0430\u043b\u0438\u043d\u0430 ), also Yulia, (born 23 June 1968) is a Russian ballet dancer. Since 1986, she has been with the Kirov/Mariinsky Ballet where she is a principal dancer. Along with Ulyana Lopatkina, Makhalina is a member of 'the basketball team', a group of Kirov dancers who are characterized for being especially tall and slender. Are we justified in saying that \"the basketball team is unsuccessful in their efforts\"? Yes, no, or maybe? Maybe\n###\nKiss of the Spider Woman is a musical with music by John Kander and Fred Ebb, with the book by Terrence McNally. It is based on the Manuel Puig novel \"El Beso de la Mujer Ara\u00f1a\". The musical had runs in the West End (1992) and Broadway (1993) and won the 1993 Tony Award for Best Musical. Are we justified in saying that \"Kiss of the Spider Woman won a Tony Award 2 years after it first ran in 1992.\"? Yes, no, or maybe?", "doc_id": 203, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31514, 23842, 34003, 4796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "41 Commando or No. 41 (Royal Marine) Commando was a unit of the Royal Marines trained as Commandos during the Second World War. They were part of the all Royal Marine 4th Special Service Brigade that took part in the Normandy landings in June 1944 and later that served in World War II, the Korean War, and in Northern Ireland. They were disbanded in 1981. Are we justified in saying that \"Number 41 killed many people\"? Yes, no, or maybe? Maybe\n###\nMohamed Izzadeen Mohamed Naufer (born 17 January 1981) is a Sri Lankan footballer, who plays either on the left wing or as a striker for Army SC and the Sri Lanka national football team. On 6 September 2013 Izzadeen scored 4 goals against Bhutan in the 2013 SAFF Championship. He has also played for Sri Lanka in the 2006 AFC Challenge Cup and 2010 AFC Challenge Cups. Are we justified in saying that \"Mohamed Izzadeen Mohamed Naufer has many fanatic fans.\"? Yes, no, or maybe? Maybe\n###\nBrendan Francis Aidan Behan (christened Francis Behan) ( ; Irish: \"Breand\u00e1n \u00d3 Beach\u00e1in\" ; 9 February 1923 \u2013 20 March 1964) was an Irish Republican, poet, short story writer, novelist, and playwright who wrote in both English and Irish. He is widely regarded as one of the greatest Irish writers and poets of all time. Are we justified in saying that \"Brendan Francis Aidan Behan wrote in french\"? Yes, no, or maybe? No\n###\nB\u00ebor the Old is a fictional character in J.R.R. Tolkien's Middle-earth legendarium. He appears in \"The Silmarillion\" as the leader of the First House of the Edain in the First Age, which was called the \"Folk of B\u00ebor\" after him. He was the father of Baran and Belen and ancestor of Beren Camlost. Are we justified in saying that \"B\u00ebor the Old is a real person.\"? Yes, no, or maybe? No\n###\nThe 2015 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the first edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Happy Valley, Australia between 3\u201311 January 2015. Are we justified in saying that \"The 2015 City of Onkaparinga Challenger took place on courts made from concrete.\"? Yes, no, or maybe?", "doc_id": 38, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38413, 14819, 42719, 4676], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Real Madrid Club de F\u00fatbol C, commonly known as Real Madrid C, was a Spanish association football team that played in the Tercera Divisi\u00f3n \u2013 Group 7. It was Real Madrid's second reserve team. They played their home games at La Ciudad del Real Madrid in Valdebebas outside the city of Madrid. At the end of the 2014\u201315 Tercera Division, Real Madrid C was disbanded. 
Are we justified in saying that \"Real Madrid Club de F\u00fatbol C was disbanded due to poor attendance and financial support\"? Yes, no, or maybe? Maybe\n###\nThe Pikes Peak Center for the Performing Arts (known commonly as Pikes Peak Center) is a concert auditorium in Colorado Springs, Colorado. It serves as an entertainment, cultural, educational, and assembly center for the citizens of El Paso County, the Pikes Peak region, and the surrounding area. Are we justified in saying that \"The Pikes Peak Center for the Performing Arts is a concert auditorium in Colorado Springs, Colorado. \"? Yes, no, or maybe? Yes\n###\nRoy Denzil Hibbert (born December 11, 1986) is a Jamaican-American professional basketball player who last played for the Denver Nuggets of the National Basketball Association (NBA). He is a two-time NBA All-Star, and earned NBA All-Defensive Second Team honors in 2014. Are we justified in saying that \"Hibbert no longer plays in the NBA.\"? Yes, no, or maybe? Yes\n###\nLemoyne is an unincorporated community and census-designated place in northern Keith County, Nebraska, United States. It lies along Nebraska Highway 92 on the northern shore of Lake C.W. McConaughy, north of the city of Ogallala, the county seat of Keith County. Its elevation is 3,333\u00a0feet (1,016\u00a0m). Although Lemoyne is unincorporated, it has a post office, with the ZIP code of 69146. Are we justified in saying that \"Lemoyne is near more than one lake.\"? Yes, no, or maybe? Maybe\n###\nThe National Democratic Party (NDP) is a Ghanaian political party, founded in October 2012 as a split from the ruling National Democratic Congress. Its first leader was former NDC politician Nana Konadu Agyeman Rawlings, who is also the wife of former President of Ghana and NDC founder Jerry Rawlings. Are we justified in saying that \"The National Democratic Party has five words in it's name.\"? Yes, no, or maybe?", "doc_id": 566, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18593, 25882, 27052, 27287], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Carlyle Eubank is an American writer and screenwriter. His 2014 film \"The Signal\", starring Laurence Fishburne, Brenton Thwaites, and Olivia Cooke, premiered at the 2014 Sundance Film Festival and was released in US theaters on June 13 by Focus Features. Are we justified in saying that \"Carlyle Eubank is very proud of his film the signal\"? Yes, no, or maybe? Maybe\n###\nJoseph Eppele (born August 12, 1987) is a professional Canadian football offensive lineman for the Ottawa Redblacks of the Canadian Football League. He was drafted second overall by the Toronto Argonauts in the 2010 CFL Draft, being the first offensive lineman taken while being ranked fifth overall by the CFL's Amateur Scouting Bureau. He played college football for the Washington State Cougars. Are we justified in saying that \"Joseph Eppele protects the quarterback well\"? Yes, no, or maybe? Maybe\n###\nFrank Viola is an American author, speaker, and blogger on Christian topics. 
His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ. Are we justified in saying that \"Frank is an African American author and blogger. \"? Yes, no, or maybe? Maybe\n###\n\"Requiem\" is the seventh episode in the fifth season, and the 101st overall episode, of the American crime drama television series \"NCIS\". It first aired on CBS in the United States on November 6, 2007. The episode was written by Shane Brennan and directed by Tony Wharmby. Are we justified in saying that \"The episode was directed by Tony Wharmby and written by Shane Brennan who felt that Requiem was a classic\"? Yes, no, or maybe? Maybe\n###\nThe 2015 Abu Dhabi Grand Prix (formally known as the 2015 Formula 1 Etihad Airways Abu Dhabi Grand Prix) was a Formula One motor race held at the Yas Marina Circuit on 29 November 2015. The race was the nineteenth and final round of the 2015 season, and marked the seventh running of the Abu Dhabi Grand Prix as a round of the World Championship since its inception in . Are we justified in saying that \"The Abu Dhabi Grand Prix was started in 2008.\"? Yes, no, or maybe?", "doc_id": 454, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24253, 26329, 2876, 32015], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christmas Eve is the day before Christmas Day, the festival commemorating the birth of Jesus of Nazareth. Christmas Day is observed around the world, and Christmas Eve is widely observed as a full or partial holiday in anticipation of Christmas Day. Together, both days are considered one of the most culturally significant celebrations in Christendom and Western society. Are we justified in saying that \"The word Christmas contains the word \"Christ\" within it.\"? Yes, no, or maybe? Yes\n###\nHarold E. Ennes was a broadcasting pioneer who authored many textbooks for broadcast and broadcast-related communications training and was a member of the Indianapolis chapter of the Society of Broadcast Engineers. He was a member of SBE's national Certification Committee and made many contributions to the early development of the SBE Certification Program. Are we justified in saying that \"Harold E. Ennes was a broadcaster who was not a female\"? Yes, no, or maybe? Yes\n###\nPisnia zavzhdy z namy (Ukrainian: \u041f\u0456\u0441\u043d\u044f \u0437\u0430\u0432\u0436\u0434\u0438 \u0437 \u043d\u0430\u043c\u0438 ) is a 1975 Soviet Ukrainian musical film, produced by Viktor Storozhenko starring Sofia Rotaru in the main role, as well as Soviet Ukrainian Smerichka vocal-instrumental band. The movie features songs in Ukrainian, Moldovan and Russian of Sofia Rotaru filmed in the background of Ukrainian Carpathian mountains. Are we justified in saying that \"There is at least one song in Russian in the film Pisnia zavzhdy z namy.\"? Yes, no, or maybe? 
Yes\n###\nThe Boulton Paul Balliol and Sea Balliol were monoplane military advanced trainer aircraft built for the Royal Air Force (RAF) and the Royal Navy Fleet Air Arm (FAA) by Boulton Paul Aircraft. Developed in the late 1940s the Balliol was designed to replace the North American Harvard trainer and used the Rolls-Royce Merlin engine, with the Sea Balliol a naval version for deck landing training. Are we justified in saying that \"The Sea Balliol went out to sea over 100 times.\"? Yes, no, or maybe? Maybe\n###\nMercy Yvonne Debrah-Karikari is a career diplomat and the first female to be secretary to the cabinet of the government of Ghana. She was appointed to occupy this position by the current President Nana Akufo-Addo. Her appointment took effect on the 14th of February 2017. Are we justified in saying that \"Mercy Yvonne Debrah-Karikari was the secretary to the cabinet of the government of Ghana in 2019.\"? Yes, no, or maybe?", "doc_id": 805, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23014, 972, 40154, 28212], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Linyi University (LYU, ) is a public university based in Lanshan district of Linyi, Shandong province, China. It offers studies in 62 different undergraduate degrees, organized among nine major disciplines: Economics, Law, Education, Literature, History, Science, Engineering, Agriculture and Management. Are we justified in saying that \"The school is built mostly with wood\"? Yes, no, or maybe? Maybe\n###\nThe End Tour was the farewell tour of the heavy metal band Black Sabbath, featuring founding members Ozzy Osbourne, Tony Iommi and Geezer Butler. The tour concluded Sabbath's 40+ year career. The final show was February 4, 2017, in their home city of Birmingham, UK. Are we justified in saying that \"On Feb. 4th 2017 a rock band of some renown ended a career that has spanned more than three decades but less than five.\"? Yes, no, or maybe? Yes\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\". Are we justified in saying that \"Jonathan King's 1971 version of \"Hooked on a Feeling\" includes the words 'ooga chaka'.\"? Yes, no, or maybe? Yes\n###\nBoleslav William Felix Robert Sienkiewicz ( ; born May 3, 1958), better known as Bill Sienkiewicz, is an Eisner Award-winning American artist who produces comic book art, primarily for Marvel Comics' \"The New Mutants\" and \"\". Sienkiewicz often utilizes oil painting, collage, mimeograph, and other forms generally uncommon in comic books. Are we justified in saying that \"Bill Sienkiewicz is a baby boomer. \"? Yes, no, or maybe? Yes\n###\nThe 1938 Montana Grizzlies football team represented the University of Montana in the 1938 college football season as a member of the Pacific Coast Conference (PCC). 
Led by fourth-year head coach Doug Fessenden, they played their home games on campus in Missoula at Dornblaser Field. The Grizzlies finished the season with an overall record of 5\u20133\u20131, and were 0\u20131 in PCC play. Are we justified in saying that \"The Pacific Coast Conference featured the Montana Razorbacks in 1938.\"? Yes, no, or maybe?", "doc_id": 871, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36732, 44373, 15534, 39668], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The McLaren MP4/1 (initially known as the MP4) was a Formula One racing car produced by the McLaren team. It was used during the 1981, 1982 and 1983 seasons. It was the first Formula One car to use a carbon fibre composite monocoque, a concept which is now ubiquitous. Are we justified in saying that \"The McLaren team were proud of the McLaren MP4/1 as it was quick\"? Yes, no, or maybe? Maybe\n###\nCavalry Sunday is the annual parade of the Combined Cavalry Old Comrades Association which takes place in Hyde Park each year. Retired members of the cavalry regiments march to a service and commemoration around the band stand in the SE corner of Hyde Park and the nearby memorial to the bombing which took place there. The old troopers typically wear bowler hats and march with furled umbrellas. Are we justified in saying that \"Calvary Sunday takes place on the second Sunday of March.\"? Yes, no, or maybe? Maybe\n###\nPoor Pretty Eddie is a 1975 American film starring Leslie Uggams, Shelley Winters and Michael Christian. Made on a relatively small budget, it is known for having an atypical narrative and directorial style, which combines elements of horror, exploitation film making, Southern gothic, and pornographic film. It has subsequently become popular in cult and B movie circles. Are we justified in saying that \"The movie was produced BY Fox\"? Yes, no, or maybe? Maybe\n###\nStaunton Mall is a shopping mall in Augusta County, Virginia, United States. It is slightly outside the city limits of Staunton, Virginia. It has 4 anchors in operation include Belk, JCPenney, Peebles and Gold's Gym (previously Sears Surplus and Goody's Family Clothing), with former anchors including Books-A-Million and Steve & Barry's. Are we justified in saying that \"Staughton mall is the biggest in the City.\"? Yes, no, or maybe? Maybe\n###\nThe Big 12 Conference is a ten-school collegiate athletic conference headquartered in Irving, Texas. It is a member of the NCAA's Division I for all sports; its football teams compete in the Football Bowl Subdivision (FBS; formerly Division I-A), the higher of two levels of NCAA Division I football competition. Member schools are located in Iowa, Kansas, Oklahoma, Texas, and West Virginia. Are we justified in saying that \"The Big 12 Conference has 3 schools from Texas.\"? 
Yes, no, or maybe?", "doc_id": 571, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17235, 44612, 19238, 13749], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Weezer, also known as the White Album, is the eponymous tenth studio album by American rock band Weezer, released on April 1, 2016. The album marks their fourth self-titled release and their first produced by Jake Sinclair. It is the first release through Crush Music and was distributed by Atlantic Records. Are we justified in saying that \"Weezer is the band's 9th album\"? Yes, no, or maybe? No\n###\nVasili Vyacheslavovich Blagov (Russian: \u0412\u0430\u0441\u0438\u043b\u0438\u0439 \u0412\u044f\u0447\u0435\u0441\u043b\u0430\u0432\u043e\u0432\u0438\u0447 \u0411\u043b\u0430\u0433\u043e\u0432 ; born October 29, 1954 in Moscow) is a Russian pair skater who competed for the Soviet Union. With partner Irina Cherniaeva, he represented the Soviet Union at the 1972 Winter Olympics where they placed 6th. Are we justified in saying that \"Vasili Vyacheslavovich Blagov starts with an A.\"? Yes, no, or maybe? No\n###\nAfter the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal. Are we justified in saying that \"The Empire of Japan invaded and occupied the Northeast over 70 Years ago.\"? Yes, no, or maybe? Yes\n###\nThe 1974 New York Mets season was the 13th regular season for the Mets, who played home games at Shea Stadium. Led by manager Yogi Berra, the team finished the season with a record of 71\u201391, placing fifth in the National League East. This was the first time the Mets had a losing season since 1968. Are we justified in saying that \"The 1974 New York Mets had a cheap beer day.\"? Yes, no, or maybe? Maybe\n###\nThe 1997 Indian vice-presidential election was held on 16 August 1997 to elect Vice-President of India. Krishan Kant defeated Surjit Singh Barnala to become 10th Vice-President of India. At the time of the election, VP office was vacant since the incumbent, K. R. Narayanan, had already inaugurated as President following his victory in the presidential election. Are we justified in saying that \"The 1997 Indian vice-presidential election was held in the ninth month of the year.\"? Yes, no, or maybe?", "doc_id": 561, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35683, 14027, 37273, 40619], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Art of Dying is a Canadian rock band fronted by Jonny Hetherington. The band is currently signed to Better Noise Records. Bassist Cale Gontier is the cousin of Adam Gontier, the lead singer of Saint Asonia and former lead singer of Three Days Grace. Prior to joining the band, Gontier and guitarist Tavis Stanley played in another band, Thornley. Are we justified in saying that \"Art of Dying is from Europe \"? Yes, no, or maybe? No\n###\nThe Mission Viejo Vigilantes were a minor league baseball team located in Mission Viejo, California. The team played in the independent Western Baseball League, and was not affiliated with any Major League Baseball team. Their home stadium was Mission Viejo Stadium near Saddleback College. Are we justified in saying that \"There are no Major League Baseball teams based in California.\"? Yes, no, or maybe? Maybe\n###\nFar from the Madding Crowd is a 2015 British romantic drama film directed by Thomas Vinterberg and starring Carey Mulligan, Matthias Schoenaerts, Michael Sheen, Tom Sturridge and Juno Temple. It is an adaptation of the 1874 novel of the same name by Thomas Hardy, the fourth time this novel has been filmed. Are we justified in saying that \"Far from the Madding Crowd is a comedy\"? Yes, no, or maybe? No\n###\nThe 2015 Latrobe City Traralgon ATP Challenger was a professional tennis tournament played on outdoor hard court. It was the fourth edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Traralgon, Australia between 26 October \u2013 1 November 2015. Are we justified in saying that \"There have been five tournaments in total.\"? Yes, no, or maybe? Maybe\n###\nCraig Lahiff (April 23, 1947 \u2013 2 February 2014) was an Australian film director. He grew up in the Adelaide suburb of Somerton Park and studied science at Adelaide University, then trained as a systems consultant before studying arts in film at Flinders University. He began working in the film industry on crews for movies such as \"Sunday Too Far Away\" and \"The Fourth Wish\". Are we justified in saying that \"Craig Lahiff directed several television shows.\"? Yes, no, or maybe?", "doc_id": 777, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [3324, 21926, 5179, 23385], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Amara Karan (born 1984) is a Sri Lankan-English actress who made her film d\u00e9but as the love interest in Wes Anderson's \"The Darjeeling Limited\". The film premi\u00e8red at the 2007 Venice Film Festival. Karan's second film role was as schoolgirl Peaches in the 2007 film \"St Trinian's\". Are we justified in saying that \"The film was not popular\"? Yes, no, or maybe? 
Maybe\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif. Are we justified in saying that \"There Is a Woman in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama\"? Yes, no, or maybe? No\n###\nComet in Moominland (Swedish: \"Kometjakten\" / \"Mumintrollet p\u00e5 kometjakt\" / \"Kometen kommer\") is the second in Tove Jansson's series of Moomin books. Published in 1946, it marks the first appearance of several main characters, like Snufkin and the Snork Maiden. Are we justified in saying that \"Snufkin did not appear in the book before Comet\"? Yes, no, or maybe? Yes\n###\nThe Cincinnati and Whitewater Canal Tunnel is a historic yet abandoned canal tunnel in the southwestern corner of the U.S. state of Ohio. Located within the village of Cleves near Cincinnati, it was constructed in 1837 as part of the Whitewater Canal system. Since the canal's closure, it has largely been forgotten, but it has been designated a historic site. Are we justified in saying that \"The Cincinnati and Whitewater Canal Tunnel has been seen by Bill.\"? Yes, no, or maybe? Maybe\n###\nThomas Carr Frank (born March 21, 1965) is an American political analyst, historian, journalist, and columnist for \"Harper's Magazine\". He wrote \"The Tilting Yard\" column in the \"Wall Street Journal\" from 2008 to 2010, and he co-founded and edited \"The Baffler\". He has written several books, most notably \"What's the Matter with Kansas?\" (2004) and \"Listen, Liberal\" (2016). Are we justified in saying that \"Harpers Magazine is the same as Harpers Bazaar.\"? Yes, no, or maybe?", "doc_id": 996, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15143, 13721, 40004, 34344], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "USFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico. Are we justified in saying that \"The USFC \"Fish Hawk\" is no longer in operation. \"? Yes, no, or maybe? Yes\n###\n\"In Due Time\" is the lead single from Killswitch Engage's sixth studio album, \"Disarm the Descent\". The song is the band's first single to feature vocalist Jesse Leach since 2003's \"The Element of One\". The song charted at no. 23 on the Active rock chart and no. 26 on the Mainstream Rock chart. Are we justified in saying that \"\"In Due Time\" features the vocalist Jessica Leach.\"? Yes, no, or maybe? 
No\n###\nWarriors of Virtue is a 1997 Chinese-American martial arts fantasy film directed by Ronny Yu and starring Angus Macfadyen, Mario Yedidia, and Marley Shelton. It was released in English, Mandarin and Cantonese-language versions. The creature effects were designed by Academy Award-nominated special effect production house Alterian, Inc. Are we justified in saying that \"The film was directed by a man with the last name Yu\"? Yes, no, or maybe? Yes\n###\nThe 315th Operations Group is a United States Air Force Reserve unit assigned to the 315th Airlift Wing. The unit is stationed at Charleston Air Force Base, South Carolina. The 315th Group controls all operational McDonnell Douglas C-17 Globemaster III flying squadrons of the 315th Airlift Wing. It was activated in 1992, when Air Force Reserve Command implemented the Objective Wing organization. Are we justified in saying that \"315th Airlift Wing consists of Air force reserves and active-duty military personal\"? Yes, no, or maybe? Maybe\n###\n\"Toi, la musique et moi\" (English translation: \"You, the Music and I\") was the Monegasque entry in the Eurovision Song Contest 1976, performed in French by French singer Mary Christy. Christy recorded the song in five languages; French, Italian (as \"La musica e noi due\"), Spanish (\"La m\u00fasica, t\u00fa y yo\"), German (\"Die Musik und ich\") and English (\"Thank You for Rushing into My Life\"). Are we justified in saying that \"\"Toi, la musique et moi\" has been translated into more than three languages\"? Yes, no, or maybe?", "doc_id": 132, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7801, 2471, 9550, 30902], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Julian Ricardo Marley (born 4 June 1975) is a British Jamaican reggae musician. He is the son of reggae artist and performer Bob Marley, and Lucy Pounder. Julian is known to be a Grammy award nominated, reggae singer, musician, songwriter, producer and humanitarian. He follows into his father\u2019s footsteps and is a devout Rastafarian who uses his music to inspire his life and spirituality. Are we justified in saying that \"Bob Marley's son, Julian, is also a musician.\"? Yes, no, or maybe? Yes\n###\nAmerican football strategy concerns the deployment of offensive, defensive, and special teams players and the execution of plays in American football. In American football, there are a vast array of positions, formations, strategies, plays and types of play calling systems that are utilized. If a strategy is for a particular game, it is known as a \"game plan\". Are we justified in saying that \"American football strategy will improve a teams performance.\"? Yes, no, or maybe? Maybe\n###\nGhost Notes is the fifth full-length studio album by American rock band Veruca Salt, released on July 10, 2015, through El Camino Records. Produced by Brad Wood, who also produced the band's debut \"American Thighs\", it is the first to feature the band's original lineup since their second album, \"Eight Arms to Hold You\" (1997). 
Are we justified in saying that \"El Camino was the first to sign Salt\"? Yes, no, or maybe? Maybe\n###\nBaar is a railway station in the Swiss canton of Zug, situated in the municipality of Baar. The station is located on the Z\u00fcrich to Lucerne railway line and is an intermediate stop for InterRegio trains from Z\u00fcrich to Lucerne and on Z\u00fcrich S-Bahn line S9. Are we justified in saying that \"Baar is the first station before Lucerne.\"? Yes, no, or maybe? Maybe\n###\nPeter John Reynolds (6 November 1939 \u2013 26 September 2001) was a British archaeologist known for his research in experimental archaeology and the British Iron Age and for being recruited as the first director of Butser Ancient Farm, a working replica of an Iron Age farmstead in Hampshire. Are we justified in saying that \"Peter John Reynolds sadly passed away on the 33 of september, 2001\"? Yes, no, or maybe?", "doc_id": 765, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18753, 30230, 33132, 28917], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Zale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990). Are we justified in saying that \"Zale Dalen is a Canadian film and television director. He is proud of his film the hounds of Notre Dame\"? Yes, no, or maybe? Maybe\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"There are no plans to expand the The San Nicolao Tunnel\"? Yes, no, or maybe? Maybe\n###\nThe Outsiders are a professional wrestling tag team consisting of Kevin Nash and Scott Hall, best known for their first appearances in World Championship Wrestling (WCW) in 1996. They later teamed also in the World Wrestling Federation (WWF), Total Nonstop Action Wrestling (TNA), and Pro Wrestling ZERO1-MAX. Are we justified in saying that \"The Outsiders are both dead\"? Yes, no, or maybe? Maybe\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers. Are we justified in saying that \"Mickey Rooney had other children.\"? Yes, no, or maybe? Maybe\n###\nHamsalekha (born Govindaraju Gangaraju on 23 June 1951) is an Indian film composer and a songwriter who works in South Indian cinema, predominantly in the Kannada film industry since the late 1980s. 
He is also a screenplay writer, dialogue writer, instrumentalist and a conductor. Composed and written for over 300 feature films. Are we justified in saying that \"Hamsalekha wrote stories.\"? Yes, no, or maybe?", "doc_id": 648, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23411, 42749, 6542, 36518], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jeffrey Orlando Hunter (born April 12, 1966) is a former American football defensive lineman. In a career lasting almost a decade, he played five seasons for four different teams in the National Football League, as well as in the Canadian Football League and the World League of American Football. Hunter played college football at Albany State University in Albany, Georgia. Are we justified in saying that \"Jeffrey Orlando Hunter played defense.\"? Yes, no, or maybe? Yes\n###\nBabar: King of the Elephants is a 1999 animated film made by Nelvana Limited, Homemade Films, and TMO-Loonland and released in theaters in Canada and the U.S. by Alliance Communications and later on home video by HBO Home Video. Based on Jean de Brunhoff's book series, it is the second \"Babar\" film, following \"\". Are we justified in saying that \"HBO first played Babar: King of the Elephants in the year 1999\"? Yes, no, or maybe? No\n###\n\"Toi, la musique et moi\" (English translation: \"You, the Music and I\") was the Monegasque entry in the Eurovision Song Contest 1976, performed in French by French singer Mary Christy. Christy recorded the song in five languages; French, Italian (as \"La musica e noi due\"), Spanish (\"La m\u00fasica, t\u00fa y yo\"), German (\"Die Musik und ich\") and English (\"Thank You for Rushing into My Life\"). Are we justified in saying that \"\"Toi, la musique et moi\" has been translated into multiple languages\"? Yes, no, or maybe? Yes\n###\nThomas Carr Frank (born March 21, 1965) is an American political analyst, historian, journalist, and columnist for \"Harper's Magazine\". He wrote \"The Tilting Yard\" column in the \"Wall Street Journal\" from 2008 to 2010, and he co-founded and edited \"The Baffler\". He has written several books, most notably \"What's the Matter with Kansas?\" (2004) and \"Listen, Liberal\" (2016). Are we justified in saying that \"Carr is not known widely as a car enthusiast.\"? Yes, no, or maybe? Maybe\n###\n\"Beyond This Earthly Realm\" is the eleventh episode of the fourth season of the American animated television series \"Adventure Time\". The episode was written and storyboarded by Ako Castuera and Jesse Moynihan, from a story by Patrick McHale, Kent Osborne, and Pendleton Ward. It originally aired on Cartoon Network on June 11, 2012. Are we justified in saying that \"\"Beyond This Earthly Realm\" was written by professional comedians\"? 
Yes, no, or maybe?", "doc_id": 852, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42118, 11876, 32111, 26654], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bear River City is a city in Box Elder County, Utah, United States. The population was 853 at the 2010 census, an increase over the 2000 population of 750. The population was sufficient under Utah state law for Bear River to become a city near the end of 2000. Although current state law requires a minimum population of 1000 for cities, Bear River City remains a city. Are we justified in saying that \"Bear River City is a city because they signed a petition to be a city.\"? Yes, no, or maybe? Maybe\n###\nRachel Brosnahan (born December 15, 1990) is an American actress. She is best known for her roles in the films \"The Unborn\" (2009) and \"Beautiful Creatures\" (2013) and for portraying Rachel Posner in the Netflix series \"House of Cards\", for which she was nominated for an Emmy Award. Are we justified in saying that \"Rachel Brosnahan was born in the 50s.\"? Yes, no, or maybe? No\n###\nPrincess Caroline of Gloucester (Caroline Augusta Maria; 24 June 177414 March 1775) was an infant member of the British Royal Family, a great-grandchild of George II, niece of George III and daughter of the 1st Duke of Gloucester and Edinburgh and his wife, Maria Walpole, daughter of Sir Edward Walpole and his mistress Dorothy Clement. Are we justified in saying that \"Princess Caroline was 1 day old when she died\"? Yes, no, or maybe? No\n###\nSong'z U Can't Find is a compilation of Celly Cel's guest appearance on other rappers albums and compilations. The project was released in 2002 for Boss Up Muzik and was produced by Celly Cel, Studio Ton and G-Man Stan. This compilation marked the first Celly Cel album not to chart on any album charts. Guests include E-40, B-Legit, Rappin' 4-Tay, WC, Baby Beesh and C-Bo. Are we justified in saying that \"Celly Cel is a very silly rapper\"? Yes, no, or maybe? Maybe\n###\nChief Crazy Horse is a 1955 American CinemaScope Technicolor Western film directed by George Sherman starring Victor Mature, Suzan Ball and John Lund. The film is a fictionalized biography of the Lakota Sioux Chief Crazy Horse. It was also known as \"Valley of Fury\". Are we justified in saying that \"Chief Crazy Horse is not a south american film\"? Yes, no, or maybe?", "doc_id": 52, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27966, 337, 34227, 16509], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Chinese University of Hong Kong (CUHK) is a public research university in Shatin, Hong Kong formally established in 1963 by a charter granted by the Legislative Council of Hong Kong. It is the territory's second oldest university and was founded as a federation of three existing colleges \u2013 Chung Chi College, New Asia College and United College \u2013 the oldest of which was founded in 1949. Are we justified in saying that \"CUHK is a co-ed institution. \"? Yes, no, or maybe? Maybe\n###\nThe Cars are an American rock band that emerged from the new wave scene in the late 1970s. The band originated in Boston, Massachusetts, in 1976, with singer, rhythm guitarist and songwriter Ric Ocasek, singer and bassist Benjamin Orr, lead guitarist Elliot Easton, keyboardist Greg Hawkes and drummer David Robinson. Are we justified in saying that \"The Cars originated in MA.\"? Yes, no, or maybe? Yes\n###\n\"The Bear and the Maiden Fair\" is the seventh episode of the third season of HBO's fantasy television series \"Game of Thrones\", and the 27th episode of the series overall. The episode was written by George R. R. Martin, the author of the \"A Song of Ice and Fire\" novels on which the series is based, and was directed by Michelle MacLaren, her directorial debut for the series. Are we justified in saying that \"The Bear and the Maiden Fair is about Game of Thrones.\"? Yes, no, or maybe? Yes\n###\nKapp Heights is a census-designated place located in Point Township, Northumberland County in the state of Pennsylvania. The community is located very close to the borough of Northumberland along Pennsylvania Route 147, near the confluence of the West Branch Susquehanna River and Susquehanna Rivers. As of the 2010 census the population was 863 residents. Are we justified in saying that \"There is 147 people in Kapp Heights.\"? Yes, no, or maybe? No\n###\nNuestra Belleza Nuevo Le\u00f3n 2007, was held at Las Lomas Eventos in Monterrey, Nuevo Le\u00f3n on July 25, 2007. At the conclusion of the final night of competition, Anagabriela Espinoza of San Pedro Garza Garc\u00eda was crowned the winner. Espinoza was crowned by outgoing Nuestra Belleza Nuevo Le\u00f3n titleholder, Mariana Lombard. Eight contestants competed for the state title. Are we justified in saying that \"Nuestra Belleza Nuevo Le\u00f3n 2007 was a biking event.\"? Yes, no, or maybe?", "doc_id": 949, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44300, 14771, 17117, 13594], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Middlewich Paddies are an Irish folk band formed in 1979 in the town of Middlewich in Cheshire. 
Although not widely known outside of folk music circles, two members of the band were instrumental in setting up the Middlewich folk and boat festival which has now become a recognised festival on the folk circuit. Are we justified in saying that \"Two members of \"The Middlewich Paddies\" helped set up the Middlewich folk and boat festival in the 80's.\"? Yes, no, or maybe? Maybe\n###\nPaul Hausser (7 October 1880 \u2013 21 December 1972) was a high-ranking commander in the Waffen-SS of Nazi Germany during World War II who played a key role in the post-war efforts by former members of the Waffen-SS to achieve historical and legal rehabilitation. Are we justified in saying that \"Paul Hausser was cruel even towards other Germans\"? Yes, no, or maybe? Maybe\n###\nGood is a 2008 drama film based on the stage play of the same name by C. P. Taylor. It stars Viggo Mortensen, Jason Isaacs, and Jodie Whittaker, and was directed by Vicente Amorim. The film premiered at the Toronto International Film Festival on 8 September 2008. Are we justified in saying that \"Good's script was written by Vicente Amorim.\"? Yes, no, or maybe? Maybe\n###\nMichael Hunter, Jr. (born July 10, 1988) is an American professional boxer who challenged for the WBO junior heavyweight title in 2017. As an amateur he won the National Championships as a super heavyweight in 2007 and 2009, and qualified for the 2012 Olympics in the heavyweight division. He is the son of former professional boxer Mike \"the Bounty\" Hunter. Are we justified in saying that \"Michael Hunter is an amateur boxer.\"? Yes, no, or maybe? No\n###\nShadowgun Legends is an upcoming first-person shooter video game developed and published by Madfinger Games for Android and iOS devices. It is the 3rd primary installment of the Shadowgun series, a sequel to the original Shadowgun and Shadowgun Deadzone, both being multiple award-winning games from 2011 and 2012 respectively. Are we justified in saying that \"Shadowgun Deadzone was released in 2011\"? Yes, no, or maybe?", "doc_id": 761, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [4624, 25401, 28894, 32985], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"A Leela of Her Own\" is the sixteenth episode in the third season of the animated series \"Futurama\". The episode is an homage to \"A League of Their Own\". It originally aired on the Fox network in the United States on April 7, 2002. Bob Uecker provided the voice of himself, Tom Kenny provided the voice of Abner Doubledeal, and Hank Aaron guest starred as himself and Hank Aaron XXIV. Are we justified in saying that \"A Leela of Her Own is an episode of Futurama\"? Yes, no, or maybe? Yes\n###\nMiniszt\u00e1r is a Hungarian pop group formed in 2000 and consisting of Georgina ('Gina') Poly\u00e1kovics, Vivien Gonda and M\u00e1rk\u00f3 ('M\u00e1rk') Tak\u00e1cs. The band has released two albums to date, as well as a video DVD. The group is one of many to cover the popular song Dragostea Din Tei. Are we justified in saying that \"Miniszt\u00e1r was formed over 10 years ago\"? 
Yes, no, or maybe? Yes\n###\nSeven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers. Are we justified in saying that \"Clair Huffaker wrote the book in 1960.\"? Yes, no, or maybe? Maybe\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate. Are we justified in saying that \"The company will make a bar with no toffee.\"? Yes, no, or maybe? Maybe\n###\nRemember the Daze is a 2007 drama film released in theaters in April 2008. The film was directed by Jess Manafort. The plot of the movie has been described as \"a glimpse into the teenage wasteland of suburbia 1999 that takes place over 24-hours, and the teenagers who make their way through the last day of high school in the last year of the past millennium.\" Are we justified in saying that \"It has been 12 years since the theater release of the movie Remember the Daze. \"? Yes, no, or maybe?", "doc_id": 716, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34211, 2768, 11848, 30146], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Smithereens was sang by at least 2 people in the acoustic version.\"? Yes, no, or maybe? Yes\n###\nErnest R. Kroeger (August 10, 1862 \u2013 April 7, 1934) was an American composer. He is mainly known for the pedagogical works he composed for piano; he also taught music in St. Louis, Missouri. Today his papers are held at the Missouri Historical Society. Are we justified in saying that \"Today, Ernest R. Kroeger's works are housed at the Missouri Society for the Prevention of Cruelty to Animals.\"? Yes, no, or maybe? No\n###\nDavid Armand (born 1980) is an American writer of fiction, non-fiction, and poetry. He has published three novels, \"The Pugilist's Wife\", \"Harlow\", and \"The Gorge\". He has also published a collection of poems, \"The Deep Woods\", and a memoir titled \"My Mother's House\". He is currently Writer-in-Residence at Southeastern Louisiana University. Are we justified in saying that \"David Armand is a girl.\"? Yes, no, or maybe? No\n###\nPhichai Railway Station is a railway station located in Nai Mueang Subdistrict, Phichai District, Uttaradit. It is located 447.553\u00a0km from Bangkok Railway Station and is a class 2 railway station. It is on the Northern Line of the State Railway of Thailand. 
Phichai Railway Station opened as part of the Northern Line extension from Phitsanulok to Ban Dara Junction in November 1908. Are we justified in saying that \"The station is very abandoned\"? Yes, no, or maybe? Maybe\n###\nNannina de' Medici (14 February 1448 \u2013 14 May 1493), born Lucrezia de' Medici, was the second daughter of Piero di Cosimo de' Medici and Lucrezia Tornabuoni. She was thus the elder sister of Lorenzo de' Medici. She married Bernardo Rucellai. Her father's name was Piero, so she is sometimes known as Lucrezia di Piero de' Medici. Are we justified in saying that \"Lucrezia de' Medici had multiple siblings other than Lorenzo\"? Yes, no, or maybe?", "doc_id": 229, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29658, 33577, 40215, 38642], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Analyze This is a 1999 gangster comedy film directed by Harold Ramis, who co-wrote the screenplay with playwright Kenneth Lonergan and Peter Tolan. The film stars Robert De Niro as a mafioso and Billy Crystal as his psychiatrist. A sequel, \"Analyze That\", was released in 2002. Are we justified in saying that \"Analyze This has a short ending.\"? Yes, no, or maybe? Maybe\n###\nUpper Grosvenor Street is a historic street in Mayfair, London, United Kingdom. It runs from the Grosvenor House Hotel off Park Lane to the Embassy of the United States off Grosvenor Square. The Embassy of Monaco is located at number 7. Odey Asset Management, a hedge fund run by Crispin Odey, is located at number 12. Are we justified in saying that \"Upper Grosvenor Street has changed names\"? Yes, no, or maybe? Maybe\n###\nMarvin Ivan \"Buck\" Barrow (March 14, 1903 \u2013 July 29, 1933) was a member of the Barrow Gang. He was the older brother of the gang's leader, Clyde Barrow. He and his wife Blanche were wounded in a gun battle with police four months after they joined up with Bonnie and Clyde. Marvin died of his wounds. Are we justified in saying that \"Barrow died in 1933\"? Yes, no, or maybe? Yes\n###\nHistory of Joseph Smith by His Mother is a biography of Joseph Smith, founder of the Latter Day Saint movement, according to his mother, Lucy Mack Smith. It was originally titled Biographical Sketches of Joseph Smith, the Prophet, and His Progenitors for Many Generations and was published by Orson Pratt in Liverpool in 1853. Are we justified in saying that \"Joseph Smith was not an orphan\"? Yes, no, or maybe? Yes\n###\nThe Wolfsonian\u2013Florida International University or The Wolfsonian-FIU, located in the heart of the Art Deco District of Miami Beach, Florida, is a museum, library and research center that uses its collection to illustrate the persuasive power of art and design. For fifteen years, The Wolfsonian has been a division within Florida International University. Are we justified in saying that \"For the total amount of years that is equivalent to five multiplied by five, The Wolfsonian has been a division within Florida International University.\"? 
Yes, no, or maybe?", "doc_id": 353, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [23241, 14107, 36587, 2009], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jeffrey Orlando Hunter (born April 12, 1966) is a former American football defensive lineman. In a career lasting almost a decade, he played five seasons for four different teams in the National Football League, as well as in the Canadian Football League and the World League of American Football. Hunter played college football at Albany State University in Albany, Georgia. Are we justified in saying that \"Hunter also was very good at basketball.\"? Yes, no, or maybe? Maybe\n###\nUna Lillian Paisley (born 18 November 1922 in Kew in Melbourne in Victoria - died 1977 in Kew, Victoria) was an Australian cricket player. She played twelve Test matches for the Australia national women's cricket team. She captained the Australia national women's cricket team in four Test matches against New Zealand and England. Are we justified in saying that \"Una Lillian Paisley met Bush.\"? Yes, no, or maybe? Maybe\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain. Are we justified in saying that \"Santa Cova Funicular goes down a mountain.\"? Yes, no, or maybe? Yes\n###\nPacific Novelty was a developer of coin-operated arcade video games. \"Deep Death\" was their first title, which was later licensed by Game Plan and re-released as \"Shark attack\" (1981). \"Thief\", a \"Pac-Man\" styled maze chase, was their greatest success. Are we justified in saying that \"\"Thief\", a styled maze chase, was the greatest game released that year.\"? Yes, no, or maybe? Maybe\n###\nMemento is a 2000 American neo-noir psychological thriller film directed and written by Christopher Nolan, and produced by Suzanne and Jennifer Todd. The film's script was based on a pitch by Jonathan Nolan, who later wrote the story \"Memento Mori\" from the concept. It stars Guy Pearce, Carrie-Anne Moss, and Joe Pantoliano. Are we justified in saying that \"Memento was written and directed by two different individuals\"? Yes, no, or maybe?", "doc_id": 177, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37475, 15651, 32251, 17868], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Salvatore Mineo, Jr. 
(January 10, 1939February 12, 1976), was an American film and theatre actor, known for his performance as John \"Plato\" Crawford opposite James Dean in the film \"Rebel Without a Cause\" (1955). He was twice nominated for the Academy Award for Best Supporting Actor, for his roles in \"Rebel Without a Cause\" and \"Exodus\" (1960). Are we justified in saying that \"He was nominated for an Emmy.\"? Yes, no, or maybe? Maybe\n###\nSabrina Le Beauf (born March 21, 1958) is an American actress best known for her portrayal of Sondra Huxtable on the NBC situation comedy \"The Cosby Show\". She has voiced the character Norma Bindlebeep on the Nick at Nite animated series \"Fatherhood\", a show based on Bill Cosby's book of the same name. Are we justified in saying that \"Sabrina Le Beauf played Sondra for one season.\"? Yes, no, or maybe? Maybe\n###\nSan Francisco Bay Ferry is a passenger ferry service on the San Francisco Bay, administered by the San Francisco Bay Area Water Emergency Transportation Authority (WETA). San Francisco Bay Ferry is not affiliated with Golden Gate Ferry, which provides passenger ferry service to Marin County. Are we justified in saying that \"San Francisco Bay Ferry operates train service along the California coast.\"? Yes, no, or maybe? No\n###\n\"Yellow Ledbetter\" is the 2nd episode of the sixth season of the American series \"The Vampire Diaries\" and the series' 113th episode overall. \"Yellow Ledbetter\" was originally aired on October 9, 2014, on The CW. The episode was written by Julie Plec and directed by Pascal Verschooris. Are we justified in saying that \"Since \"Yellow Ledbetter\" is the 113th episode of \"Vampire Diaries\", there must have been 112 episodes that aired before it. \"? Yes, no, or maybe? Yes\n###\nFather Xmas is a 2001 short film from director Marie Rose and the American Film Institute's Directing Workshop for Women starring Dakota Fanning as six-year-old Clairee who learns from her older brother (Stephen Fanning) that Santa Claus is not real and that their father is fighting in the Vietnam War. Are we justified in saying that \"Father Xmas has an ending.\"? Yes, no, or maybe?", "doc_id": 267, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11714, 40278, 15495, 4715], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Take Two is a duet album by Motown label mates Marvin Gaye and Kim Weston, released August 25, 1966 on the Motown's Tamla label. The album was titled after its most successful selection, the Top 5 R&B/Top 20 Pop hit \"It Takes Two\", which was to this point Gaye's most successful duet with another singer. The album also featured the modest hit \"What Good Am I Without You?\". Are we justified in saying that \"The album Take Two had more than three hits.\"? Yes, no, or maybe? Maybe\n###\nB\u00ebor the Old is a fictional character in J.R.R. Tolkien's Middle-earth legendarium. He appears in \"The Silmarillion\" as the leader of the First House of the Edain in the First Age, which was called the \"Folk of B\u00ebor\" after him. 
He was the father of Baran and Belen and ancestor of Beren Camlost. Are we justified in saying that \"B\u00ebor the Old is a fictional character in J.R.R. was the father of Baran and Belen and Beren Camlost.\"? Yes, no, or maybe? No\n###\nThis is a list of notable editorial cartoonists of the past and present sorted by nationality. An editorial cartoonist is an artist, a cartoonist who draws editorial cartoons that contain some level of political or social commentary. The list is incomplete; it lists only those editorial cartoonists for whom a Wikipedia article already exists. Are we justified in saying that \"Wikipedia is working to complete this list.\"? Yes, no, or maybe? Maybe\n###\nMatsuri Mizuguchi (\u6c34\u53e3 \u307e\u3064\u308a , Mizuguchi Matsuri , born October 28, 1986 in Yamagata Prefecture) is a Japanese voice actress who started her career in 2007. She is affiliated with Yellowtail. This voice actress shares the same exact date of birth and age as another unrelated Japanese voice actress and singer, Aki Toyosaki. Are we justified in saying that \"Yellowtail first considered the name Bluetail.\"? Yes, no, or maybe? Maybe\n###\nTamanna (Hindi: \u0924\u092e\u0928\u094d\u0928\u093e , translation: Desire) is a 1997 Indian drama film directed by Mahesh Bhatt. It stars Paresh Rawal, Pooja Bhatt, Sharad Kapoor and Manoj Bajpayee in the lead roles The screenplay was written by Tanuja Chandra. The story was written by Tanuja Chandra and Mahesh Bhatt. It was produced by Pooja Bhatt. Are we justified in saying that \"The film was written and produced by three different people.\"? Yes, no, or maybe?", "doc_id": 102, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5153, 20835, 19740, 44453], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 2015 Latrobe City Traralgon ATP Challenger was a professional tennis tournament played on outdoor hard court. It was the fourth edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Traralgon, Australia between 26 October \u2013 1 November 2015. Are we justified in saying that \"The fourth edition of the tournament was played on outdoor hard court\"? Yes, no, or maybe? Yes\n###\nBlack Wind, White Land is a 1993 documentary film, researched and produced by the founders of the Chernobyl Children's Project International and explores the Chernobyl nuclear disaster of 1986 and its consequences for the handicapped development of the people in Belarus, Russia and Ukraine. The film was directed by Gene Kerrigan and produced by Ali Hewson, the wife of U2's singer Bono. Are we justified in saying that \"Bono is married to Hewson.\"? Yes, no, or maybe? Yes\n###\nGwinnett County Public Schools is a school district operating in Gwinnett County, Georgia, United States. GCPS is the largest school system in Georgia, with 139 schools and an estimated enrollment of 178,000 students for the 2016-2017 year. GCPS is estimated to be the 14th largest school district in the U.S. The district has its headquarters in an unincorporated area near Suwanee. 
Are we justified in saying that \"Peachtree Corners is within GCPS.\"? Yes, no, or maybe? Yes\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6. Are we justified in saying that \"The Nydala Abbey was still in operation in 1500.\"? Yes, no, or maybe? Maybe\n###\nSeveral politico-constitutional arrangements use reserved political positions, especially when endeavoring to ensure the rights of minorities or preserving a political balance of power. These arrangements can distort the democratic principle of \"one person - one vote\" in order to address special circumstances. Are we justified in saying that \"politico-constitutional arrangements are imaginanry\"? Yes, no, or maybe?", "doc_id": 117, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9378, 17973, 20811, 17572], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Telephone Shilpa Sangstha or TSS is a government owned telecoms company in Bangladesh. It launched the first Laptop made/assembled in Bangladesh, Doel, in 2011. It also manufactures analogue phones. It also assembles smartphone and regular cell phones in Bangladesh. Are we justified in saying that \"Telephone Shilpa Sangstha launched the first cell phone made/assembled in Bangladesh, Doel\"? Yes, no, or maybe? Maybe\n###\nBruno Mingeon (born September 7, 1967 in Bourg-Saint-Maurice, Savoie) is a French bobsledder who competed from 1988 to 2006. Competing in five Winter Olympics, he won a bronze medal in the four-man event (tied with Great Britain) at Nagano in 1998. He was born in Bourg-Saint-Maurice. Are we justified in saying that \"Bruno Mingeon won his first Olympic medal in 1967.\"? Yes, no, or maybe? No\n###\nThe Proteus Design Suite is a proprietary software tool suite used primarily for electronic design automation. The software is used mainly by electronic design engineers and electronic technicians to create electronic schematics and electronic prints for manufacturing printed circuit boards. Are we justified in saying that \"The Proteus Design Suite is a proprietary software tool suite is a place you can trust\"? Yes, no, or maybe? Maybe\n###\nWilson Dam is a dam spanning the Tennessee River between Lauderdale County and Colbert County in the U.S. state of Alabama. It impounds Wilson Lake. It is one of nine Tennessee Valley Authority (TVA) dams on the Tennessee River. The dam was declared a National Historic Landmark on November 13, 1966. Are we justified in saying that \"Wilson Dam is a popular place to fish\"? Yes, no, or maybe? Maybe\n###\nBlack Dahlia is a 2006 United States production horror film inspired by the mysterious unsolved murder of the\"Black Dahlia\", Hollywood actress Elizabeth Short. 
Instead of dramatizing the infamous 1947 murder of Short and the ensuing investigation, writer-director Ulli Lommel follows a series of contemporary L.A.-area homicides patterned after the 1947 slaying. Are we justified in saying that \"Black Dahlia was meant to scare people\"? Yes, no, or maybe?", "doc_id": 325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14486, 3159, 7920, 30106], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lament is the seventh studio album by British new wave band Ultravox, released in the UK on 6 April 1984. It was the last album featuring original drummer Warren Cann until the band's reunion album \"Brilliant\" in 2012. The album peaked at #8 on the UK album chart and was certified Gold by the BPI in June 1984 for 100,000 copies sold. It also reached #25 in Germany and #115 in the United States. Are we justified in saying that \"Lament was the best selling album by Ultravox in the United States\"? Yes, no, or maybe? Maybe\n###\nLibya TV (also known as Libya Al Ahrar TV) is a Libyan TV channel broadcast by satellite from its headquarters in Doha. The channel was created in 2011 during the Libyan Civil War. Its presents news, opinions, analysis, photo and video reports about Libya in specific and the region in a wider scope. It focuses on Libya\u2019s revolution and future toward building a democratic state. Are we justified in saying that \"Libya TV often shows news reports about war.\"? Yes, no, or maybe? Maybe\n###\nDestiny (Arabic: \u0627\u0644\u0645\u0635\u064a\u0631\u200e \u200e , translit.\u00a0Al-massir) is a 1997 French-Egyptian historical drama film directed by Youssef Chahine. It was screened out of competition at the 1997 Cannes Film Festival. The film was selected as the Egyptian entry for the Best Foreign Language Film at the 70th Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Destiny was not nominated for an Oscar award.\"? Yes, no, or maybe? Yes\n###\nHellgate (originally titled Shadows) is a 2011 American-Thai supernatural thriller directed and written by John Penney, starring William Hurt and Cary Elwes. Elwes plays the sole survivor of a car crash who, upon seeing ghosts, seeks help from a spiritual guru (Hurt). Are we justified in saying that \"Hellgate was more popular in Thailand than America\"? Yes, no, or maybe? Maybe\n###\nThe Circuit Gilles Villeneuve (also spelled Circuit Gilles-Villeneuve in French) is a motor racing circuit in Montreal, Quebec, Canada. It is the venue for the FIA Formula One Canadian Grand Prix. It has previously hosted the FIA World Sportscar Championship, the Champ Car World Series, the NASCAR Canadian Tire Series, the NASCAR Xfinity Series and the Grand-Am Rolex Sports Car Series. Are we justified in saying that \"The Circuit Gilles Villeneuve once had a massive world championship at it\"? 
Yes, no, or maybe?", "doc_id": 260, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44986, 18123, 43957, 44398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Punjab Control of Organised Crime Act, (PCOCA) is law to be enacted by Punjab state in India to combat organised crime. It is in process of approval as the Punjab Cabinet has yet not given its approval on account of few reservations about various clauses of the Act.The Act is designed on the pattern of Maharashtra Control of Organised Crime Act enacted in 1999. Are we justified in saying that \"Punjab Control of Organised Crime Act was enacted before the Maharashtra Control of Organised Crime Act \"? Yes, no, or maybe? No\n###\n\"Crawling\" is a song by American rock band Linkin Park. It is the second single from their debut album \"Hybrid Theory\" and is the fifth track on the album. It was released in 2001 as their second single and won a Grammy for Best Hard Rock Performance in 2002. In January 2011, \"Crawling\" was released in a Linkin Park DLC pack for \"Rock Band 3\". Are we justified in saying that \"Linkin Park are proud of the song for winning a grammy\"? Yes, no, or maybe? Maybe\n###\nChild Whispers (published in 1922) is the first published work of the English children's author Enid Blyton, illustrated by her childhood friend and collaborator Phyllis Chase. It is a collection of 28 poems, and one of Blyton's most popular and best-known poetry books. Are we justified in saying that \"Blyton knew Chase for a long time.\"? Yes, no, or maybe? Yes\n###\nRishika Singh is an Indian actress who appears in Kannada-language films. She is the daughter of film director Rajendra Singh Babu and granddaughter of Mysore-based film producer Shankar Singh and former Bengali actress Pratima Devi. Her brother Aditya also appears in Kannada films. Are we justified in saying that \"Rishika Singh's mother is an actress.\"? Yes, no, or maybe? Maybe\n###\nSourceMedia is a mid-sized diversified business-to-business digital media company owned by Observer Capital, which acquired the company from Investcorp in August 2014. Formerly the Thomson Media division of The Thomson Corporation, SourceMedia was spun off and sold by Thomson to Investcorp in 2004 for $350 million. Are we justified in saying that \"Investcorp made $350 million on the sale of SourceMedia. \"? Yes, no, or maybe?", "doc_id": 458, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38471, 21750, 35546, 11727], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Suntaragaali (Kannada: \u0cb8\u0cc1\u0c82\u0c9f\u0cb0\u0c97\u0cbe\u0cb3\u0cbf ) is a 2006 Indian Kannada romantic action film directed by Sadhu Kokila and written by Ranganath. Besides direction, Sadhu Kokila has composed the music and also enacted in a supporting role. The main cast includes Darshan, Rakshita and Ashish Vidyarthi besides Seetha, Umashree and Rangayana Raghu in other pivotal roles. Are we justified in saying that \"umashree never acted after this movie\"? Yes, no, or maybe? Maybe\n###\nLive in Concert is the second live music video title by singer and actress Cher. Released by HBO in 1999, it contained footage from Cher's Do You Believe? Tour specials filmed at the MGM Grand Garden Arena in Paradise, Nevada in 1999. It featured tracks from the Gypsys, Tramps & Thieves album to the Believe album, alongside various covers. She has 7 costume changes by stylist Bob Mackie. Are we justified in saying that \"Cher performs song from other artists.\"? Yes, no, or maybe? Yes\n###\nPrincess Ragnhild Coast is the portion of the coast of Queen Maud Land in Antarctica lying between 20\u00b0 E and the Riiser-Larsen Peninsula, at 34\u00b0 E. All but the eastern end of the coast is fringed by ice shelves. It was discovered by Capt. Hjalmar Riiser-Larsen and Capt. Nils Larsen in aerial flights from the ship Norvegia on February 16, 1931, and named for Princess Ragnhild of Norway. Are we justified in saying that \"Queen Maud Land in Antarctica was discovered in 1931.\"? Yes, no, or maybe? Maybe\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg. Are we justified in saying that \"Duel has a long beginning scene.\"? Yes, no, or maybe? Maybe\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries. Are we justified in saying that \"2007 was not an exciting time for football fans. \"? Yes, no, or maybe?", "doc_id": 658, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26345, 29553, 29884, 13741], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Michael Cassio, or simply Cassio, is a fictional character in William Shakespeare's \"Othello\". The source of the character is the 1565 tale \"Un Capitano Moro\" by Cinthio; Cassio is unnamed in Cinthio but referred to as \"the squadron leader\". 
In the play, Cassio is a young and handsome lieutenant under Othello's command who becomes one of Iago's several victims in a plot to ruin Othello. Are we justified in saying that \"The basis for Cassio is \"the squadron leader\" in \"Un Capitano Moro.\"\"? Yes, no, or maybe? Yes\n###\nThe Anchor Bankside is a pub in the London Borough of Southwark. It is in the Bankside locality on the south bank of the Thames close to Southwark Cathedral and London Bridge station. A tavern establishment (under various names) has been at the pub's location for over 800 years. Behind the pub are buildings that were operated by the Anchor Brewery. Are we justified in saying that \"The Anchor Bankside is a pub in the England Borough of Southwark\"? Yes, no, or maybe? No\n###\nThe Toffee Crisp bar is a chocolate bar first manufactured in the United Kingdom by Mackintosh's in 1963. It is now produced by Nestl\u00e9 in the UK. It consists of puffed rice embedded in soft toffee and shaped into a rectangular cuboid, the whole bar being covered by milk chocolate. Are we justified in saying that \"The Toffee Crisp bar was manufactored in 1963\"? Yes, no, or maybe? Yes\n###\nIn guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock. Are we justified in saying that \"Power chords are rarely used in punk rock\"? Yes, no, or maybe? No\n###\nZafar Mahmud (1923\u20132016) was a Royal Indian Air Force officer during the second world war, originally stationed in Burma and subsequently stationed in Quetta (in present-day Pakistan) from 1945 to 1947 before the partition of British India. He was sent to England a number of times to train with the Royal Air Force just before and after the war. Are we justified in saying that \"Zafar Mahmud fought in the Vietnam War.\"? Yes, no, or maybe?", "doc_id": 671, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20876, 2529, 22212, 27964], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Valley of Fire Road (also called the Valley of Fire Highway) is a road in northeastern Clark County, Nevada serving the Valley of Fire State Park. The roadway was previously designated State Route 40 (SR 40), and the segment within the state park is currently designated a Nevada Scenic Byway. Are we justified in saying that \"There are no areas in Nevada that are recognized as state parks.\"? Yes, no, or maybe? No\n###\nSamson and Delilah (French: \"Samson et Dalila\" ), Op. 47, is a grand opera in three acts and four scenes by Camille Saint-Sa\u00ebns to a French libretto by Ferdinand Lemaire. It was first performed in Weimar at the Grossherzogliches (Grand Ducal) Theater (now the Staatskapelle Weimar) on 2 December 1877 in a German translation. 
Are we justified in saying that \"Samson and Delilah is performed more in the English language than in German.\"? Yes, no, or maybe? Maybe\n###\nThe Six-Day War (Hebrew: \u05de\u05dc\u05d7\u05de\u05ea \u05e9\u05e9\u05ea \u05d4\u05d9\u05de\u05d9\u05dd , \"Milhemet Sheshet Ha Yamim\"; Arabic: \u0627\u0644\u0646\u0643\u0633\u0629 , \"an-Naksah\", \"The Setback\" or \u062d\u0631\u0628 \u06f1\u06f9\u0666\u06f7 , \"\u1e24arb 1967\", \"War of 1967\"), also known as the June War, 1967 Arab\u2013Israeli War, or Third Arab\u2013Israeli War, was fought between June 5 and 10, 1967 by Israel and the neighboring states of Egypt (known at the time as the United Arab Republic), Jordan, and Syria. Are we justified in saying that \"The war left many wounds in society.\"? Yes, no, or maybe? Maybe\n###\nThe Underground Man (1997) is a novel by Mick Jackson. Critically acclaimed, it was shortlisted for the Booker Prize for that year. It shows the life of an eccentric and reclusive Victorian Duke, loosely modelled on William Cavendish-Scott-Bentinck, 5th Duke of Portland. His latest scheme involves building a set of tunnels beneath his estate. Are we justified in saying that \"The Underground Man was critically acclaimed by the critic, James Smith.\"? Yes, no, or maybe? Maybe\n###\nThe Letter Black, formerly known as Breaking the Silence, is a Christian rock band that was formed in 2006 in Uniontown, Pennsylvania. The band consists of lead vocalist Sarah Anthony; her husband, lead guitarist and vocalist Mark Anthony; and drummer Justin Brown. Are we justified in saying that \"Sarah is not a biological woman\"? Yes, no, or maybe?", "doc_id": 874, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41410, 14565, 45457, 1991], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "James Wyatt (\"ca.\" 1968/1969) is a game designer and a former United Methodist minister. He works for Wizards of the Coast, where he has designed several award-winning supplements and adventures for the \"Dungeons & Dragons\" (\"D&D\") roleplaying game. He is the author of several sci-fi and fantasy novels, including a few Forgotten Realms books, and the 4th edition \"Dungeon Master's Guide\". Are we justified in saying that \"James Wyatt designed the roleplaying game \"Dungeons & Dragons\".\"? Yes, no, or maybe? Yes\n###\nThe Astra modelo 400 was a Spanish service pistol produced by weapons manufacturer Astra-Unceta y Cia SA. as a replacement for the Campo-Giro 1913/1916, which had also been chambered in 9mm Largo. It was the standard issue sidearm in the Spanish Army during the Spanish Civil War and also saw service in Germany during World War II. Are we justified in saying that \"Astra Modelo 400 was the sidearm standard in the army of the Spanish. \"? Yes, no, or maybe? Yes\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. 
The show is aired on Saturday afternoons and runs for 30 minutes. Are we justified in saying that \"The show premiered the year after 1999.\"? Yes, no, or maybe? No\n###\nCourtland Park is a sub-neighbourhood of Carleton Heights in River Ward in the west end of Ottawa, Canada. It is bounded on the north by Baseline Road, on the east by the Rideau River, on the south by Dynes Road and on the west by Fisher Avenue. Prince of Wales Drive runs thorough the neighbourhood. Are we justified in saying that \"Courtland Park was once surrounded by highways\"? Yes, no, or maybe? Maybe\n###\nThe Samsung Galaxy Tab 8.9 is an Android-based tablet computer designed and manufactured by Samsung, introduced on 22 March 2011 at CTIA wireless convention in its Samsung Unpacked event in Orlando. It is part of the Samsung Galaxy Tab series, and features an 8.9-inch display and a 1\u00a0GHz dual-core Nvidia Tegra 2 processor. Are we justified in saying that \"Samsung Galaxy has about 9 inch display\"? Yes, no, or maybe?", "doc_id": 456, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31990, 41235, 12260, 16017], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single. Are we justified in saying that \"Come Back in One Piece is not on the Romeo Must Die soundtrack.\"? Yes, no, or maybe? No\n###\nEldrid Nordb\u00f8 (born 12 August 1942) is a Norwegian politician for the Labour Party. She was personal secretary to the Minister of Social Affairs in 1971, state secretary to the prime minister (1986-89), and Minister of Trade and Shipping (1990-91). She is married to economist and politician Bj\u00f8rn Skogstad Aamo. Are we justified in saying that \"Eldrid Nordb\u00f8 lives in Norway.\"? Yes, no, or maybe? Maybe\n###\nThe Tesla Science Center at Wardenclyffe is a nonprofit organization established to develop a regional science and technology center at the site of Nikola Tesla's former Wardenclyffe laboratory on Long Island, New York. The center raised money through crowdfunding to purchase the property. Are we justified in saying that \"Nikola Tesla's former Wardenclyffe laboratory was owned by New York State\"? Yes, no, or maybe? Maybe\n###\nHarbour Place Shopping Centre (Irish: \"An Chuain Pl\u00e1s Ionad Siopad\u00f3ireachta\" ) is a shopping centre located in Mullingar, Ireland. The centre is anchored by Dunnes Stores, and it is overall the largest store in the shopping centre. It is one of the most well-known shopping centres in Mullingar, and one of the busiest in the town. Are we justified in saying that \"Harbour Place Shopping Centre is anchored by Macy Stores\"? Yes, no, or maybe? 
No\n###\nFasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings. Are we justified in saying that \"The recipe says to use canola oil\"? Yes, no, or maybe?", "doc_id": 441, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [831, 23895, 32357, 17651], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Last of Us Part II is an upcoming action-adventure survival horror video game developed by Naughty Dog and published by Sony Interactive Entertainment for PlayStation 4. It was announced at the PlayStation Experience event in December 2016, and will serve as the sequel to 2013's \"The Last of Us\". Are we justified in saying that \"The Last of Us Part II was released on December 2016.\"? Yes, no, or maybe? No\n###\nSidalcea oregana\" var. \"calva, the Wenatchee Mountains checker-mallow, is a very rare flowering plant variety that occurs only in five locations in the Wenatchee Mountains of Chelan County, Washington, United States. The plant has been placed on the Endangered species list. It is the rarest known plant in Washington state. Are we justified in saying that \"Sidalcea oregana grows everywhere.\"? Yes, no, or maybe? No\n###\nSouthpaw is a 2015 American sports drama film directed by Antoine Fuqua, written by Kurt Sutter and starring Jake Gyllenhaal, Forest Whitaker and Rachel McAdams. The film follows a boxer who sets out to get his life back on track after losing his wife in an accident and his young daughter to protective services. The film was released on July 24, 2015, by The Weinstein Company. Are we justified in saying that \"When it was released in 2015, the movie Southpaw was expected to be the top grossing movie in the box office.\"? Yes, no, or maybe? Maybe\n###\nOleg Smirnov (born April 8, 1980) is a Russian professional ice hockey right winger currently playing for HC Ryazan in the Russian Major League. He played in the Russian Superleague for Kristall Elektrostal, HC Lipetsk, HC Spartak Moscow, HC Dynamo Moscow, HC CSKA Moscow and Metallurg Novokuznetsk. He was drafted 144th overall in the 1998 NHL Entry Draft by the Edmonton Oilers. Are we justified in saying that \"Oleg Smirnov was born in Moscow on April 8, 1980.\"? Yes, no, or maybe? Maybe\n###\nThe Highway of Hope is a 1917 American Western silent film directed by Howard Estabrook and written by Harvey Gates and Willard Mack. The film stars House Peters, Sr., Kathlyn Williams, Jim Farley and Harry De Vere. The film was released on May 17, 1917, by Paramount Pictures. Are we justified in saying that \"House Peters, Sr. won an award for his singing in The Highway of Hope.\"? 
Yes, no, or maybe?", "doc_id": 786, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20720, 43325, 41398, 25342], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Christoph Ernst Friedrich von Forcade de Biaix (* 17 September 1821, B\u00fcren near Paderborn; \u2020 18 July 1891 at Reckenberg Castle, in Lichtenfels, Hesse) was a German Rittergut owner, Appellate Court Judge in Hamm, Supreme Court Judge in Berlin and Member of parliament in the German Reichstag. Are we justified in saying that \"Christoph has a long last name.\"? Yes, no, or maybe? Yes\n###\nSadat is a 1983 American two-part, four-hour television miniseries based on the life and death of the late 3rd President of Egypt, Anwar Sadat starring Louis Gossett Jr. as Sadat and Madolyn Smith as Sadat's wife, Jehan. It was distributed by Columbia Pictures Television through Operation Prime Time. Gossett's performance earned him a nomination for an Emmy Award and a Golden Globe Award. Are we justified in saying that \"The miniseries Sadat was filmed after his death.\"? Yes, no, or maybe? Yes\n###\nBoon Brewery (Brouwerij Boon) is a Belgian brewery situated in Lembeek, near Brussels, that mainly produces geuze and kriek beer of a fairly traditional lambic variety, but using distinctly modern brewing techniques and equipment. Other products of the brewery including Faro beer and Duivelsbier, the traditional beer of Halle. Are we justified in saying that \"I am the boon brewery\"? Yes, no, or maybe? No\n###\n\"Drop Girl\" is a song by American rapper Ice Cube, and produced by Redfoo for FooCo LLC.. The song, released on July 22, 2014. Drop Girl is the sixth single from Ice Cube's upcoming studio album \"Everythang's Corrupt\". The song features guest vocals from American singer Redfoo and fellow rapper 2 Chainz. In the chorus, it samples a part of Ice Cube's Today Was A Good Day. Are we justified in saying that \"\"Drop Girl\" is a song that was released over 2 decades ago\"? Yes, no, or maybe? No\n###\nAfter the Empire of Japan invaded and occupied the Northeast in 1931, the Chinese Communist Party organized small anti-Japanese guerrilla units, and formed their own Northeastern People's Revolutionary Army, dedicated to social revolution, but these were dwarfed by the Anti-Japanese Volunteer Armies which had been raised by their anti-Japanese, patriotic appeal. Are we justified in saying that \"The Japanese approved of the Empire of Japan. \"? Yes, no, or maybe?", "doc_id": 358, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [32397, 15715, 4441, 32870], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Princess Maria Gabriella of Savoy (Maria Gabriella Giuseppa Aldegonda Adelaide Ludovica Felicita Gennara; born 24 February 1940) is the middle daughter of Italy's last king, Umberto II, and Marie Jos\u00e9 of Belgium, the \"May Queen\", and a sister of the pretender to their father's throne, Vittorio Emanuele, Prince of Naples. She is an historical writer. Are we justified in saying that \"Maria liked to pen historical text\"? Yes, no, or maybe? Maybe\n###\nPhakisa Freeway is a motor racing circuit located in Odendaalsrus, South Africa. From 1999 to 2004, the venue hosted the South African motorcycle Grand Prix of the MotoGP championship. It has a capacity of 60,000 spectators and opened in 1999. The track has a 4.24\u00a0km road course and a 1.5 mi oval course. The oval track is an exact copy of Las Vegas Motor Speedway from 1997. Are we justified in saying that \"Phakisa Freeway was the only circuit between 1999 to 2004\"? Yes, no, or maybe? Maybe\n###\nAllium campanulatum is a species of wild onion known by the common name dusky onion or Sierra onion. This is a flowering plant native to the western United States from southeastern Washington and northern Oregon to southern California, and western Nevada. The dusky onion grows in foothills and mountains, especially in dry areas, such as chaparral habitats. Are we justified in saying that \"Sierra onion has a pleasant odor.\"? Yes, no, or maybe? Maybe\n###\nCruel World is a 2005 American horror comedy film co-produced and directed by Kelsey T. Howard. The film is about a psychotic man who loses a reality game show and subsequently kills the host. He uses the house where the show took place to film his own reality show. In the show, several contestants perform challenges, and the losers are killed rather than being sent home. Are we justified in saying that \"Cruel World was directed by Kelsey T. Howard.\"? Yes, no, or maybe? Yes\n###\nTurnagain, also called Buru Island, is an island of the \"Western Islands\" region of the Torres Strait Islands archipelago, located in the northern section of Torres Strait, Queensland, Australia. Turnagain is located within the Torres Strait Island Region Local government area. Are we justified in saying that \"Turnagain is connected to land\"? Yes, no, or maybe?", "doc_id": 683, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34472, 31008, 32848, 2704], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ethan Suplee (born May 25, 1976) is an American film and television actor best known for his roles as Seth Ryan in \"American History X\", Louie Lastik in \"Remember the Titans\", Frankie in \"Boy Meets World\", Randy Hickey in \"My Name Is Earl\", Toby in \"The Wolf of Wall Street\", and his roles in Kevin Smith films. 
Are we justified in saying that \"Ethan Suplee had an acting role in American History X. \"? Yes, no, or maybe? Yes\n###\nDave Dennis (born 20 January 1986 in Sydney) is a national representative rugby union footballer who plays professionally for the Exeter Chiefs He was educated at Richmond High School in Sydney, when he played in the Australian Schoolboys Rugby team in 2004. His primary position is blindside flanker. He can also play No.8. Are we justified in saying that \"Dave Dennis was born in Sydney\"? Yes, no, or maybe? Yes\n###\nRAF Mount Batten was a Royal Air Force station and flying boat base at Mount Batten, a peninsula in Plymouth Sound, Devon, England. Originally a seaplane station opened in 1917 as a Royal Navy Air Service Station Cattewater it became RAF Cattewater in 1918 and in 1928 was renamed RAF Mount Batten. The station motto was \"In Honour Bound\" which is the motto of the Mountbatten family. Are we justified in saying that \"The station was renamed in the century before the current century\"? Yes, no, or maybe? Yes\n###\nSarah Beth Noriega (born April 24, 1976) is a former indoor volleyball player. She played for Loyola Marymount University from 1994 to 1997 and was named the 1997 West Coast Conference Player of the Year. She also played for the United States national team at the 2000 Summer Olympics. Are we justified in saying that \"sarah played voleyball since she was a kid\"? Yes, no, or maybe? Maybe\n###\nNate Albert (born 1970) is an American music executive, songwriter, producer and guitar player. He is currently the Executive Vice President of A&R at Capitol Records a division of Universal Music Group. He was formerly Senior Vice President of A&R at Republic Records, where he worked with such artists as The Weeknd, Florence & the Machine, Phantogram and the Lonely Island. Are we justified in saying that \"Nate Albert was born within the last 100 years.\"? Yes, no, or maybe?", "doc_id": 900, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36555, 38879, 36131, 42107], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. That now resides at the official Evel Knievel Museum with Harley Davidson. Are we justified in saying that \"\nLathan McKay is an American curator, producer, actor, writer, and entrepreneur. A former professional skateboarder, he has assembled the largest collection of Evel Knievel memorabilia in the world. He also loves cats.\"? Yes, no, or maybe? Maybe\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries. Are we justified in saying that \"Agent: \nPaysonia stonensis (syn. 
\"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is strange looking and mysterious.\"? Yes, no, or maybe? Maybe\n###\nLamarck Island is a rocky island 250 m long, lying 300 m east of Petrel Island and 300 m north-east of Rostand Island in the G\u00e9ologie Archipelago, off the Ad\u00e9lie Coast of Antarctica. It was charted in 1951 by the French Antarctic Expedition and named by them after Jean-Baptiste Lamarck, the French naturalist. Are we justified in saying that \"Jean-Baptiste Lamarck was not very proud to have the island named after him.\"? Yes, no, or maybe? Maybe\n###\nIn electromagnetism, charge density is a measure of electric charge is the amount of electric charge per unit length, surface area, or volume, called the linear, surface, or volume charge density, respectively. The respective SI units are C\u22c5m, C\u22c5m or C\u22c5m. Are we justified in saying that \"In electromagnetism can be measure in length.\"? Yes, no, or maybe? Yes\n###\nRoger Heman (March 28, 1932 \u2013 November 13, 1989) was an American sound engineer. He won an Academy Award for Best Sound and was nominated for another one in the same category. His father was also a sound engineer and also won an Academy Award, for Best Effects, Special Effects for \"Crash Dive\". Are we justified in saying that \"Roger Heman won an Academy Award on March 28, 1959.\"? Yes, no, or maybe?", "doc_id": 345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36990, 23350, 34213, 30104], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The discography of Death, a metal band, consists of seven studio albums and four live albums. Death was an American metal band founded in 1983. The band's founder, Chuck Schuldiner, is considered \"a pioneering force in death metal and grindcore\". The band ceased to exist after Schuldiner died of brain cancer in 2001, though it remains an enduring metal brand. Are we justified in saying that \"The discography of Death, a metal band, consists of 11 albums in total to date.\"? Yes, no, or maybe? No\n###\nThe Timber Mountain Log Ride is a log flume water ride at Knott's Berry Farm in Buena Park, California, United States. The ride is one of the oldest log flumes in the United States and is the most popular ride at Knott's Berry Farm. The ride is one of the few log flumes that is themed in the world. Are we justified in saying that \"The ride was built in 1902\"? Yes, no, or maybe? Maybe\n###\nNashville West was a short-lived American country rock quartet that was briefly together in the late 1960s. The group comprised multi-instrumentalist Gene Parsons, guitarist Clarence White, singer-guitarist-fiddler Gib Guilbeau and bassist Wayne Moore. Parsons and White left the band to join The Byrds while Guilbeau and Parsons later joined the Flying Burrito Brothers. Are we justified in saying that \"They were a popular quartet\"? Yes, no, or maybe? 
Maybe\n###\nHellgate (originally titled Shadows) is a 2011 American-Thai supernatural thriller directed and written by John Penney, starring William Hurt and Cary Elwes. Elwes plays the sole survivor of a car crash who, upon seeing ghosts, seeks help from a spiritual guru (Hurt). Are we justified in saying that \"Hellgate was released in American before Thailand\"? Yes, no, or maybe? Maybe\n###\nThe Bavarian Mountain Hound (German = \"Bayerischer Gebirgsschwei\u00dfhund\") is a breed of dog from Germany. As a scent hound, it has been used in Germany since the early 20th century to trail wounded game. It is a cross between the Bavarian Hound and the Hanover Hound. Are we justified in saying that \"The Bavarian Mountain Hound is typically a cross between the Bavarian Hound and Hanover Hound.\"? Yes, no, or maybe?", "doc_id": 688, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5174, 10539, 33686, 12149], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Wayne Coles-Janess is an Australian producer, writer and director of drama and documentary film and TV programs. Based in Melbourne, Australia, he has produced documentaries about frontier places in the country. He has also made some documentaries in several international locations, including during times of war. Are we justified in saying that \"He will no longer produce shows.\"? Yes, no, or maybe? Maybe\n###\nA surf break at Point Leo, on the Mornington Peninsula, one of the closest surf beaches to Melbourne in Victoria, Australia known as First Reef or more colloquially just \"The Reef\". Until the 1970s there was little or no resident surfing population in Point Leo, so the Reef was mainly surfed by the few transient waveriders who were exploring the many breaks to be found in Westernport Bay. Are we justified in saying that \"The Reef is a very small compared to other locations.\"? Yes, no, or maybe? Maybe\n###\nWenham Parva is a village and a civil parish in Suffolk, England. It covers the village of Little Wenham (whose ancient name it takes) and the hamlet of Wenham Grange. Located in Babergh district, it had a population of 20 in 2005, making it the joint-least populated parish in Suffolk alongside South Cove, Wangford and Wordwell. At the 2011 Census the population had increased to 185. Are we justified in saying that \"In 2011 Wenham Parva still had less than 200 people as a population.\"? Yes, no, or maybe? Yes\n###\nShameless Self-Promotion Is the Sloppy Meateaters' first studio album. The album contained the two original members of the band Josh Chambers (Sloppy Josh) and drummer Kevin Highfield (Sloppy Kevin). Although only two members of the band were recorded on the album the cover of the re-released album contained Travis Gerke who joined the band after the original release. Are we justified in saying that \"Sloppy Josh wanted to be a pianist before he joined the band\"? Yes, no, or maybe? 
Maybe\n###\nThe Cameroon Airlines Corporation, trading as Camair-Co, is an airline from Cameroon, serving as flag carrier of the country, a role which was previously filled by the now-defunct Cameroon Airlines. Camair-Co has its headquarters in the Immeuble La Rotonde in Douala, and operates out of Douala International Airport. Are we justified in saying that \"camair-co is a replacement of cameroon airlines\"? Yes, no, or maybe?", "doc_id": 773, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6458, 6508, 45456, 18527], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In economics, a Swan Diagram, also known as the Australian model (because it was originally published by Australian economist Trevor Swan in 1956 to model the Australian economy during the Great Depression), represents the situation of a country with a currency peg. Are we justified in saying that \"Trevor once had a pet swan\"? Yes, no, or maybe? Maybe\n###\nThe 2002 Indian vice-presidential election was held on 12 August 2002 to elect the newly-vacated post of Vice-President of India. Bhairon Singh Shekhawat defeated Sushil Kumar Shinde to become 11th Vice President of India. Incumbent VP Krishan Kant did not contest the election and died before the election occurred. Are we justified in saying that \"Krishan Kant served as Vice President of India before August 2002.\"? Yes, no, or maybe? Yes\n###\nEscape from Suburbia: Beyond the American Dream is a 2007 Canadian documentary film written and directed by Gregory Greene, as a sequel to Greene's film \"The End of Suburbia\", and set to address what is termed \"the upcoming energy crisis\". Through interviews with individuals, Gregory Greene outlines potential solutions to the coming energy crisis. Are we justified in saying that \"Greene didn't make any films before 2007. \"? Yes, no, or maybe? No\n###\nFrank Randolph Cady (September 8, 1915\u00a0\u2013 June 8, 2012) was an American actor best known for his recurring and popular role as storekeeper Sam Drucker in three American television series during the 1960s\u00a0\u2013 \"Petticoat Junction\", \"Green Acres\", and \"The Beverly Hillbillies\"\u00a0\u2013 and his earlier role as \"Doc Williams\" on \"The Adventures of Ozzie and Harriet\". Are we justified in saying that \"Cady was also in Dallas.\"? Yes, no, or maybe? Maybe\n###\nShoshana Elise Bean (born September 1, 1977) is an American stage actress, singer and songwriter known for her roles in Broadway musicals. She is best known for being the first replacement actress for the role of Elphaba on Broadway in the musical \"Wicked\". Are we justified in saying that \"Shoshana Elise Bean was born more than 3500 days ago.\"? Yes, no, or maybe?", "doc_id": 912, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36788, 4641, 6943, 44893], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Robert L. \"Rusty\" White (born July 1945 in Newton, Mississippi) is the founder of the \"Robb Report\", a magazine he created while studying art at the University of Mississippi in 1967. \"The Robb Report\" was originally a newsletter entitled \"Twentieth Century Confederates\". White sold \"The Robb Report\" in 1983. Are we justified in saying that \"Robert L. White's magazine started out as a newsletter.\"? Yes, no, or maybe? Yes\n###\nVixen! is a 1968 satiric softcore sexploitation film directed by American motion picture director Russ Meyer. It was the first film to be given an X rating for its sex scenes, and was a breakthrough success for Meyer. The film was developed from a script by Meyer and Anthony James Ryan, and starred Erica Gavin. Are we justified in saying that \"Vixen! was a sexploitation film made in 1968\"? Yes, no, or maybe? Yes\n###\n\"The Inbetweeners\" is a BAFTA Award-winning British sitcom created by Damon Beesley and Iain Morris, and broadcast on E4. The series follows the lives of four sixth form students \u2013 Will McKenzie (Simon Bird), Simon Cooper (Joe Thomas), Jay Cartwright (James Buckley) and Neil Sutherland (Blake Harrison). The series is narrated by Will, who acts as the programme's lead character. Are we justified in saying that \"bafta is an award \"? Yes, no, or maybe? Yes\n###\nJoe Fryer is an American journalist and storyteller working for NBC News as a west coast correspondent based at the NBC News West Coast Bureau in Universal City, California. Fryer joined NBC News in 2013 as a part-time correspondent and officially joined NBC News as a full-time correspondent on October 21, 2013. Are we justified in saying that \"Joe Fryer joined NBC News 3 years ago.\"? Yes, no, or maybe? No\n###\n\"Look at Me (When I Rock Wichoo)\" is a song by American indie rock band Black Kids, taken from their debut album \"Partie Traumatic\". It was released in the UK by Almost Gold Recordings on September 8, 2008 and debuted on the Top 200 UK Singles Chart at number 175. Are we justified in saying that \"The song was released in America in September 2008\"? Yes, no, or maybe?", "doc_id": 1, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24809, 12247, 2624, 44837], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The High Rock Canyon Wilderness is a U S Wilderness Area in Nevada under the Bureau of Land Management. It is located on the southwest side of High Rock Canyon and north of the Little High Rock Canyon Wilderness. It does not include the 4x4 trail in High Rock Canyon. Are we justified in saying that \"The High Rock Canyon Wilderness is government property.\"? Yes, no, or maybe? 
Yes\n###\nJoel Rueben Madden (born Joel Rueben Combs; March 11, 1979) is the lead vocalist for the American pop punk band Good Charlotte, as well as a record producer, actor, DJ, and UNICEF Goodwill Ambassador. He is also part of the pop rock collaboration The Madden Brothers with his twin brother Benji Madden. Are we justified in saying that \"Joel Madden is a musician known for rap music\"? Yes, no, or maybe? No\n###\nSpy Corps is a spy film for Christian families that was written and directed by J David Baker. It stars Sarah Beth Hill as a fearful high school teenager, and Adam Hale as a secret member of the Reserve Spy Training Corps, a training program for high school students who want to pursue a career as a spy. Are we justified in saying that \"Spy Corps was a film about religion and spies\"? Yes, no, or maybe? Yes\n###\nJosef Jan\u00ed\u010dek (born 28 December 1947 in Prague, Czechoslovakia, now Czech Republic) is a Czech rock keyboardist, singer, accordion and guitar player. He was a former guitarist of The Primitives Group; from 1969 he played with The Plastic People of the Universe. He was also a member of Milan Hlavsa's band called \"P\u016flnoc\". Since 1990, he is a member of The Velvet Underground Revival Band. Are we justified in saying that \"Josef Jan\u00ed\u010dek has an U.\"? Yes, no, or maybe? No\n###\nHerv\u00e9 Le Tellier (born 21 April 1957) is a French writer and linguist, and a member of the international literary group Oulipo (Ouvroir de Litt\u00e9rature Potentielle, which translates roughly as \"workshop of potential literature\"). Other notable members have included Raymond Queneau, Georges Perec, Italo Calvino, Jacques Roubaud, Jean Lescure and Harry Mathews. Are we justified in saying that \"George Washington was a member of Oulipo.\"? Yes, no, or maybe?", "doc_id": 419, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33381, 35993, 40278, 12471], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Haliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015. Are we justified in saying that \"Haliru Dantoro Kitoro III is a strange name.\"? Yes, no, or maybe? Maybe\n###\nThe 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Kak\u00e1. This was the first year in which players from clubs outside of the UEFA federation were eligible for nomination; this change also led to an increase in the voting pool to include journalists from outside UEFA countries. Are we justified in saying that \"The 2007 Ballon d'Or, given to the best football player in the world as judged by an international panel of sports journalists, was awarded to Sterling.\"? Yes, no, or maybe? No\n###\nB\u00ebor the Old is a fictional character in J.R.R. 
Tolkien's Middle-earth legendarium. He appears in \"The Silmarillion\" as the leader of the First House of the Edain in the First Age, which was called the \"Folk of B\u00ebor\" after him. He was the father of Baran and Belen and ancestor of Beren Camlost. Are we justified in saying that \"B\u00ebor the Old is a fictional character in J.R.R. was the father of Baran and Belen and Beren Camlost.\"? Yes, no, or maybe? No\n###\nEngine is the second album by American Music Club. It was jointly released by Frontier and Grifter in the US and by Zippo in the UK and Europe in 1987. The 1998 Warner Bros. Records reissue added three additional tracks from the same period. The artwork for the Zippo UK release features an incorrect track listing, putting the songs in the wrong order. Are we justified in saying that \"American Music Club consisted of five members\"? Yes, no, or maybe? Maybe\n###\nSeverin Bijeli\u0107 (10 February 1921 \u2013 28 July 1972) was a Serbian actor. He appeared in 77 films and television shows between 1949 and 1972. He starred in the 1967 film \"The Rats Woke Up\", which won the Silver Bear for Best Director at the 17th Berlin International Film Festival. Are we justified in saying that \"Severin Bijeli\u0107 is his stage name.\"? Yes, no, or maybe?", "doc_id": 645, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34274, 34944, 19276, 39355], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Matthew Mansfield (born 24 June 1969) is a former Australian rules footballer who played for the Footscray (now Western Bulldogs) Football Club in the Australian Football League. Originally recruited from the Glenorchy Football Club, Mansfield made his debut in the 1991 AFL season and played 32 games and kicked 5 goals until the 1993 AFL season. Mansfield played in 3 finals in the 1992 AFL season. Are we justified in saying that \"Matthew Mansfield was born within the last 9876 days.\"? Yes, no, or maybe? No\n###\nCentral Mountain Air Ltd. is a Canadian regional airline based in Smithers, British Columbia. It operates scheduled and charter services and transborder services. Its main base is Smithers Airport, with other bases at Calgary International Airport, Vancouver International Airport and Prince George Airport. Are we justified in saying that \"Smithers is the capital of British Columbia.\"? Yes, no, or maybe? Maybe\n###\nI Am That Change is a 2014, Telugu short film directed by Sukumar and produced by actor Allu Arjun on Geetha Arts. Apart from Allu Arjun, the short film features an ensemble cast of Tanisshq Reddy, Vikram Chaitanya, Surya Ashrith, Trisha, Sri Varshini, Bharath Reddy and Sathish. Sai Karthik is the music director and Amol Rathod is the cinematographer while Prawin Pudi is the editor. Are we justified in saying that \"Prawin Pudi knows what Allu Arjun looks like\"? Yes, no, or maybe? Yes\n###\nThe Overwatch World Cup 2017 is an \"Overwatch\" eSports tournament, organized by Blizzard Entertainment, the game's developer. 
It features 32 represented nations from around the world, with the final tournament taking place at the Anaheim Convention Center from November 3-4, 2017. Are we justified in saying that \"In 2017, the final tournament in the 2017 Overwatch World Cup will be held during the summer.\"? Yes, no, or maybe? No\n###\nGloria Stavers (October 3, 1927 \u2013 April 1, 1983) was the editor in chief of \"16 Magazine\". Her personality gave this teen celebrity magazine its stamp for many years. Stavers is credited with being one of the first women rock and roll journalists, but male editors, detractors and those who scoffed at teen or celebrity magazines sometimes called her \"Mother Superior of the Inferior\". Are we justified in saying that \"If one adds the number \"1\" to the number \"14\", one will arrive at the same number that's in the title of the magazine the human subject of this context was editor in chief of.\"? Yes, no, or maybe?", "doc_id": 307, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41606, 21249, 1484, 13739], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"Rudbeckia hirta is a weed.\"? Yes, no, or maybe? No\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"St Clement's has a population that has shrunk each year for the past 5\"? Yes, no, or maybe? Maybe\n###\nAnne Frank: The Diary of a Young Girl is an original radio play by author Meyer Levin (1905\u20131981). It was adapted from Levin\u2019s original stage dramatization of the same name, adapted from \"The Diary of a Young Girl\", Anne Frank's diary. It aired on CBS on September 18, 1952, the eve of Rosh Hashanah, to critical acclaim, and again in November 1952. Are we justified in saying that \"People have been watching Anne Frank's movie since they were children.\"? Yes, no, or maybe? Maybe\n###\nChris McKendry (born Christine McKendry February 18, 1968) is a journalist for ESPN, a role she has served since 1996. She was co-anchor of the 11-1pm ET weekday block of live ESPN \"SportsCenter\" shows, alongside Jay Crawford. As of April 1, 2016, she serves as full-time on-site host for ESPN tennis coverage of the Australian Open, French Open, Wimbledon and US Open. Are we justified in saying that \"Tennis is not even a sport that Chris McKendry likes.\"? Yes, no, or maybe? Maybe\n###\nAshcroft is a historic home located at Geneva in Ontario County, New York. 
It is a 2\u00a0\u2044 -story brick home with a high pitched slate roof with projecting eaves. It is a large Gothic Revival style country house set deep in the midst of once carefully landscaped grounds. The house and property were designed by Calvert Vaux in 1862. Are we justified in saying that \"Ashcroft is orange.\"? Yes, no, or maybe?", "doc_id": 734, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13410, 36766, 19469, 9018], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Marco Masini (born September 18, 1964 in Florence), is an Italian singer-songwriter and musician. . One of his greatest virtues is his voice due to his vocal range, which reaches difficult musical notes, according to experts . . Accompanied by guitarist Riccardo Cherubini, . Are we justified in saying that \"Marco Masini lives in Florence\"? Yes, no, or maybe? Maybe\n###\nThe office of the Governor of Abia State is an elected position. The governor of Abia State is the chief executive of the state and its executive branch. Eight different people have served as governor of Abia State since the state was created on August 27, 1991. The current governor is Okezie Ikpeazu of the People's Democratic Party, in office since May 29, 2015. Are we justified in saying that \"7 different people have served as governor of Abia State since the state was created on August 27, 1991. \"? Yes, no, or maybe? No\n###\nSidney is a town in Kennebec County, Maine, United States. The population was 4,208 at the 2010 census. Sidney was incorporated as a town on January 30, 1792. The town was named for Sir Philip Sidney, an English author. Sidney is included in the Augusta, Maine micropolitan New England City and Town Area. Since 1937, the town is the home of the New England Music Camp. Are we justified in saying that \"Sidney is a fishing destination.\"? Yes, no, or maybe? Maybe\n###\nStillwater Cove Regional Park is a regional park north of Jenner, California, U.S.A. that is maintained by the Sonoma County Regional Parks Department. It is located near the mouth of Stockhoff Creek. Access is by means of State Route 1. It was one of the filming locations for 20th Century Fox's 1947 fantasy film, \"The Ghost and Mrs. Muir\". Are we justified in saying that \"Stillwater Cove Regional Park is situated by water.\"? Yes, no, or maybe? Yes\n###\nThe Big Cube is a 1969 American thriller film directed by Tito Davison and starring Lana Turner, Karin Mossberg, George Chakiris, Daniel O'Herlihy and Richard Egan; it was one of Lana Turner's last movies. It is notable for its aggressive portrayal of LSD use and the 1960s youth counterculture as vicious evils. Are we justified in saying that \"The Big Cube is a film from the southern hemisphere\"? Yes, no, or maybe?", "doc_id": 94, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13899, 29032, 1454, 32729], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o was born in France.\"? Yes, no, or maybe? Yes\n###\nRBG Resources was a British public-limited firm based in London that was allegedly involved in a serious fraud worth close to \u00a3300 million (US$600 million). RBG Resources made $1.1 billion in sales in 2000. It was an affiliate of the United States based Allied Deals Inc., which was also involved in the fraud, and resulted in 14 people convicted or pleading guilty to related crimes. Are we justified in saying that \"RBG Resources is not an affiliate of a US based firm.\"? Yes, no, or maybe? No\n###\nBrew Masters is a television series that was run weekly on Discovery Channel starting on Sunday, November 21, 2010. The show focused on Sam Calagione, the founder and head of Dogfish Head Brewery in Milton, Delaware, and his staff as they searched the world for new, ancient, and imaginative inspirations for beers. Are we justified in saying that \"Brew Masters was aired once a month.\"? Yes, no, or maybe? No\n###\nUSS \"Chicago\" (CA-136) was a \"Baltimore\"-class heavy cruiser laid down on 28 July 1943 at Philadelphia, Pennsylvania, US, by the Philadelphia Navy Yard. Launched on 20 August 1944, she was sponsored by Mrs. Edward J. Kelly, wife of the Mayor of Chicago, Illinois, and commissioned at the Philadelphia Navy Yard on 10 January 1945, Captain Richard R. Hartung, USN, in command. Are we justified in saying that \"The wife of the Mayor of Chicago sponsored a heavy cruiser in 1943\"? Yes, no, or maybe? Yes\n###\nRobert Newton \"Bob\" Ford (January 31, 1862 \u2013 June 8, 1892) was an American outlaw best known for killing his gang leader Jesse James in April 1882, to collect a reward. For about a year, Ford and his older brother Charles performed paid re-enactments of the killing at publicity events. Later he drifted around the West, operating saloons and dance halls. Are we justified in saying that \"Ford was only twenty when he killed James. \"? Yes, no, or maybe?", "doc_id": 76, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9203, 31632, 28836, 3656], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Jon L. Luther is an American foodservice industry executive. He was the chairman and chief executive officer of Dunkin' Brands. 
Luther is the Chairman of the Board of the Culinary Institute of America and Arby's Restaurant Group, and a director at Six Flags Entertainment Corporation, Wingstop Restaurants, and Tempur Sealy International. Are we justified in saying that \"Jon L. Luther's success in his industry can only be attributed to his hard work.\"? Yes, no, or maybe? Maybe\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter. Are we justified in saying that \"Colin Francis Weeber Isaacs was almost kicked out of the New Democratic party.\"? Yes, no, or maybe? Maybe\n###\nSebo Walker (born April 28, 1988) is a professional skateboarder and artist living in Venice, Los Angeles who is best known for winning the Red Bull Manny Mania Amateur U.S. Championships in 2011 and for living in his Chrysler Town and Country van in West LA from 2010\u20132015. Are we justified in saying that \"sebo walker lived five years in los angeles\"? Yes, no, or maybe? Yes\n###\nThe High Bridge Branch was a branch line of the Central Railroad of New Jersey (CNJ) that started in High Bridge, New Jersey at a connection with the CNJ main line and continued north to iron-ore mines in Morris County. The High Bridge Branch line followed the South Branch of the Raritan River for much of its duration. Are we justified in saying that \"The High Bridge Branch never was near a body of water.\"? Yes, no, or maybe? No\n###\nMichael Blodgett (September 26, 1939 \u2013 November 14, 2007) was an American actor, novelist, and screenwriter. Of his many film and television appearances he is best known for his performance as gigolo Lance Rocke in Russ Meyer's 1970 cult classic \"Beyond the Valley of the Dolls\". He retired from acting in the late 1970s and began a writing career. Are we justified in saying that \"Blodgett acted and wrote at the same time\"? Yes, no, or maybe?", "doc_id": 313, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14444, 559, 38676, 22091], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Pursuit of Happyness is a 2006 American biographical drama film based on entrepreneur Chris Gardner's nearly one-year struggle being homeless. Directed by Gabriele Muccino, the film features Will Smith as Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. Are we justified in saying that \"Director Gabriele Muccino is also a producer.\"? Yes, no, or maybe? Maybe\n###\nArthur William Feuerstein (born December 20, 1935) is an American chess player and winner of the first U.S. Armed Forces Chess Championship. According to the U.S. Chess Federation, Feuerstein is the shared 53rd ranked chess player over 65, regardless of country, residence or federation. Are we justified in saying that \"Arthur William Feuerstein played chess for many years\"? 
Yes, no, or maybe? No\n###\nIleana Carusio, best known with the stage names of Ramba, Mal\u00f9 or Mal\u00f9 Ramba (born 1967) is a retired Italian pornographic actress whose career spanned over 6 years. At first her name was \"Mal\u00f9\", but the most fitting Ramba was launched by a journalist as he was referring to her aggressive look with guns and magazines. She has appeared in many films. Are we justified in saying that \"Ileana Carusio belongs to the Generation X.\"? Yes, no, or maybe? Maybe\n###\n\"Boat on the River\" is a 1979 song by Styx, from their album \"Cornerstone\". It was released as a single in 1980, but did not chart in the band's native United States. However, it was popular in several German-speaking countries, becoming a top-five hit on the German, Austrian and Swiss charts (reaching number one on the latter.) Are we justified in saying that \"Styx released a single in 1980 that was popular in Europe\"? Yes, no, or maybe? Yes\n###\nThe Basketbowl was a college basketball game between Michigan State University and the University of Kentucky held on December 13, 2003 at Ford Field, a domed American football stadium in Detroit, Michigan. Kentucky won the game 79\u201374, never trailing throughout the contest. Are we justified in saying that \"Kentucky football defeated Michigan State University in 2003\"? Yes, no, or maybe?", "doc_id": 304, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39188, 44782, 39382, 32213], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Homebrew is a free and open-source software package management system that simplifies the installation of software on Apple's macOS operating system. Originally written by Max Howell, the package manager has gained popularity in the Ruby on Rails community and earned praise for its extensibility. Homebrew has been recommended for its ease of use as well as its integration into the command line. Are we justified in saying that \"Homebrew is better than other management systems.\"? Yes, no, or maybe? Maybe\n###\nRichard Noel Marshall Armitage (12 August 1928\u00a0\u2013 17 November 1986) was a talent agent, active in England in the 1950s\u20131980s. Among his clients were Rowan Atkinson, John Cleese, David Frost and Stephen Fry. Producer John Lloyd described him as \"the most powerful agent in the country at that time [the late 1970s]\". Are we justified in saying that \"Marshall passed away in the last decade the century.\"? Yes, no, or maybe? No\n###\nFoals are an English indie rock band from Oxford, England formed in 2005, consisting of lead vocalist and lead guitarist Yannis Philippakis, drummer and percussionist Jack Bevan, rhythm guitarist Jimmy Smith, bassist Walter Gervers, and keyboardist Edwin Congreave. Since the band's formation, their line-up has remained constant, except for the departure of former lead singer Andrew Mears. Are we justified in saying that \"It is hot outside.\"? Yes, no, or maybe? Maybe\n###\nVP-HL-1 was a Heavy Patrol Squadron (Landplane) of the U.S. Navy. 
The squadron was established as Bombing Squadron 116 (VB-116) on 1 December 1943, redesignated Patrol Bombing Squadron 116 (VPB-116) on 1 October 1944, redesignated Patrol Squadron 116 (VP-116) on 15 May 1946, redesignated Heavy Patrol Squadron (Landplane) 1 (VP-HL-1) on 15 November 1946 and disestablished on 22 May 1947. Are we justified in saying that \"The US Navy oversaw three redesignations of the initially branded Bombing Squadron.\"? Yes, no, or maybe? Yes\n###\nChristopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records. Are we justified in saying that \"Christopher Tafoya is currently 43 years old. \"? Yes, no, or maybe?", "doc_id": 435, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10666, 10016, 28892, 23280], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Vladislav Adolfovitch Rusanov (Russian: \u0412\u043b\u0430\u0434\u0438\u0441\u043b\u0430\u0432 \u0410\u0434\u043e\u043b\u044c\u0444\u043e\u0432\u0438\u0447 \u0420\u0443\u0441\u0430\u043d\u043e\u0432 ) is a fantasy writer, candidate of technical sciences (1999). Writes in Russian language. Also is known for translations of fantasy and romantic poetry into Russian. Formerly a Ukrainian citizen he now identifies with the Donetsk People's Republic. Are we justified in saying that \"Rusanov's full nameis Vladislav Adolfovitch Rusanov. \"? Yes, no, or maybe? Yes\n###\nToolbox Murders is a 2004 horror film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch. It is a remake of the 1978 film of the same name and was produced by the same people behind the original. The film centralizes on the occupants of an apartment who are stalked and murdered by a masked killer. Are we justified in saying that \"Tooldot Murders is a 2004 horror film directed by Tobe Hooper, and written by Jace Anderson and Adam Gierasch.\"? Yes, no, or maybe? No\n###\nExergonix Inc, is an energy storage company based in Kansas City, Missouri. It was founded in 2010, after spinning out of Kokam America, Inc., which was acquired by Dow Chemical Company in 2009. Exergonix develops, manufactures and deploys community-level and grid energy storage to supply peak-shaving, demand-management, and smart grid capabilities to the grid and micro-grid. Are we justified in saying that \"Exergonix Inc has been around for 100 years.\"? Yes, no, or maybe? No\n###\nWallace Michael Ross (19 September 1920 \u2013 20 January 2010) was the founder of the Derby Bach Choir. He was also the Master of Music at Derby Cathedral, assistant organist at several great English Cathedrals, teacher of languages and music at several schools including Sturgess School in Derby. He also founded the Derby Cathedral Brass Ensemble and the Derby Sinfonia. 
Are we justified in saying that \"He hated being a teacher\"? Yes, no, or maybe? Maybe\n###\nThe Texas A&M Aggie baseball team represents Texas A&M University in NCAA Division I college baseball. The Aggies have competed in the Southeastern Conference since 2013. The Aggies play home games at Olsen Field at Blue Bell Park. The team is led by head coach Rob Childress. Are we justified in saying that \"The Aggies are not the only team in the Southeastern Conference. \"? Yes, no, or maybe?", "doc_id": 875, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28436, 13561, 19967, 35155], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Legendary Majik Mijits is an album that was recorded by Steve Marriott and Ronnie Lane when they reformed under the name of \"Majik Mijits\" in 1981 and gave a one-off concert at the Bridgehouse pub in East London. The lineup included Jim Leverton, Mick Green, Mick Weaver, Dave Hynes and Sam Brown. Are we justified in saying that \"Over 10,000 people attended the concert put on by Steve Marriot and Ronnie Lane at the Bridgehouse Pub in East London.\"? Yes, no, or maybe? Maybe\n###\nJango is a crime-comedy series produced in 1961 by Associated Rediffusion for British television. It starred Robert Urquhart in the lead role of Jango Smith, with Moira Redmond as Dee Smith, his wife. The show also featured performances by Peter Sallis and Brian Wilde. Are we justified in saying that \"Brian Wilfe was in a crime-comedy series.\"? Yes, no, or maybe? Yes\n###\nBrandon Tyler McManus (born July 25, 1991) is an American football placekicker for the Denver Broncos of the National Football League (NFL). He was a member of their Super Bowl 50 championship team, beating the Carolina Panthers. He played college football at Temple and was signed by the Indianapolis Colts as an undrafted free agent in 2013. Are we justified in saying that \"Brandon Tyler McManus is over 20 years old\"? Yes, no, or maybe? Yes\n###\nConvoy PQ-4 was the fifth of the Arctic Convoys of World War II by which the Western Allies supplied material aid to the Soviet Union in its fight with Nazi Germany. The Convoy sailed from Hvalfjord, Iceland on 17 November 1941 and arrived at Archangelsk on 28 November 1941. Are we justified in saying that \"Convoy PQ-4 used ships to transport supplies.\"? Yes, no, or maybe? Yes\n###\nAfter Dark is a brand of Indian whisky, manufactured by Radico Khaitan. The whisky was test marketed in 2010, and rolled out nationwide in India by September 2011. It is a 100% grain-based whisky manufactured at Radico's Rampur distillery. It is available in 750ml, 375ml and 180ml bottles. The brand's tagline is \"One Life, Many Passions...Why wait\". Are we justified in saying that \"The Whisky brand was test marketed in India before 2010 by Radico Kahitan in 330ml bottles.\"? 
Yes, no, or maybe?", "doc_id": 337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18378, 29840, 7716, 18907], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Paul Albert Raymond Barlatier de Mas (13 October 1802, Saint-Omer - 24 January 1874, Paris), was a French Baron, Chief of the Paris-Lyon-M\u00e9diterran\u00e9e Railway Company, and mayor of Dammarie-les-Lys. Baron Barlatier de Mas was the grandson of Captain Paul Fran\u00e7ois Ignace de Barlatier de Mas. Are we justified in saying that \"Paul Albert Raymond Barlatier de Mas was a baron\"? Yes, no, or maybe? Yes\n###\nKidsty Pike is a fell in the English Lake District, standing to the west of Haweswater Reservoir. It is a subsidiary top of Rampsgill Head, but has long achieved the status of a separate fell, thanks to its classic peaked profile. Wainwright followed this convention in his \"Pictorial Guide to the Lakeland Fells\". Are we justified in saying that \"Kidsty Pike has been seen by hank.\"? Yes, no, or maybe? Maybe\n###\nTrue as a Turtle is a 1957 British comedy film directed by Wendy Toye and starring John Gregson, Cecil Parker, June Thorburn and Keith Michell. In the film, a young couple embark on a voyage on a ketch named \"Turtle\". John Coates wrote the screenplay, based on his novel of the same name. Are we justified in saying that \"Wendy went on to write other films\"? Yes, no, or maybe? Maybe\n###\nMary Isobel Downer, Lady Downer (13 December 1924 \u2013 14 October 2014) was a prominent South Australian patron, wife of federal MP and high commissioner Sir Alexander \"Alick\" Downer, and mother of Liberal Party leader, Australian Foreign Minister and high commissioner Alexander Downer. Are we justified in saying that \"Mary Isobel Downer, Lady Downer never loved her husband federal MP and high commissioner Sir Alexander \"Alick\" Downer\"? Yes, no, or maybe? Maybe\n###\n\"Aven Romale\" (Come in Gypsies), is a song by the Czech group Gipsy.cz that was the Czech entry at the 2009 Eurovision Song Contest held in Moscow, Russia. It scored zero points at the Eurovision Song Contest semi-final, thereby failing to qualify for the final. Are we justified in saying that \"Aven Romel was not popular in the eurovision song contest.\"? Yes, no, or maybe?", "doc_id": 565, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [486, 7265, 3019, 10085], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Love's Labour's Lost is a 2000 adaptation of the comic play of the same name by William Shakespeare, directed by and starring Kenneth Branagh. 
It was the first feature film to be made of this lesser-known comedy. Branagh's fourth film of a Shakespeare play (he did not direct the 1995 \"Othello\", although he did play Iago), \"Love's Labour's Lost\" was a box-office and critical disappointment. Are we justified in saying that \"The feature film Love's Labour's Lost was released more than 5 years ago, but within the last 50 years.\"? Yes, no, or maybe? Yes\n###\nThe 2015 City of Onkaparinga ATP Challenger was a professional tennis tournament played on hard courts. It was the first edition of the tournament which was part of the 2015 ATP Challenger Tour. It took place in Happy Valley, Australia between 3\u201311 January 2015. Are we justified in saying that \"The 2015 City of Onkaparinga ATP Challenger had 5 editions of the tournament.\"? Yes, no, or maybe? Maybe\n###\nFasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings. Are we justified in saying that \"Icre de fasole is made with beans, vegetable oil and onions.\"? Yes, no, or maybe? Yes\n###\nThe 24th Air Division is an inactive United States Air Force intermediate echelon command and control organization. It was last assigned to First Air Force, Tactical Air Command (ADTAC). It was inactivated on 30 September 1990 at Griffiss Air Force Base, New York. Are we justified in saying that \"The 24th Air Division stationed all over the world before becoming inactive \"? Yes, no, or maybe? Maybe\n###\nTinker Field was an outdoor-baseball stadium in Orlando, Florida, United States. It was named after baseball Hall of Famer, Joe Tinker. Tinker Field was located in the West Lakes neighborhoods of Downtown Orlando, adjacent to the Camping World Stadium and one mile west of the Amway Center. In April, 2015 the City of Orlando tore down the grandstands and removed all other extant buildings. Are we justified in saying that \"the Camping World Stadium the Amway Center and the tinker field stadium are very close to each other \"? Yes, no, or maybe?", "doc_id": 532, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [8755, 8471, 4671, 41056], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis. Are we justified in saying that \"The Copenhagen Consensus Center was founded before 1990.\"? Yes, no, or maybe? Maybe\n###\nAmanda Knox is a 2016 American documentary film about Amanda Knox, twice convicted and later acquitted of the 2007 murder of Meredith Kercher, directed by Rod Blackhurst and Brian McGinn. 
It premiered at the Toronto International Film Festival on September 10, 2016 and on Netflix on September 30, 2016. Are we justified in saying that \"Amanda Knox and the documentary both share the same name\"? Yes, no, or maybe? Yes\n###\nMore of Tom Lehrer was the second studio album recorded by musical satirist Tom Lehrer. The LP contains the same songs (in the same sequence) as the live album \"An Evening Wasted with Tom Lehrer\", which was recorded and released earlier in the same year. The album was recorded and mixed in a single three-hour session at the RCA Studios in New York on July 8, 1959. Are we justified in saying that \"\"An Evening Wasted with Tom Lehrer\" was recorded and mixed in a single three-hour session at the RCA Studios in New York on July 8, 1979.\"? Yes, no, or maybe? No\n###\nThe 1977 Los Angeles Dodgers season had Tommy Lasorda replace longtime manager Walter Alston as Manager of the team. The Dodgers won the National League West by 10 games and defeated the Philadelphia Phillies in four games in the NLCS, then lost to the New York Yankees in the World Series. Are we justified in saying that \"The New York Yankees beat the Phillies in the 1977 world series. \"? Yes, no, or maybe? No\n###\nMurray, Utah was declared a city July 3, 1902, instituting a mayor-council form of government. The mayor of Murray was originally partisan, but switched to a non-partisan position. The term of mayor was originally two years, but amended to a four-year term in the 1940s in accordance with state law. The following is a list of Mayors of Murray, Utah. Are we justified in saying that \"Murray's mayors have always been lawyers.\"? Yes, no, or maybe?", "doc_id": 563, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [12836, 31563, 7014, 17180], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In guitar music, especially electric guitar, a power chord (also fifth chord) is a colloquial name for a chord that consists of the root note and the fifth. Power chords are commonly played on amplified guitars, especially on electric guitar with distortion. Power chords are a key element of many styles of rock and especially in heavy metal, and punk rock. Are we justified in saying that \"Carlos Santana is well known for his thrilling power chords.\"? Yes, no, or maybe? Maybe\n###\nCircus Palestine (Hebrew: \u05e7\u05e8\u05e7\u05e1 \u05e4\u05dc\u05e9\u05ea\u05d9\u05e0\u05d4\u200e \u200e , translit.\u00a0Kirkas Palestina) is a 1998 Israeli political satire film directed by Eyal Halfon, which was nominated for seven Israeli Film Academy Awards, winning five. The film was selected as the Israeli entry for the Best Foreign Language Film at the 71st Academy Awards, but was not accepted as a nominee. Are we justified in saying that \"Critics felt the movie deserved the two Israeli Film Academy Awards it did not win.\"? Yes, no, or maybe? Maybe\n###\nIan Drew is Entertainment Director for American celebrity magazine, Us Weekly. 
He speaks about celebrities, music and fashion on television shows including CNN, Good Morning America, The Early Show, MSNBC, and Fox News. He interviewed Janet Jackson for one of Us Weekly's best-selling issues. Are we justified in saying that \"Ian Drew is Entertainment Director for American celebrity magazine.\"? Yes, no, or maybe? Yes\n###\nI Am That Change is a 2014, Telugu short film directed by Sukumar and produced by actor Allu Arjun on Geetha Arts. Apart from Allu Arjun, the short film features an ensemble cast of Tanisshq Reddy, Vikram Chaitanya, Surya Ashrith, Trisha, Sri Varshini, Bharath Reddy and Sathish. Sai Karthik is the music director and Amol Rathod is the cinematographer while Prawin Pudi is the editor. Are we justified in saying that \"Prawin Pudi also edits other short films bye Geetha Arts.\"? Yes, no, or maybe? Maybe\n###\nAna B\u00e1rbara is the debut album by Mexican singer Ana B\u00e1rbara, released in 1994. She was nominated for a Premio Lo Nuestro Award in two Regional Mexican categories, including Best New Artist. She won her first Premio Furia Musical Award for Best New Artist. Are we justified in saying that \"Ana B\u00e1rbara has 1 premio furia musical award\"? Yes, no, or maybe?", "doc_id": 123, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6188, 13047, 36877, 25929], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Rodrequis La'Vant Stephens (born June 14, 1966 in Atlanta, Georgia) is a former American football linebacker in the National Football League for the Seattle Seahawks and the Washington Redskins. He played college football for the Georgia Tech Yellow Jackets. Are we justified in saying that \"Rodrequis La'Vant Stephens was paid over 5 million\"? Yes, no, or maybe? Maybe\n###\nSuccess is a 1983 studio album originally released by American singing duo The Weather Girls. The album includes the group's biggest hit, \"It's Raining Men\", which peaked at #1 on the U.S. Dance chart, #46 on the U.S. Pop chart, & #34 on the U.S. R&B chart. Are we justified in saying that \"Success was an album made by singing duo The Wonder Girls\"? Yes, no, or maybe? No\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke. Are we justified in saying that \"\"The Candidate\" is the 4th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall.\"? Yes, no, or maybe? No\n###\nWhite Fang 2: Myth of the White Wolf is a 1994 American Northern adventure film directed by Ken Olin. A sequel to the 1991 \"White Fang\", it stars Scott Bairstow, Alfred Molina, and Geoffrey Lewis. Filming took place in Aspen, Colorado and Vancouver, British Columbia. Walt Disney Home Video released this movie on VHS October 19, 1994. 
Are we justified in saying that \"White Fang 2: Myth of the White Wolf is a exciting film\"? Yes, no, or maybe? Maybe\n###\n\"Duffle Bag Boy\" is a song by American hip hop duo Playaz Circle, released as the debut lead single from their debut album, \"Supply & Demand\" (2007). The song features a guest appearance from fellow American rapper Lil Wayne and was produced by M16 and Liam Kantwill. The song peaked in the Top 40 of the U.S. \"Billboard\" Hot 100, reaching number 15. Are we justified in saying that \"Duffle Bag Boy had their first Top 10 Billboard song in 2010\"? Yes, no, or maybe?", "doc_id": 950, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [11363, 34685, 17205, 416], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sound of Waves (\u6f6e\u9a12 , Shiosai ) is a 1954 novel by the Japanese author Yukio Mishima. It is a coming-of-age story of the protagonist Shinji and his romance with Hatsue, the beautiful daughter of the wealthy ship owner Terukichi. For this book Mishima was awarded the Shincho Prize from Shinchosha Publishing in 1954. It has been adapted for film five times. Are we justified in saying that \"\"The Sound of Waves\" received an award the same year it was published.\"? Yes, no, or maybe? Yes\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit. Are we justified in saying that \"There is a national park that is close to Bridge Mountain and managed by the United States.\"? Yes, no, or maybe? Maybe\n###\nSt. Mark's Coptic Orthodox Cathedral is a Coptic church located in the Abbassia District in Cairo, Egypt. The cathedral is the Seat of the Coptic Orthodox Pope. It was built during the time when Pope Cyril VI of Alexandria was Pope of the Coptic Orthodox Church, and was inaugurated by him in 1969. Are we justified in saying that \"St. Mark's Coptic Orthodox Cathedral was planned by Pope Cyril VI of Alexandria.\"? Yes, no, or maybe? Maybe\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart. Are we justified in saying that \"Migos has had at least one song in the Billboard top 100\"? Yes, no, or maybe? Yes\n###\nHe Was Cool (; lit. \"That Guy was Cool\") is a 2004 South Korean film based on the same-titled 2001 Internet novel written by Guiyeoni. The film was released in South Korean cinemas on July 23, 2004 and was the 35th most attended film of the year with 800,000 admissions. Are we justified in saying that \"He was cool was released in North Korea.\"? 
Yes, no, or maybe?", "doc_id": 385, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13379, 19697, 21056, 26088], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Big Ballet was a British documentary television programme produced by Rare Day and broadcast on Channel 4. The three-episode series was first broadcast on 6 February 2014. It followed Wayne Sleep and prima ballerina Monica Loughman as they worked with a troupe of amateur dancers to realise their dream of dancing Swan Lake. Are we justified in saying that \"Big Ballet was aired on Channel 4.\"? Yes, no, or maybe? Yes\n###\nTakeover/Cloud 9 is a British music publishing company. The organisation is a co-owned subsidiary of Takeover Entertainment Ltd and EMI Music Publishing. It was founded by English rapper Kwasi Danquah III (commonly known as Tinchy Stryder) and EMI Music Publishing\u2019s UK president and EMI European creative president, Guy Moot, as a publishing arm solely for Danquah's music in December 2008. Are we justified in saying that \"They own the publishing rights to Stryder's music in the USA.\"? Yes, no, or maybe? Maybe\n###\nLimnocharis flava (commonly known as yellow velvetleaf, sawah flower rush, sawah lettuce) is a species of aquatic flowering plant which is native to Mexico, Central America, South America, Cuba, Haiti and the Dominican Republic but widely naturalized in southern and southeastern Asia: India, Sri Lanka, Cambodia, Burma, Thailand, Vietnam, Indonesia, Malaysia and southern China (Guangdong, Yunnan). Are we justified in saying that \"Limnocharis grows on land. \"? Yes, no, or maybe? No\n###\nThe 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event. Are we justified in saying that \"The 1997 Porsche Tennis Grand Prix took place in 1999\"? Yes, no, or maybe? No\n###\nThe 2016\u201317 Danish Cup was the 63rd season of the Danish Cup competition. F.C. Copenhagen won the tournament, earning qualification into the second qualifying round of the 2017\u201318 UEFA Europa League. However, as F.C. Copenhagen also won the 2016\u201317 Danish Superliga, Br\u00f8ndby IF, the cup runners-up, is allotted that position in the 2017\u201318 UEFA Europa League. Are we justified in saying that \"F.C. Copenhagen won in 2017.\"? Yes, no, or maybe?", "doc_id": 896, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [1786, 25805, 42806, 23165], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Emmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\". Are we justified in saying that \"Emmanuel Frechette won the Canadian Screen Award for Best Art Direction or Production Design for War Witch (Rebelle) in 2017.\"? Yes, no, or maybe? No\n###\nMark Christopher Randall (born September 30, 1967) is an American former professional basketball player who played in four National Basketball Association (NBA) seasons for the Chicago Bulls, Minnesota Timberwolves, Detroit Pistons, and Denver Nuggets. Randall was selected by the Bulls in the first round (26th pick overall) of the 1991 NBA Draft and averaged 2.6 points per game for his career. Are we justified in saying that \"Mark Christopher Randall played basketball in high school.\"? Yes, no, or maybe? Maybe\n###\nThe Gaming Control Act was passed in 1992 to control the growth of the gaming industry and the introduction of casinos in Windsor and Niagara Falls, Ontario. The act was enforced by the Gaming Control Commission Ontario to ensure honesty, integrity, and financial responsibility to participants as well as preventing criminal activity such as lottery scams. Are we justified in saying that \"The Gaming Control Act ensured honesty\"? Yes, no, or maybe? Yes\n###\nIlse von Glatz (August 21, 1958 \u2013 May 2, 2014) was a Canadian actress who played an Advocate in the 1988 science fiction TV series \"War of the Worlds\". She also worked in \"The Mind of Simon Foster\" (episode of \"the 1985 version of The Twilight Zone\"). She also appeared in at least one episode of \"\" in 1989. Are we justified in saying that \"The show was science fiction\"? Yes, no, or maybe? Yes\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks. Are we justified in saying that \"Jack Bender had Evangeline Lilly act according to his interpretation of the script of \"Lost\"\"? Yes, no, or maybe?", "doc_id": 796, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40133, 2696, 16931, 31157], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Hooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\". 
Are we justified in saying that \"Hooked on a Feeling is a song by Swedish Rock.\"? Yes, no, or maybe? Yes\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician. Are we justified in saying that \"Thalia has at least four albums. \"? Yes, no, or maybe? Yes\n###\nCarmen Lebbos (Arabic: \u0643\u0627\u0631\u0645\u0646 \u0644\u0628\u0651\u0633\u200e \u200e ; born 1963) is a Lebanese actress who has been working in film, television and the theater since 1981. She has been in several television series and movies including Ziad Doueiri\u2019s \"West Beyrouth\" and Josef Fares\u2019s \"Zozo\". Are we justified in saying that \"Carmen Lebbos has only been in one television series.\"? Yes, no, or maybe? No\n###\nDiaspora studies is an academic field established in the late 20th century to study dispersed ethnic populations, which are often termed diaspora peoples. The usage of the term diaspora carries the connotation of forced resettlement, due to expulsion, coercion, slavery, racism, or war, especially nationalist conflicts. Are we justified in saying that \"Diaspora studies is usually taken by wealthy college students\"? Yes, no, or maybe? Maybe\n###\nSteve Koren is an Emmy Award winning writer/producer and screenwriter. Most notably he\u2019s written for \"Saturday Night Live\", \"Seinfeld\", and \"Veep\". He also wrote or co-wrote the movies \"Bruce Almighty\", \"Click\", \"A Night at the Roxbury\" and \"Superstar\". Are we justified in saying that \"The co-writer for Superstar also wrote for Seinfeld.\"? Yes, no, or maybe?", "doc_id": 640, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22036, 27488, 33394, 23875], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cari Elizabeth Roccaro (born July 18, 1994) is an American soccer defender from East Islip, New York. She currently plays for the United States under-20 women's national soccer team and helped the team win the 2012 FIFA Under-20 Women's World Cup held in Tokyo, Japan. She previously played for the New York Fury in the WPSL Elite. Are we justified in saying that \"Cari Elizabeth Roccaro is a man.\"? Yes, no, or maybe? No\n###\nNew Day is a 1949 book by Jamaican author V. S. Reid. It was Reid's first novel. \"New Day\" deals with the political history of Jamaica as told by a character named Campbell, who is a boy at the time of the Morant Bay Rebellion (in 1865) and an old man during its final chapters. It may have been the first novel to use Jamaican vernacular as its language of narration. Are we justified in saying that \"New Day was written by an African author.\"? Yes, no, or maybe? No\n###\nHaliru Dantoro Kitoro III (1938 \u2013 October 30, 2015) was a Nigerian traditional ruler and politician. 
Dantoro became Emir of the Borgu, a Nigerian traditional state located in Niger State, on February 26, 2002, following the overthrown of his predecessor, Alhaji Isiaku Musa Jikantoro. Dantoro served as Emir until his death on October 28, 2015. Are we justified in saying that \"Niger State is nice.\"? Yes, no, or maybe? Maybe\n###\nWeltenbrand is a darkwave band from Liechtenstein formed in 1995 by Oliver Falk. Ritchie Wenaweser and Simone Steiner joined for vocals. In the same year, the band secured a record deal with Witchhunt Records and subsequently released their first album, \"Das Rabenland\". a romantically inclined darkwave album that immediately found acceptance within the genre. Are we justified in saying that \"Weltenbrand was formed less than 5000 days ago.\"? Yes, no, or maybe? No\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer. Are we justified in saying that \"Gabriel Julio Fern\u00e1ndez Capello was born in a location south of Dallas, TX.\"? Yes, no, or maybe?", "doc_id": 976, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22509, 12023, 35265, 35463], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Conoclinium coelestinum, the blue mistflower, is a North American species of herbaceous perennial flowering plant in the sunflower family. It was formerly classified in the genus \"Eupatorium\", but phylogenetic analyses in the late 20th century research indicated that that genus should be split, and the species was reclassified in \"Conoclinium\". Are we justified in saying that \"The blue mistflower is from Texas.\"? Yes, no, or maybe? Maybe\n###\nTobias Svantesson (born April 1, 1963, in Malmo, Sweden), is a former professional tennis player from Sweden. He enjoyed most of his tennis success while playing doubles. During his career he won 2 doubles titles. He achieved a career-high doubles ranking of World No. 65 in 1991. His career high world ranking in singles was no 89. Are we justified in saying that \"Tobias Svantesson met Trump.\"? Yes, no, or maybe? Maybe\n###\nColin Francis Weeber Isaacs (born 1953) is a former politician in Ontario, Canada. He was a New Democratic Party member in the Legislative Assembly of Ontario representing the riding of Wentworth from 1979 to 1981. He works as an environmental consultant and journalist and publishes the Gallon Newsletter. Are we justified in saying that \"Colin Francis Weeber Isaacs represented the riding of Wentworth starting from 1979 and then ending on 1981\"? Yes, no, or maybe? Yes\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. 
It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines is a sports news organization\"? Yes, no, or maybe? No\n###\nMetal Gear Solid is an action-adventure stealth video game produced by Konami Computer Entertainment Japan and released for the PlayStation in 1998. The game was directed, produced, and co-written by series creator Hideo Kojima, and serves as a sequel to the MSX2 video games \"Metal Gear\" and \"\", which Kojima also wrote and directed. Are we justified in saying that \"Metal Gear Solid was the first in the Metal Gear series which contains ten games.\"? Yes, no, or maybe?", "doc_id": 784, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [17859, 24656, 37819, 8161], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler. Are we justified in saying that \"Dan Deacon was born after 1985\"? Yes, no, or maybe? No\n###\nThere Is a Man in Our House (Arabic: \u0631\u064e\u062c\u0650\u0644 \u0628\u064a\u062a\u0650\u0646\u0627 \u0641\u064a\u200e \u200e Fi baitina rajul) is a 1961 Egyptian drama, history, romance film directed by Henry Barakat, an Egyptian film director of Lebanese origin. The film is based on a novel by an Egyptian writer, Ihsan Abdel Quddous, and stars Omar Sharif. Are we justified in saying that \"Both of Barakat's parents are Lebanese.\"? Yes, no, or maybe? Maybe\n###\nPaysonia stonensis (syn. \"Lesquerella stonensis\") is a species of flowering plant in the mustard family, known by the common name Stones River bladderpod. It is endemic to Tennessee in the United States, where it is limited to Rutherford County. It grows only in the floodplains of the Stones River, and certain tributaries. Are we justified in saying that \"People really like flowers\"? Yes, no, or maybe? Maybe\n###\nHook, Line and Sinker is an Australian fishing television program, produced by \"HLS Productions\" in Hobart, Tasmania and is hosted by Nick Duigan and Andrew Hart. The program premiered in 2001 and is broadcast nationally on the Southern Cross Television network. The show is aired on Saturday afternoons and runs for 30 minutes. Are we justified in saying that \"Australian fishing television program Hook Line and Sinker is the most popular fishing show in Australia.\"? Yes, no, or maybe? Maybe\n###\nInvitation to Sociology: A Humanistic Perspective is a 1963 book about sociology by sociologist Peter L. Berger, in which Berger sets out the intellectual parameters and calling of the scientific discipline of sociology. Many of the themes presented in this book were later developed in his 1966 book \"The Social Construction of Reality\", coauthored with Thomas Luckmann. Are we justified in saying that \"Peter L. 
Berger died in 1966.\"? Yes, no, or maybe?", "doc_id": 860, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [30137, 19368, 31959, 1802], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "KnowledgeWare was a software company headquartered in Atlanta, Georgia co-founded by James Martin and run by Fran Tarkenton. It produced a Computer Aided Software Engineering (CASE) tool called IEW (Information Engineering Workbench). KnowledgeWare was sold to Sterling Software in 1994, which was in its turn acquired by Computer Associates. Are we justified in saying that \"Computer Associates earned a lot of money from KnowledgeWare\"? Yes, no, or maybe? Maybe\n###\nLouis Glenn Marson (born June 26, 1986) is an American former professional baseball catcher. He played in Major League Baseball (MLB) for the Philadelphia Phillies and Cleveland Indians. He is currently the hitting coach for the Salt Lake Bees, the Triple A affiliate of Major League Baseball's Los Angeles Angels. Are we justified in saying that \"Louis Glenn Marson is an American\"? Yes, no, or maybe? Yes\n###\nPlatylesches lamba, the Neave's banded hopper, is a butterfly in the Hesperiidae family. It is found in Ivory Coast, Ghana, Cameroon, the Democratic Republic of the Congo (Shaba), western Uganda, Malawi and northern Zambia. The habitat consists of woodland and open places in the forest zone. Are we justified in saying that \"The butterfly is found in over 2 countries\"? Yes, no, or maybe? Yes\n###\nPaul Annacone and Christo van Rensburg were the defending champions. Annacone participated with John Fitzgerald, and lost in the quarterfinals to Scott Davis and David Pate, while Van Rensburg played with Kevin Curren, and lost in the semifinals to Grant Connell and Glenn Michibata.
Rick Leach and Jim Pugh defeated Connell and Michibata 3\u20136, 6\u20134, 6\u20132, in the final. Are we justified in saying that \"rick leach won in the final\"? Yes, no, or maybe? Yes\n###\nSpaceballs is a 1987 American comic science fiction film co-written, produced and directed by Mel Brooks. Starring Brooks, Bill Pullman, John Candy, and Rick Moranis, the film also features Daphne Zuniga, Dick Van Patten, and the voice of Joan Rivers. In addition to Brooks in a supporting role, the film also features Brooks regulars Dom DeLuise and Rudy De Luca in cameo appearances. Are we justified in saying that \"Dom DeLuise and Rick Moranis were supporting actors in film, whereas Rudy De Luca was one of the starring actors.\"? Yes, no, or maybe?", "doc_id": 439, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [15968, 20523, 23029, 20042], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Ireland ( ; Irish: \"\u00c9ire\" ] ; Ulster-Scots: \"Airlann\" ] ) is an island in the North Atlantic. It is separated from Great Britain to its east by the North Channel, the Irish Sea, and St George's Channel. Ireland is the second-largest island of the British Isles, the third-largest in Europe, and the twentieth-largest on Earth. Are we justified in saying that \"Ireland is the second largest island in the British Isles\"? Yes, no, or maybe? Yes\n###\nBernardino Zacchetti (active c. 1523) was an Italian painter of the Renaissance period. He was born in Reggio Emilia. His style recalls Raphael, and is also said to have worked with Michelangelo in the Sistine chapel. His picture of \"St. Paul\" in the church of San Prospero at Reggio recalls Il Garofalo. One of his pupils, Giovanni Soncini was the godfather of Corregio\u2019s second daughter. Are we justified in saying that \"Bernardino Zacchetti was an artist\"? Yes, no, or maybe? Yes\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues. Are we justified in saying that \"The news website is very popular\"? Yes, no, or maybe? Maybe\n###\nDan Deacon (born August 28, 1981) is an American composer and electronic musician based in Baltimore, Maryland. Since 2003, Deacon has released eight albums under several different labels. Deacon is renowned for his live shows, where large-scale audience participation and interaction is often a major element of the performance. Godson of Paul J Fowler. Are we justified in saying that \"Dan Deacon is a slow man\"? Yes, no, or maybe? Maybe\n###\nIt's OK! is a musical group formed by Redd Kross members Robert Hecker (guitar, vocals) and Victor Indrizzo (drums), along with bassist Abby Travis and the late Greg White on vocals. This initial line up of the band released the self-titled debut album \"It's OK!\". 
Are we justified in saying that \"It's OK had several members, they included Hecker (drums) Indrizzo (vocals) Travis (guitar, vocals) and White bassist.\"? Yes, no, or maybe?", "doc_id": 100, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2337, 12509, 19289, 26398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Cleethorpes Town Football Club is a football club based in Grimsby in North East Lincolnshire, England. The club are currently members of the Northern Premier League Division One South and play at Grimsby Borough's Bradley Football Development Centre. Are we justified in saying that \"We are in the 21st century. \"? Yes, no, or maybe? Yes\n###\nThe Charter Township of Lansing is a charter township of Ingham County in the U.S. state of Michigan. As of the 2010 census, it had a population of 8,126. The township consists of five non-contiguous tracts of land: one on the west side of Lansing, three on the east side between Lansing and East Lansing, and one on the southeast side of Lansing. Are we justified in saying that \"The population of the Charter Township of Lansing has decreased since the last census\"? Yes, no, or maybe? Maybe\n###\nHumphrey Mieno Ochieng (born 25 December 1989 in Nairobi) is a Kenyan footballer who currently plays for Kenyan Premier League side Tusker and the Kenya national team as a midfielder. He previously played for A.F.C. Leopards Sofapaka and Kenya Commercial Bank in the Kenyan Premier League, as well as Tunisian side Club Africain and Tanzanian club Azam. Are we justified in saying that \"Humphrey Mieno Ochieng is 40\"? Yes, no, or maybe? No\n###\nBeno\u00eet ( or ; ] ) is a Catholic French male given name, or, less frequently, Benoist. The name is the Old French word for \"blessed\", equivalent to the English name Benedict. The female form of the name is Beno\u00eete, or B\u00e9n\u00e9dicte as well as family name. Are we justified in saying that \"Benoit is a name given to boys.\"? Yes, no, or maybe? Yes\n###\nThe San Nicolao Tunnel is a motorway tunnel in the Swiss canton of Ticino. The tunnel is situated under the Monte Ceneri Pass that separates the north of the canton around Bellinzona from the south of the canton around Lugano. It forms part of the A2 motorway that links the north of Switzerland with Italy. It was completed in 1984, and is 1412 m in length. Are we justified in saying that \"The San Nicolao Tunnel is a tunnel for the big ships. \"? Yes, no, or maybe?", "doc_id": 577, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7694, 33732, 20855, 13796], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Lorca F\u00fatbol Club, S.A.D. is a Spanish football team based in Lorca, in the autonomous community of the Region of Murcia. Founded in 2003, it currently plays in Segunda Divisi\u00f3n, holding home games at Estadio Francisco Art\u00e9s Carrasco, which has a capacity of 8,120. Are we justified in saying that \"Lorca Futbol Club is a Spanish recipe based in Lorca, which has a serving capacity of 8,120.\"? Yes, no, or maybe? No\n###\nMargaret Lucille Jeanne Parker (born 24 July 1943) is a Member of the European Parliament (MEP) for the East Midlands region for the UK Independence Party. She was elected in 2014. She was born in Grantham and educated at Kesteven and Grantham Girls' School and De Montfort University where she read Law. Are we justified in saying that \"Margaret Lucille Jeanne Parker is less than 80 years old \"? Yes, no, or maybe? Yes\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Tillia tepe is still excavated to this day.\"? Yes, no, or maybe? Maybe\n###\nVan Cleef & Arpels is a French jewelry, watch, and perfume company. It was founded in 1896 by Alfred Van Cleef and his uncle Salomon Arpels in Paris. Their pieces often feature flowers, animals, and fairies, and have been worn by style icons such as Farah Pahlavi, the Duchess of Windsor, Grace Kelly, and Elizabeth Taylor. Are we justified in saying that \"Van Cleef & Arpels will be the number one French jeweler next year.\"? Yes, no, or maybe? Maybe\n###\nLex Talionis Fraternitas, Inc. Sodalitas Ducum Futurorum is an exclusive fraternal organization of Filipino jurists, legal practitioners and law students founded on September 29, 1969 at the San Beda College of Law. A chapter in the Ateneo de Davao University School of Law was established in 1974. In 1983, the Securities and Exchange Commission granted the incorporation of the fraternity. Are we justified in saying that \"Lex is an organization traditionally made of men\"? Yes, no, or maybe?", "doc_id": 614, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [5003, 22200, 44205, 21010], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Happy Mother's Day, Love George (also known Run Stranger, Run) is a 1973 American mystery film produced and directed by Darren McGavin. 
The film stars Patricia Neal, Cloris Leachman, Bobby Darin, Tessa Dahl, Ron Howard, Kathie Browne, Joe Mascolo, Simon Oakland, and Thayer David. Are we justified in saying that \"The filmed bombed at the box office.\"? Yes, no, or maybe? Maybe\n###\nCanning Downs was the first residential establishment built by a white person on the Darling Downs in Queensland, Australia. It is located a short drive from the town of Warwick and originally extended south east to Killarney and the McPherson Range. The area was first named after the British statesman George Canning by Allan Cunningham. Are we justified in saying that \"Canning Downs was the first residential establishment built by a white person on the Darling Downs in Queensland, Australia. White people are very cool.\"? Yes, no, or maybe? Maybe\n###\nWireshark is a free and open source packet analyzer. It is used for network troubleshooting, analysis, software and communications protocol development, and education. Originally named Ethereal, the project was renamed Wireshark in May 2006 due to trademark issues. Are we justified in saying that \"Trademark issues affected the release of Wireshark.\"? Yes, no, or maybe? Maybe\n###\nSt Clement's is a district in Oxford, England, on the east bank of the River Cherwell. Its main road, St Clement's Street (often shortened to just \"St Clement's\"), links The Plain (a roundabout) near Magdalen Bridge with London Place at the foot of Headington Hill at the junction with Marston Road to the north. Are we justified in saying that \"st clements is not in england\"? Yes, no, or maybe? No\n###\nLA1:TV (strictly the Lancaster University Student Television Station, often simply LA1) is a non-profit student television station at Lancaster University. It is a constitutional part of the Lancaster University Students' Union (LUSU) but is run as an independent student society. Some of LA1\u2019s current programmes include \"Good Morning Lancaster\" (GML), \"Sugar TV\", and \"Sound Booth\". Are we justified in saying that \"LA1 broadcasts in the morning\"? Yes, no, or maybe?", "doc_id": 977, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [35703, 29333, 28450, 23597], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Come With Me To Hell, Pt. 1 is the second studio album collaboration between Memphis-based rappers DJ Paul and Lord Infamous. It was released in 1994 and distributed through the independent record label, Prophet Entertainment. A \"Remastered Edition\" of Come With Me To Hell, Pt. 1 was released on March 6, 2014, to critical acclaim, following the success of the re-released material. Are we justified in saying that \"Come With Me To Hell, Pt. 1 was written in 1995\"? Yes, no, or maybe? No\n###\nThe Kid from Left Field is a 1953 baseball film starring Dan Dailey, Anne Bancroft, Lloyd Bridges, and Billy Chapin. The film marked the reunion of Dailey and director Harmon Jones who had teamed up at 20th Century Fox a year earlier in another baseball film, the biographical \"The Pride of St. Louis\". 
Are we justified in saying that \"The Kid from Left Field is a baseball film\"? Yes, no, or maybe? No\n###\nDemi Lovato: Live in Concert (also known as the Summer Tour 2009) was the debut headlining concert tour by American singer Demi Lovato, launched in support of her debut album \"Don't Forget\" (2008) and the second studio album \"Here We Go Again\" (2009). Are we justified in saying that \"Demi Lovato has only released one album.\"? Yes, no, or maybe? No\n###\nSusan Peters (born Suzanne Carnahan; July 3, 1921 \u2013 October 23, 1952) was an American film, stage, and television actress. After studying acting with Austrian theatre director Max Reinhardt, she appeared in several uncredited bit parts before earning a minor supporting part in \"Santa Fe Trail\" (1940). Her supporting role in \"Tish\" led to Peters signing a contract with Metro-Goldwyn-Mayer in 1942. Are we justified in saying that \"Susan Peters detested minor supporting parts.\"? Yes, no, or maybe? Maybe\n###\nTunnel Vision is a 2001 novel by author Keith Lowe, his debut novel, published by MTV Books on 1 October 2001. Set in London, the plot revolves around Andy, a man soon to be married, who makes a drunken bet with his friend Rolf two nights before his wedding that he can visit every single station of the London Underground in a single day. Are we justified in saying that \"Tunnel Vision was written in 2001\"? Yes, no, or maybe?", "doc_id": 365, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28554, 5055, 30510, 21148], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Smithereens is a 1998 album from UK singer-songwriter Nick Harper. It was produced by Glenn Tilbrook of the band Squeeze and released on Tilbrook's own Quixotic label; he also sang co-lead vocals (with Harper) on the acoustic version of \"Smithereens\" which ends the album. Are we justified in saying that \"Nick Harper owns the label Quixotic. \"? Yes, no, or maybe? No\n###\nHi! Pristin (stylized as HI! PRISTIN) is the debut mini-album by South Korean girl group Pristin. It was released on March 21, 2017, by Pledis Entertainment, and distributed by LOEN Entertainment. The EP consists of six songs, including the singles \"Wee Woo\" and \"Black Widow\". In order to promote the album, the group performed on several Korean music shows. Are we justified in saying that \"Hi! Pristin are considered a boy band\"? Yes, no, or maybe? No\n###\nJohan Martin Schr\u00f6der (born 13 May 1931 in Amsterdam) is a Dutch pilot and founder of Martinair, the second Dutch airline (after KLM) and the first Dutch air charter company. At the founding in 1958 the company was known as Martin's Air Charter (MAC). Are we justified in saying that \"He was a dutch pilot\"? Yes, no, or maybe? Yes\n###\nStephen R. \"Steve\" Bissette (born March 14, 1955) is an American comics artist, editor, and publisher with a focus on the horror genre. He is known for working with writer Alan Moore and inker John Totleben on the DC comic book \"Swamp Thing\" in the 1980s. 
Are we justified in saying that \"Swamp Thing takes place entirely in a swamp.\"? Yes, no, or maybe? Maybe\n###\nThe Battle of Vauchamps (14 February 1814) was the final major engagement of the Six Days Campaign of the War of the Sixth Coalition. It resulted in a part of the Grande Arm\u00e9e under Napoleon I defeating a superior Prussian and Russian force of the Army of Silesia under Field-marshal Gebhard Leberecht von Bl\u00fccher. Are we justified in saying that \"The Battle of Vauchamps starts with C.\"? Yes, no, or maybe?", "doc_id": 175, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20056, 14839, 1981, 32026], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Stanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele. Are we justified in saying that \"Stanley Fredrick Steele scored 97 goals from 1955 until 1968.\"? Yes, no, or maybe? Yes\n###\nUnited Spirits Limited, abbreviated to USL, is an Indian alcoholic beverages company, and the world's second-largest spirits company by volume. It is a subsidiary of Diageo, and headquartered at UB Tower in Bangalore, Karnataka. USL exports its products to over 37 countries. Are we justified in saying that \"USL doesn't make much alcohol.\"? Yes, no, or maybe? No\n###\nFasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings. Are we justified in saying that \"Fasole batuta is a dipping paste.\"? Yes, no, or maybe? Yes\n###\n\"Come Back in One Piece\" is the third official single from the \"Romeo Must Die\" soundtrack. The song was performed by Aaliyah and features a guest appearance by DMX. The song was not a Top 40 hit single in the U.S., though it was a minor urban contemporary hit. In Europe, \"Come Back in One Piece\" and \"I Don't Wanna\" were released together as double A-side single. Are we justified in saying that \"Come Back in One Piece was a hit for 2pac.\"? Yes, no, or maybe? No\n###\n\"Whatever the Case May Be\" is the twelfth episode of the first season of \"Lost\". It was directed by Jack Bender and written by Damon Lindelof and Jennifer Johnson. It first aired on January 5, 2005, on ABC. The character of Kate Austen (Evangeline Lilly) is featured in the episode's flashbacks. Are we justified in saying that \"The 10th episode of the first season of \"Lost\" aired on January 1, 2005.\"? 
Yes, no, or maybe?", "doc_id": 741, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [10670, 4797, 20954, 44174], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frankenstein Castle (German: \"Burg Frankenstein\" ) is a hilltop castle in the Odenwald overlooking the city of Darmstadt in Germany. It is thought that this castle may have been an inspiration for Mary Shelley when she wrote her 1818 Gothic novel \"Frankenstein\". Are we justified in saying that \"Frankenstein Castle is also known as Burg Frankenstein\"? Yes, no, or maybe? Yes\n###\nThe Inter-American Peace Force (IAPF) was established, by the Organization of American States, on 23 May 1965, after the United States's intervention in the Dominican Republic. It largely consisted of over 42,600 United States military personnel, plus the following troops were sent by each country; Are we justified in saying that \"The IAPF was founded in 1990\"? Yes, no, or maybe? No\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison. Are we justified in saying that \"Alix Bancourt is a blogger who writes in french for her online blog that is translated later to english.\"? Yes, no, or maybe? Yes\n###\nThe Gold Diggers is a play written by Avery Hopwood. It popularized the use of the term \"gold digger\" to refer to women who seek wealthy partners, as opposed to the earlier usage referring to gold miners. Producer David Belasco staged it on Broadway in 1919, with Ina Claire in the lead role. It was a hit, running for two consecutive seasons before going on tour. Are we justified in saying that \"Avery Hopwood was born in eighteen hundred eighty six.\"? Yes, no, or maybe? Maybe\n###\nD.A.R.Y.L. is a 1985 American science fiction film written by David Ambrose, Allan Scott and Jeffrey Ellis. It was directed by Simon Wincer and stars Barret Oliver, Mary Beth Hurt, Michael McKean, Danny Corkill, and Josef Sommer. The original music score was composed by Marvin Hamlisch. Are we justified in saying that \"Mary Beth starred in an American science fantasy film.\"? Yes, no, or maybe?", "doc_id": 166, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [9727, 35748, 3123, 16169], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Melbourne Heart FC Futsal was a futsal club based in Melbourne, Victoria, founded in 2012. They played in the F-League, the top tier of Australian Futsal. 
The club was disbanded before the start of the 2014 season after the A-League team were bought by Manchester City FC. Are we justified in saying that \"Melbourne Heart FC Futsal was around for more than three years. \"? Yes, no, or maybe? No\n###\nThe 89th Medium Tank Battalion was an armored tank unit of the United States Army. It was activated in Korea in August, 1950 and in November 1951, it was assigned to the 25th Infantry Division. The unit participated no fewer than ten campaigns, from 1951 through the Armistice in 1953 with the 26th Division. It earned the Presidential Unit Citation and the Navy Unit Commendation. Are we justified in saying that \"The 89th Medium Tank Battalion was the very bravest armored tank unit of the United States Army\"? Yes, no, or maybe? Maybe\n###\nHenry Gabriel Ginaca (May 19, 1876 - 1918) was an American engineer who invented, at the direction of Hawaiian pineapple magnate James Dole in 1911, a machine that could peel and core pineapples in an automated fashion. Called the Ginaca machine, the invention exponentially increased pineapple production and revolutionized the fruit canning industry. He died in the Spanish flu epidemic. Are we justified in saying that \"The Ginaca machine was used to process pineapples.\"? Yes, no, or maybe? Yes\n###\nArthur C. Clarke's World of Strange Powers is a popular thirteen-part British television series looking at strange worlds of the paranormal. It was produced by Yorkshire Television for the ITV network and first broadcast in 1985. It was the sequel to the 1980 series \"Arthur C. Clarke's Mysterious World\". Are we justified in saying that \"Arthur C. Clarke has two separated yet related Tv series mentioned in this excerpt. \"? Yes, no, or maybe? Yes\n###\nBugger or \"buggar\" is a slang word. In the United Kingdom, the term is a general-purpose expletive, used to imply dissatisfaction, or to refer to someone or something whose behaviour is in some way displeasing or perhaps surprising. In the US, particularly in the Midwest and South, it is a slang but not offensive noun meaning \"small critter.\" Are we justified in saying that \"Bugger is a term used in every country.\"? Yes, no, or maybe?", "doc_id": 36, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6272, 39048, 20251, 36525], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "You Can Be Anyone This Time Around is an album by Timothy Leary released in 1970. The disc features three \"raps\" by Leary backed with psychedelic music. The purpose of the album was to raise funds for Leary's political candidacy for Governor of California. Are we justified in saying that \"You Can Be Anyone This Time Around was released more than 15 years ago.\"? Yes, no, or maybe? Yes\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. 
\"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. Are we justified in saying that \"the Edgar Award is prominent award to win.\"? Yes, no, or maybe? Maybe\n###\nThe Original Rude Girl is the second studio album by Puerto Rican reggaeton recording artist Ivy Queen released on December 15, 1998 by Sony Discos. It is the follow up studio album to Queen's debut effort \"En Mi Imperio\" released in 1997. The album includes Queen's debut single \"In The Zone\" featuring Wyclef Jean, which helped to increase the album and Queen's exposure to American audiences. Are we justified in saying that \" Rude boy is the third studio album by Bey-once was recorded in June 08 2014\"? Yes, no, or maybe? No\n###\nShades 1968\u20131998 is a 4 CD-Box-Set by the British hard rock band Deep Purple. It was released on 16 March 1999. It spans their career from 1968 to 1998. This box set contains rare edits and singles which are remastered along with album versions of their biggest hits. Are we justified in saying that \"It was released on 16 July1999\"? Yes, no, or maybe? No\n###\nDavid Armand (born 1980) is an American writer of fiction, non-fiction, and poetry. He has published three novels, \"The Pugilist's Wife\", \"Harlow\", and \"The Gorge\". He has also published a collection of poems, \"The Deep Woods\", and a memoir titled \"My Mother's House\". He is currently Writer-in-Residence at Southeastern Louisiana University. Are we justified in saying that \"David Armand is still alive.\"? Yes, no, or maybe?", "doc_id": 316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24731, 8654, 26355, 33122], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Two Men And A Truck is a franchised moving company, headquartered in Lansing, Michigan, with franchises in 41 U.S. states, as well as the United Kingdom, Canada, and Ireland. The company is the largest franchised moving company in the United States with more than 410 locations. Are we justified in saying that \"Two Men And A Truck has many locations.\"? Yes, no, or maybe? Yes\n###\nJon L. Luther is an American foodservice industry executive. He was the chairman and chief executive officer of Dunkin' Brands. Luther is the Chairman of the Board of the Culinary Institute of America and Arby's Restaurant Group, and a director at Six Flags Entertainment Corporation, Wingstop Restaurants, and Tempur Sealy International. Are we justified in saying that \"Luther earns seven figures in his executive position.\"? Yes, no, or maybe? Maybe\n###\nAmy Timberlake is the author of three children\u2019s books: \"One Came Home\", \"That Girl Lucy Moon\", and \"The Dirty Cowboy\". \"One Came Home\" was awarded the Newbery Honor and the Edgar Award. \"That Girl Lucy Moon\" was awarded by the Friends of American Writer\u2019s Literacy, and \"The Dirty Cowboy\" has received a Parent\u2019s Choice Gold Medal and won the 2004 Golden Kite Award. 
Are we justified in saying that \"Amy Timberlake won a Parent\u2019s Choice Gold Medal and won the 2024 Golden Kite Award.\"? Yes, no, or maybe? No\n###\nTaina is an American sitcom that aired on Nickelodeon and distributed by Nelvana Limited. It was one of the last live-action comedy shows taped at Nickelodeon Studios but later moved to the Nickelodeon on Sunset in Hollywood, for its second season. The show aired from January 14, 2001 to May 11, 2002. Are we justified in saying that \"It aired for a little over a year\"? Yes, no, or maybe? Yes\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The perpetrator sought to attack the police from the beginning\"? Yes, no, or maybe?", "doc_id": 535, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [36854, 733, 21861, 25181], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Shannon Kelley is a former American football quarterback. He was the starting quarterback of the Texas Longhorns in the beginning of 1988. After graduating, he married Olympian and popular American athlete Mary Lou Retton and after pursuing a business career, went into college coaching. He's currently the assistant head football coach at Houston Baptist University. Are we justified in saying that \"Shannon Kelley wants to coach Texas State.\"? Yes, no, or maybe? Maybe\n###\nRudyard Kipling's The Jungle Book is a 1994 live-action American adventure film co-written and directed by Stephen Sommers, produced by Edward S. Feldman and Raju Patel, from a story by Ronald Yanover and Mark Geldman. It is the second film adaptation by The Walt Disney Company of the Mowgli stories from \"The Jungle Book\" and \"The Second Jungle Book\" by Rudyard Kipling. Are we justified in saying that \"Rudyard Kipling's The Jungle Book was not animated\"? Yes, no, or maybe? Yes\n###\nView from the Top is a 2003 American romantic comedy film directed by Bruno Barreto, and stars Gwyneth Paltrow, Christina Applegate, Candice Bergen, Joshua Malina, Mark Ruffalo, Rob Lowe, Mike Myers, and Kelly Preston. The film follows a young woman (Paltrow) from a small town who sets out to fulfill her dream of becoming a flight attendant. Are we justified in saying that \"View from the Top was seen by Bush.\"? Yes, no, or maybe? Maybe\n###\nPasquines is a policy and politics non-profit news organization that covers news related to politics, government, design and economy in Puerto Rico. The website has its base of operations in Mayaguez, PR. It was founded by William-Jose Velez Gonzalez who serves as Editor in chief. Are we justified in saying that \"Pasquines covers news about the government and politics of PR\"? Yes, no, or maybe? 
Yes\n###\nNewtrament is a musician, MC and DJ known for releasing an early UK electro/hip hop record - \"London Bridge is Falling Down\" - on Jive Records. It was based on the nursery rhyme (previously adapted by the reggae group Culture) with a political message that electoral politics were a sham. Are we justified in saying that \"Newtrament owns Jive Records.\"? Yes, no, or maybe?", "doc_id": 997, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28735, 29773, 26853, 583], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sea Wall (French: Un barrage contre le Pacifique ) is a 2008 film by Cambodian director Rithy Panh in a French/Cambodian/Belgian co-production. The film opened on 7 January 2009 in France. It was adapted from the 1950 novel \"The Sea Wall\" by Marguerite Duras. The novel had previously been adapted as \"This Angry Age\" by Ren\u00e9 Cl\u00e9ment in 1958. Are we justified in saying that \"Marguerite Duras directed the film. \"? Yes, no, or maybe? No\n###\nThe 1960 Gator Bowl was a college football bowl game between the Southwest Conference (SWC) co-champion Arkansas Razorbacks and the Georgia Tech Yellow Jackets representing the Southeastern Conference (SEC). Arkansas defeated Georgia Tech, 14\u20137, in front of 45,104 spectators. There were two players named Most Valuable Player: Jim Mooty of Arkansas and Maxie Baughan of Georgia Tech. Are we justified in saying that \"The 1960 Gator Bowl had a winner.\"? Yes, no, or maybe? Yes\n###\n\"Don't Look Back\" is a song by British pop-rock band Fine Young Cannibals. It was released as the third single from the band's 1988 album \"The Raw & the Cooked\". The song reached the top 40 charts in the United Kingdom, United States, Canada, Australia, and New Zealand. Are we justified in saying that \"The album \"Don't Look Back\" by Fine Young Cannibals had 3 top 40 hits. \"? Yes, no, or maybe? Maybe\n###\nGay Sex in the 70s is a 2005 American documentary film about gay sexual culture in New York City in the 1970s. The film was directed by Joseph Lovett and encompasses the twelve years of sexual freedom bookended by the Stonewall riots of 1969 and the recognition of AIDS in 1981, and features interviews with Larry Kramer, Tom Bianchi, Barton Lidice Bene\u0161, Rodger McFarlane, and many others. Are we justified in saying that \"Gay Sex in the 70s is one of the best documentaries ever.\"? Yes, no, or maybe? Maybe\n###\nLive at Austin City Limits Festival by Northern Irish singer-songwriter Van Morrison is a limited edition live album recorded from the Austin City Limits Festival concert at which he was the first night headliner on September 15, 2006. It has only been made available at live Van Morrison concerts and at the Van Morrison Official website. Are we justified in saying that \"Van Morrison is in another band.\"? 
Yes, no, or maybe?", "doc_id": 832, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [228, 7984, 26631, 1049], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Fasole b\u0103tut\u0103 or Fasole f\u0103c\u0103luit\u0103 (literally \"mashed beans\") or icre de fasole (roe of beans) is a beans-based paste. This dip is traditionally made by mashing boiled beans and mixed with vegetable oil adding some chopped onions. Some recipes call for garlic, chili pepper, lime/lemon juice and/or additional seasonings. Are we justified in saying that \"Lime juice is sometimes added to fasole b\u0103tut\u0103 recipes because of its acidity.\"? Yes, no, or maybe? Maybe\n###\nWilliam Hargis Bowman, Jr. (April 21, 1941 \u2013 February 22, 2011), better known by his stage name, Beau Dollar, was a soul vocalist and drummer for King Records. He performed on many studio albums for various artists under contract with King, including James Brown. His most prominent work was performed as \"Beau Dollar & The Dapps\" and \"Beau Dollar & The Coins\". Are we justified in saying that \"Bowman played the flute when he was young\"? Yes, no, or maybe? Maybe\n###\nFarmington Falls is an unincorporated village in the town of Farmington, Franklin County, Maine, United States. The community is located along the Sandy River 5 mi southeast of the village of Farmington; U.S. Route 2, Maine State Route 27, Maine State Route 41, and Maine State Route 156 all pass through the village. Farmington Falls has a post office with ZIP code 04940. Are we justified in saying that \"Farmington Falls is just south of the Canadian Border. \"? Yes, no, or maybe? Maybe\n###\nHomicide: The Movie is a television movie that aired February 13, 2000, one year after the completion of the American police drama television series \"\". It was written by the series' head writer Tom Fontana and staff writers Eric Overmyer and James Yoshimura, and directed by Jean de Segonzac, who had served as a cinematographer and director several times during the show's run. Are we justified in saying that \"homicide was aired five years ago\"? Yes, no, or maybe? No\n###\nPersuasion was the planned fifth studio solo album by Adam Ant, planned for 1992-3 but never released. The album has however surfaced as bootlegs, and nowadays circulates on file sharing networks. This album is one of the 20 written about in \"The Greatest Music Never Sold\" by Dan Leroy, a book that revealed information on the lost recordings of many famous musicians. Are we justified in saying that \"Persuasion was Adam Ants most popular album\"? Yes, no, or maybe?", "doc_id": 574, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13577, 13904, 22893, 16458], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Henry Newman, {'1': \", '2': \", '3': 'Cong. Orat.', '4': \"} , (21 February 1801 \u2013 11 August 1890) was an Anglican priest, poet and theologian and later a Catholic cardinal, who was an important and controversial figure in the religious history of England in the 19th century. He was known nationally by the mid-1830s. Are we justified in saying that \"John Henry Newman was born less than 100 years ago.\"? Yes, no, or maybe? No\n###\nJo\u00e3o was born in Boulogne-Billancourt, France to Pedro de Alc\u00e2ntara, Prince of Gr\u00e3o-Par\u00e1 and Countess Elisabeth Dobr\u017eensky de Dobr\u017eenicz. His father had been a member of the Brazilian Imperial Family but had joined his mother Isabel, Princess Imperial of Brazil in exile after the abolition of the monarchy. When Jo\u00e3o was 19, he emigrated to Brazil. Are we justified in saying that \"Jo\u00e3o starts with a J.\"? Yes, no, or maybe? Yes\n###\nGreatest Hits Volume 1 is a greatest hits compilation album by The Beatles which was exclusive to Australia and New Zealand. The album was compiled by EMI Australia to fill in the gap between \"Rubber Soul\" and \"Revolver\" (much like \"A Collection of Beatles Oldies\" would in 1966 in between \"Revolver\" and \"Sgt. Pepper's Lonely Hearts Club Band\"). Are we justified in saying that \"One of the counties that got the album starts with a T\"? Yes, no, or maybe? Yes\n###\nTillya tepe, Tillia tepe or Till\u0101 tapa (Persian: \u0637\u0644\u0627 \u062a\u067e\u0647\u200e \u200e ) or (literally \"Golden Hill\" or \"Golden Mound\") is an archaeological site in the northern Afghanistan province of Jowzjan near Sheberghan, excavated in 1978 by a Soviet-Afghan team led by the Greek-Russian archaeologist Viktor Sarianidi, a year before the Soviet invasion of Afghanistan. The hoard is often known as the Bactrian gold. Are we justified in saying that \"Viktor Sarianidi was also a physicist and did remarkable work in the field \"? Yes, no, or maybe? Maybe\n###\nThere is a little Shia community in El Salvador. There is an Islamic Library operated by the Shia community, named \"Fatimah Az-Zahra\". They published the first Islamic magazine in Central America: \"Revista Biblioteca Isl\u00e1mica\". Additionally, they are credited with providing the first and only Islamic library dedicated to spreading Islamic culture in the country. Are we justified in saying that \"The community is south of the United States.\"? Yes, no, or maybe?", "doc_id": 0, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24730, 11223, 30185, 41012], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In the American Mafia, a made man is a fully initiated member of the Mafia. To become \"made\", an associate first has to be sponsored by another made man. 
An inductee will be required to take the oath of Omert\u00e0, the mafia code of silence. After the induction ceremony the associate becomes a \"made man\", and holds the rank of soldier (Italian: soldato) in the Mafia hierarchy. Are we justified in saying that \"An inductee is a soldier before the induction ceremony\"? Yes, no, or maybe? No\n###\nThe 2014 Rialto Channel New Zealand Film Awards was the third presentation of the New Zealand Film Awards, a New Zealand film industry award. The 2014 ceremony took place in Shed 10 on Queen's Wharf in Auckland on Friday 12 December 2014. It was webcast live at the nzherald.co.nz, and later broadcast on the Rialto Channel. Are we justified in saying that \"The New Zealand Film Awards was livestreamed.\"? Yes, no, or maybe? Yes\n###\nThe Bigger Picture is a 2014 British animated short film directed by Daisy Jacobs. It has been nominated for the Academy Award for Best Animated Short Film at the 87th Academy Awards. It won the BAFTA Award for Best Short Animation at the 68th British Academy Film Awards. Are we justified in saying that \"The Bigger Picture has the voice of dan.\"? Yes, no, or maybe? Maybe\n###\nDuel is a 1971 television (and later full-length theatrical) thriller film written by Richard Matheson, which is based on his own short story. The film is the full-length film directing debut of American director, producer, and screenwriter Steven Spielberg. Are we justified in saying that \"Duel is the directorial debut of producer, screenwriter, and director Richard Matheson. \"? Yes, no, or maybe? No\n###\nManuel de Falla y Matheu (] , 23 November 187614 November 1946) was a Spanish composer. Along with Isaac Alb\u00e9niz and Enrique Granados, he was one of Spain's most important musicians of the first half of the 20th century. His image was on Spain's 1970 100-pesetas banknote. Are we justified in saying that \"Manuel de Falla y Matheu has an A.\"? Yes, no, or maybe?", "doc_id": 677, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [13041, 37945, 21469, 33911], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Enrique Leff (born Mexico, 1946) is a Mexican economist, who defines himself today as an environmental sociologist and environmentalist. He has written 25 books and 180 articles on political ecology, environmental sociology, environmental economics, environmental epistemology and environmental education. He is regarded as one of the key environmental thinkers in Latin America. Are we justified in saying that \"Enrique Leff is currently alive\"? Yes, no, or maybe? Yes\n###\nGrenzschutzgruppe 9 (GSG 9) (English: Border Protection Group 9 ) is the elite Police Tactical Unit of the German Federal Police (German: \"Bundespolizei\" ). GSG 9 counterparts on the state level are the Special Deployment Commandos (German: \"Spezialeinsatzkommandos (SEK)\" ). Are we justified in saying that \"They work only on the borders.\"? Yes, no, or maybe? 
Maybe\n###\nChristopher Tafoya (born June 2, 1976), better known by his stage name Sleep or Sleep of Oldominion, is an American hip hop artist from Farmington, New Mexico, who is currently based in Portland, Oregon. He is a founding member of the Pacific Northwest hip hop collective Oldominion and hip hop duo The Chicharones alongside Josh Martinez. He is currently signed to Strange Famous Records. Are we justified in saying that \"tafoya has never eaten grapes\"? Yes, no, or maybe? Maybe\n###\nThe Coca-Cola Bottling Company of Cape Cod is a former bottler of Coca-Cola, Dr Pepper and Canada Dry soft drinks located in Sandwich, Massachusetts, United States. The company was bought out in 2000 by the Coca-Cola Bottling Company of Northern New England. Are we justified in saying that \"The Coca-Cola Bottling Company of Cape Cod has never been sold.\"? Yes, no, or maybe? No\n###\nRuth Pryor (1906-2001) was a Chicago ballet dancer and instructor, and the first American ballerina to dance the role of the Swan Queen in Swan Lake, in 1930. She was known for \"her feat of whirling thirty-six times a minute on her toes,\" according to the Purple Parrot of Northwestern University. Are we justified in saying that \"Ruth Pryor lived to be around 95 years old.\"? Yes, no, or maybe?", "doc_id": 907, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16883, 5903, 36310, 25175], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Aodh Mac Cathmhaoil, O.F.M., (Latin: Hugo Cavellus; anglicised: Hugh MacCaghwell) (1571 \u2013 22 September 1626), was an Irish Franciscan theologian and Archbishop of Armagh. He was known by Irish speakers at Louvain by the honorary name \"Aodh Mac Aingil\" (\"Mac Aingil\" is Irish for \"Son of an Angel\"), and it was under this title that he published the Irish work \"Sc\u00e1th\u00e1n Shacramuinte na hAthridhe\". Are we justified in saying that \"Aodh Mac Cathmhaoil has Irish ancestry \"? Yes, no, or maybe? Yes\n###\nThings Happen at Night is a 1947 British supernatural ghost comedy film directed by Francis Searle and starring Gordon Harker, Alfred Drayton, Robertson Hare and Gwynneth Vaughan. The film is based upon a stage play, \"The Poltergeist\", by Frank Harvey Jnr. Are we justified in saying that \"\"Things Happen at Night\" is a ghost supernatural-esque comedy made in 1944. \"? Yes, no, or maybe? No\n###\nThe Veterinary Medical College Application Service (VMCAS) is a centralized application service for students applying to veterinary school. Created by the Association of American Veterinary Medical Colleges (AAVMC) in 1995, VMCAS handles applications for most of the veterinary schools in the United States, as well as several in Canada, the United Kingdom, New Zealand and Australia. Are we justified in saying that \"The Veterinary Medical College Application Service (VMCAS) is the only application service for students applying for veterinary school in 1995\"? Yes, no, or maybe? Maybe\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. 
The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts. Are we justified in saying that \"David Lee Murphy likes breakfast\"? Yes, no, or maybe? Maybe\n###\nKlagenfurt am W\u00f6rthersee (] ; Slovene: \"Celovec ob Vrbskem jezeru\" , Italian: \"Clanforte\" , Friulian: \"Clanfurt\" ) is the capital of the federal state of Carinthia in Austria. With a population of 99,100, it is the sixth-largest city in the country. The city is the bishop's seat of the Roman Catholic Diocese of Gurk-Klagenfurt and home to the Alpen-Adria-Universit\u00e4t Klagenfurt. Are we justified in saying that \"There is not a city called Klagenfurt in Australia.\"? Yes, no, or maybe?", "doc_id": 72, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34699, 26765, 32765, 4961], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Seven Ways from Sundown is a 1960 American Eastmancolor Western film directed by Harry Keller and starring Audie Murphy and Barry Sullivan. It is based on the novel of the same name by Clair Huffaker, who also wrote the script. Young cast member Teddy Rooney is the son of actors Mickey Rooney and Martha Vickers. Are we justified in saying that \"Teddy Rooney will go down as the greatest actor after many years of analysis, hundreds of years in the distant future.\"? Yes, no, or maybe? Maybe\n###\nAnna Hamilton Phelan is an American actress and scriptwriter. She has been nominated for an Oscar for her work on \"Gorillas in the Mist\", as well as a nomination for a Writers Guild of America Award for her work on \"Mask\" and again for \"Gorillas in the Mist\". Are we justified in saying that \"Anna Hamilton Phelan is an American.\"? Yes, no, or maybe? Yes\n###\nInterTV Grande Minas is a Brazilian television station affiliated with Rede Globo coverage in the Northern part of the Noroeste, Central and the Jequitinhonha and Mucuri of Minas Gerais. Operates on VHF channel 4, in its headquarters city, Montes Claros / MG. Belongs to the Rede InterTV. Are we justified in saying that \"Minas Gerais is at Brazil\"? Yes, no, or maybe? Yes\n###\nThomas Morrison (born 30 June 1983) is an English actor who has performed in theatre, TV and film. He is best known for his appearances in \"On the Shore of the Wide World\" and as Scripps in Cast B and C of Alan Bennett's \"The History Boys\". working alongside Steven Webb and Matt Smith. Are we justified in saying that \"Morrison prefers the British spelling of \"theatre\".\"? Yes, no, or maybe? Maybe\n###\nBlack Snake is a 1973 American film directed by Russ Meyer. It was Meyer's return to self-financed projects, following the end of his brief deal at 20th Century Fox. Meyer's only attempt at the Blaxploitation genre, it was filmed in Panavision and was shot on location in Barbados. Are we justified in saying that \"Black Snake featured black actors\"? 
Yes, no, or maybe?", "doc_id": 572, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29769, 44762, 28720, 26332], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sadat is a 1983 American two-part, four-hour television miniseries based on the life and death of the late 3rd President of Egypt, Anwar Sadat starring Louis Gossett Jr. as Sadat and Madolyn Smith as Sadat's wife, Jehan. It was distributed by Columbia Pictures Television through Operation Prime Time. Gossett's performance earned him a nomination for an Emmy Award and a Golden Globe Award. Are we justified in saying that \"Sadat is a four-hour television miniseries based on the life and death of the late 3rd President of the US.\"? Yes, no, or maybe? No\n###\nThe Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds. Are we justified in saying that \"The Kien Giang Ridgeback is the smallest of the ridgeback breeds.\"? Yes, no, or maybe? No\n###\nEnglandsfarere (English: We Leave for England ) is a 1946 Norwegian war film directed by Toralf Sand\u00f8, starring Knut Wigert and J\u00f8rn Ording. The film follows the Norwegian resistance fighters Harald (Wigert) and Arild (Ording) in their flight from the Gestapo. Are we justified in saying that \"The word England is in the translation of Englandsfarere.\"? Yes, no, or maybe? Yes\n###\nLookout Mountain, elevation 6536 ft , is the second highest peak in Oregon's Mount Hood National Forest and the highest point in Badger Creek Wilderness. It sits about 8 miles east-southeast of Mount Hood, separated from it by the valley of the East Fork Hood River. Are we justified in saying that \"It is in Washington.\"? Yes, no, or maybe? No\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology. Are we justified in saying that \"Pavel Sergeyevich Alexandrov wrote in Russian.\"? Yes, no, or maybe?", "doc_id": 498, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [39565, 42213, 15147, 1230], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Juan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"Aragone made his first appearance at the 2017 US Open and lost that important game.\"? Yes, no, or maybe? Maybe\n###\nJuan Cruz \"JC\" Aragone (born June 28, 1995) is an American tennis player. He attended the University of Virginia and was a member of three NCAA Men's Tennis Championship winning teams. Aragone made his first ATP main draw appearance at the 2017 US Open after securing a spot through the qualifying tournament. Are we justified in saying that \"Aragone played tennisin the university of virginia\"? Yes, no, or maybe? Yes\n###\nUSFC \"Fish Hawk\" was a fisheries research ship operated by the United States Commission of Fish and Fisheries and its successor, the United States Bureau of Fisheries, from 1880 to 1926. She was the first large ship purpose-built by any country for the promotion of fisheries, and spent her 46-year career operating along the United States East Coast, in the Gulf of Mexico, and off Puerto Rico. Are we justified in saying that \"The USFC \"Fish Hawk\" was not in operation in 1808. \"? Yes, no, or maybe? Yes\n###\nRichard Colson Baker (born April 22, 1990), better known by his stage names MGK and Machine Gun Kelly, is an American rapper and actor, from Cleveland, Ohio. MGK embarked on a musical career as a teenager, releasing a mixtape in 2006. He went on to release four more mixtapes. Are we justified in saying that \"He was born in a hospital\"? Yes, no, or maybe? Maybe\n###\nPatricia Donoho Hughes (August 18, 1930\u00a0\u2013 January 20, 2010) was a First Lady of Maryland, married to former Maryland Governor Harry Hughes. She was educated at Sorbonne (1949) and Bryn Mawr College (1951) before getting married on June 30, 1951. She later continued her education at the University of Delaware (1966). Mrs. Hughes was a teacher and educator by profession. Are we justified in saying that \"Particia Donoho Hughes was the First Lady of Maryland in 2010.\"? Yes, no, or maybe?", "doc_id": 468, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40096, 2088, 33160, 36263], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Valentine is a 2001 American slasher film directed by Jamie Blanks, and starring Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. Loosely based on the novel of the same name by Tom Savage, the film follows a group of women in San Francisco who are stalked by a man whom they tormented during their childhood. 
Are we justified in saying that \"The film was released in 2000\"? Yes, no, or maybe? No\n###\nEmmanuel Fr\u00e9chette is a Canadian film production designer. He is a two-time winner of the Canadian Screen Award for Best Art Direction or Production Design, at the 1st Canadian Screen Awards in 2013 for \"War Witch (Rebelle)\" and at the 5th Canadian Screen Awards in 2017 for \"Two Lovers and a Bear\". Are we justified in saying that \"\"Two Lovers and a Bear\" was released in 2017.\"? Yes, no, or maybe? Yes\n###\nThe 2004 IIFA Awards, officially known as the 5th International Indian Film Academy Awards ceremony, presented by the International Indian Film Academy honoured the best films of 2003 and took place between May 20\u201322, 2004. This year, the city of Singapore played host to the Indian Film Industry. The tag line of this year's IIFA Awards was \"Uniquely IIFA, Uniquely Singapore ...\". Are we justified in saying that \"The ceremony took place for 2 days\"? Yes, no, or maybe? Yes\n###\nThe Krylov\u2013Bogolyubov averaging method (Krylov\u2013Bogolyubov method of averaging) is a mathematical method for approximate analysis of oscillating processes in non-linear mechanics. The method is based on the averaging principle when the exact differential equation of the motion is replaced by its averaged version. The method is named after Nikolay Krylov and Nikolay Bogoliubov. Are we justified in saying that \"The Krylov\u2013Bogolyubov averaging method is used mostly in physics\"? Yes, no, or maybe? Maybe\n###\nThe 1998 Idaho Vandals football team represented the University of Idaho in the 1998 NCAA Division I-A football season. The Vandals, led by fourth-year head coach Chris Tormey, were members of the Big West Conference and played their home games at the Kibbie Dome, an indoor facility on campus in Moscow, Idaho. Are we justified in saying that \"The 1998 Idaho Vandals football team was formed after nineteen ninety nine.\"? Yes, no, or maybe?", "doc_id": 880, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22667, 36316, 38195, 29459], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sherbino Mesa Wind Farm is located in Pecos County in west Texas. The first 150 megawatts (MW) of the project, which has a potential capacity of 750 MW, is in operation. Phase I utilizes 50 Vestas V-90 Mk.5 wind turbine generators, each with a rated capacity of 3 MW. BP will operate phase I of the project. Are we justified in saying that \"The Sherbino Mesa Wind Farm requires a lot of electricity to power\"? Yes, no, or maybe? Maybe\n###\nWest Coast hip hop is a hip hop music subgenre that encompasses any artists or music that originate in the West Coast region of the United States. The gangsta rap subgenre of West Coast hip hop began to dominate from a radio play and sales standpoint during the early 1990s with the birth of G-funk and the emergence of Suge Knight and Dr. Dre's Death Row Records. Are we justified in saying that \"Dr. Dre's Death Row Records only worked with West Coast rappers.\"? Yes, no, or maybe? 
Maybe\n###\nA Daughter of the Wolf is a 1919 American silent drama film directed by Irvin Willat and written by Marion Fairfax and Hugh Pendexter. The film stars Lila Lee, Elliott Dexter, Clarence Geldart, Raymond Hatton, Richard Wayne, and Minnie Devereaux. The film was released on June 22, 1919, by Paramount Pictures. Are we justified in saying that \"There were speaking lines in A Daughter of the Wolf.\"? Yes, no, or maybe? No\n###\nElmira is a city in Chemung County, New York, US. It is the principal city of the Elmira, New York Metropolitan Statistical Area, which encompasses Chemung County, New York. The population was 29,200 at the 2010 census. It is the county seat of Chemung County. Are we justified in saying that \"Elmira's population rose after 2010\"? Yes, no, or maybe? Maybe\n###\nThe ABA League Finals MVP award, also known as the Adriatic League Finals MVP award (formerly the Final Four MVP), is an annual award that is given to the most valuable player of the finals of the European regional Adriatic ABA League, which is the top-tier level professional basketball league for countries of the former Yugoslavia. The award has been given since the 2001\u201302 ABA League season. Are we justified in saying that \"The ABA League has not given more than 0 awards\"? Yes, no, or maybe?", "doc_id": 631, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [37962, 31305, 27504, 40133], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns. Are we justified in saying that \"The 8.8 cm Flak was sold at an auction for an undisclosed amount.\"? Yes, no, or maybe? Maybe\n###\nKDMD is an Ion Television-affiliated television station located in Anchorage, Alaska, United States. Owned by Ketchikan Television LLC, the station broadcasts a standard definition digital signal on UHF channel 32 (or virtual channel 33 via PSIP) from a transmitter located in Eagle River. KDMD maintains studios on East 66th Avenue near the Seward Highway in Anchorage. Are we justified in saying that \"KDMD has a stock price that is currently rising.\"? Yes, no, or maybe? Maybe\n###\nBanking in the United States is regulated by both the federal and state governments. The five largest banks in the United States at December 31, 2011 were JPMorgan Chase, Bank of America, Citigroup, Wells Fargo, and Goldman Sachs. In December 2011, the five largest banks' assets were equal to 56 percent of the U.S. economy, compared with 43 percent five years earlier. Are we justified in saying that \"TD bank is one of the five largest banks in the USA\"? Yes, no, or maybe? No\n###\nHooked on a Feeling is an album by Swedish Rock band Blue Swede recorded in 1973 and released in 1974. 
They became known internationally largely due to their 'ooga chaka' cover of Jonathan King's 1971 version of the 1968 B. J. Thomas song \"Hooked on a Feeling\". Are we justified in saying that \"Hooked on a Feeling is a song by Swedish Rock.\"? Yes, no, or maybe? Yes\n###\nThe Vienna State Opera (German: Wiener Staatsoper ) is an Austria opera house and opera company based in Vienna, Austria. It was originally called the Vienna Court Opera (Wiener Hofoper). In 1920, with the replacement of the Habsburg Monarchy by the First Austrian Republic, it was renamed the Vienna State Opera. The members of the Vienna Philharmonic are recruited from its orchestra. Are we justified in saying that \"Wiener Hofoper and Wiener Staatsoper are different opera companies.\"? Yes, no, or maybe?", "doc_id": 685, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44310, 34129, 40163, 27106], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Bailey Gatzert (December 29, 1829 \u2013 April 19, 1893) was an American politician and the eighth mayor of Seattle, Washington, serving from 1875 to 1876. He was the first Jewish mayor of Seattle, narrowly missing being the first Jewish mayor of a major American city (Moses Bloom became mayor of Iowa City, Iowa, in 1873), and has been the only Jewish mayor of Seattle to date. Are we justified in saying that \"Moses Bloom was mayor of Iowa City, Iowa in 1875\"? Yes, no, or maybe? Maybe\n###\nIppadikku Rose (Tamil: \u0b87\u0baa\u0bcd\u0baa\u0b9f\u0bbf\u0b95\u0bcd\u0b95\u0bc1 \u0bb0\u0bcb\u0bb8\u0bcd ; English: Yours truly, Rose ) is a Tamil talk show aired on Vijay TV. The show hosted by Rose. The talk show deals with current affairs touching a wide variety of social issues including traditions, taboos, rebels and culture. This is the first TV show in India hosted by a transgender person. The show is telecast at every Thursday at 11:PM IST. Are we justified in saying that \"Rose doesn't speak Tamil.\"? Yes, no, or maybe? No\n###\nJon Garth Murray (November 16, 1954 \u2013 September 29, 1995) was the second son of late controversial activist Madalyn Murray O'Hair, the first president and founder of American Atheists, Inc., in 1963. He was also the half-brother of the reverend William \"Bill\" Murray. Are we justified in saying that \"reverend William \"Bill\" Murray is an atheist.\"? Yes, no, or maybe? Maybe\n###\nThe X-Files Game is an interactive movie point-and-click adventure video game developed by HyperBole Studios and first published by Fox Interactive. The game was released for Microsoft Windows, Mac OS and PlayStation in 1998, and is based on the television series \"The X-Files\". A second, but unrelated game, \"\", was released for PlayStation 2 in 2004. Are we justified in saying that \"The second game is technically superior to the first game.\"? Yes, no, or maybe? Maybe\n###\nThe 2016\u201317 ProA was the 10th season of the ProA, the second level of basketball in Germany. The champions the runners-up of the play-offs are promoted to the 2017\u201318 Basketball Bundesliga. 
The season started on September 22, 2016 and ended on May 7, 2017. Mitteldeutscher BC won the championship and promoted along with runners-up Oettinger Rockets. Are we justified in saying that \"Mitteldeutscher BC was promoted to the 11th season.\"? Yes, no, or maybe?", "doc_id": 401, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44651, 13643, 1860, 1528], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Neil Sedaka: Italiano is a 1964 compilation album containing twelve of Neil Sedaka's Italian-language recordings. It was released in Italy by RCA Victor's Italiana studios. Of the twelve songs on the album, six were recorded by Sedaka in English. A seventh song on the album, \"A 16 Anni Tu Vuoi Amare\", is an Italian-language version of Andrea Carroll's 1963 hit, \"It Hurts To Be Sixteen\". Are we justified in saying that \"The compilation album Neil Sedaka: Italiano was released in nineteen hundred sixty five.\"? Yes, no, or maybe? No\n###\nThe church St. Ulrich is a Roman Catholic parish church in Neubau, the 7th district of Vienna, Austria. The official name is \"Pfarrkirche hl. Ulrich und Maria Trost \" (Parish church of St. Ulrich and Mary's consolation), it is also known as Ulrichskirche . The Baroque hall church with two towers was built in 1721. It is consecrated to St. Ulrich and St. Mary. Are we justified in saying that \"The Baroque hall church with two towers was built 100 years after 1621.\"? Yes, no, or maybe? Yes\n###\nPetr Korda was the defending champion, but he was eliminated in the third round by Todd Martin.
Yevgeny Kafelnikov won the title, defeating Thomas Enqvist in the final, 4\u20136, 6\u20130, 6\u20133, 7\u20136. With this win, Kafelnikov became the first Russian (male or female) to win the Australian Open. Are we justified in saying that \"Thomas Enqvist has played in the final of the Australian Open.\"? Yes, no, or maybe? Yes\n###\nGlaiza Herradura-Agullo (born February 24, 1978) is a Filipino former child actress. She was the first-ever grand winner of the Little Miss Philippines segment of \"Eat Bulaga!\" in 1984. She starred in RPN-9's television series \"Heredero\" with Manilyn Reynes and Richard Arellano. She won the 1988 FAMAS Best Child Actress award for her role in \"Batas Sa Aking Kamay\" starring Fernando Poe, Jr.. Are we justified in saying that \"Glaiza Herradura-Agullo suffers from diabetes.\"? Yes, no, or maybe? Maybe\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award. Are we justified in saying that \"It took workers a total of forty four years to complete the construction of the Panama Canal. \"? Yes, no, or maybe?", "doc_id": 461, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [29902, 15172, 2660, 17521], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Thameslink and Great Northern are the brand names used by the Govia Thameslink Railway train operating company on the Thameslink and Great Northern routes of the Thameslink, Southern and Great Northern franchise, previously operated by First Capital Connect. Are we justified in saying that \"One of the names starts with an N\"? Yes, no, or maybe? Yes\n###\nThe 2008\u201309 season was Aston Villa's 134th professional season; their 98th season in the top-flight and their 21st consecutive season in the top flight of English football, the Premier League. They were managed by Martin O'Neill \u2013 in his third season since replacing David O'Leary. The 2008\u201309 season was the first spell in European competition for O'Neill, and the first for Villa, in 6 seasons. Are we justified in saying that \"Aston Villa played in the premier league\"? Yes, no, or maybe? Yes\n###\nThe 2011\u201312 Seattle Redhawks men's basketball team represented the Seattle University in the 2011\u201312 college basketball season. This was head coach Cameron Dollar's 3rd season at Seattle U. The Redhawks played their home games at KeyArena as Independent members of Division I. They finished 12\u201315 overall. Are we justified in saying that \"2011-2012 was the Redhawks worst year\"? Yes, no, or maybe? Maybe\n###\nCapX is a British online news website and aggregator founded by the Centre for Policy Studies, and features columnists and contributors such as Tim Montgomerie, Daniel Hannan and V. S. Naipaul. The site offers original content and aggregated news and blogs, and features opinion on politics, economics, and business issues. 
Are we justified in saying that \"The Centre for Policy Studies owns CapX and is a british institution. \"? Yes, no, or maybe? Maybe\n###\nBoy Meets Girl is an ITV comedy-drama television miniseries starring Rachael Stirling and Martin Freeman. In the show, Danny Reed (Freeman) is struck by lightning. When he wakes up from the attack, he is inside the body of a woman, fashion journalist Veronica Burton (Stirling). Written by David Allison, the series began on 1 May 2009. Are we justified in saying that \"Boy Meets Girl, which was written by David Allison, ended when Danny Reed was struck by lightning.\"? Yes, no, or maybe?", "doc_id": 259, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7981, 16064, 35219, 11272], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"The Ones Who Walk Away from Omelas\" is a 1973 plotless, short, descriptive work of philosophical fiction, though popularly classified as a short story, by American writer Ursula K. Le Guin. With deliberately both vague and vivid descriptions, the narrator depicts a summer festival in the utopian city of Omelas, whose prosperity depends on the perpetual misery of a single child. Are we justified in saying that \"The Ones Who Walk Away from Omelas is critically acclaimed.\"? Yes, no, or maybe? Maybe\n###\nMate Pavi\u0107 (born 4 July 1993) is a Croatian professional tennis player specialising in doubles. Mate won the 2016 US Open mixed doubles title in partnership with Laura Siegemund, and reached the 2017 Wimbledon Championships men's doubles finals partnering Oliver Marach. Are we justified in saying that \"Mate Pavi\u0107 was born in 1994\"? Yes, no, or maybe? No\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke. Are we justified in saying that \"The Candidate was the turning point in Jack Shephard's career.\"? Yes, no, or maybe? Maybe\n###\nPrinceton Junction is a railroad station in Princeton Junction, New Jersey, located in West Windsor Township. It serves NJ Transit (NJT) and Amtrak on the Northeast Corridor (NEC), and NJ Transit on the Princeton Branch. The station's Amtrak station code is PJC. Are we justified in saying that \"Princeton Junction is a railroad in North Carolina.\"? Yes, no, or maybe? No\n###\nThe 1999 Acura Classic \u2013 Doubles was the doubles event of the twentieth edition of the third tournament in the US Open Series. Martina Hingis and Natasha Zvereva were the defending champions but Hingis did not compete this year. Zvereva played with Mary Pierce, and they were defeated in the first time by Cara Black and Irina Selyutina. Are we justified in saying that \"The 1999 Acura Classic \u2013 Doubles was the doubles event of the third edition of the twentieth tournament in the US Open Series. \"? 
Yes, no, or maybe?", "doc_id": 116, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [21051, 32129, 24146, 30069], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "John Cameron Urschel (born June 24, 1991) is a Canadian mathematician and retired professional American football guard and center. He played college football at Penn State and was drafted by the Baltimore Ravens in the fifth round of the 2014 NFL Draft. Urschel played his entire NFL career with Baltimore before announcing his retirement on July 27, 2017, at 26 years old. Are we justified in saying that \"John Cameron Urschel (born June 24, 11991) is a Canadian mathematician and retired professional American football guard and center.\"? Yes, no, or maybe? No\n###\nThe 8.8 cm Flak 18/36/37/41 (commonly called the eighty-eight) was a German 88 mm anti-aircraft and anti-tank artillery gun from World War II. It was widely used by Germany throughout the war, and was one of the most recognized German weapons of that conflict. Development of the original model led to a wide variety of guns. Are we justified in saying that \"The 8.8 cm Flak was invented after World War II\"? Yes, no, or maybe? No\n###\nThe Enrolled Missouri Militia was a state militia organization of Missouri in 1862 during the American Civil War. It was a part-time force whose primary purpose was to serve as garrison and infrastructure guards, both to augment the Unionist Missouri State Militia in defense versus raids and to free the Missouri State Militia for offensive operations versus Confederate guerrillas and recruiters. Are we justified in saying that \"The Missouri Militia have killed hundreds of people.\"? Yes, no, or maybe? Maybe\n###\nMcColo was a San Jose-based web hosting service provider. In late 2008, the company was shut down by two upstream providers, Global Crossing and Hurricane Electric, because a significant amount of malware and botnets had been trafficking from the McColo servers. Are we justified in saying that \"McColo was run by hackers\"? Yes, no, or maybe? Maybe\n###\nMurder of the Universe is the tenth studio album by Australian psychedelic rock band King Gizzard & the Lizard Wizard. It was released on 23 June 2017 by Flightless Records in Australia, ATO Records in the United States, and Heavenly Recordings in the United Kingdom. It is the second of five albums set to be released in 2017. Are we justified in saying that \"Murder of the Universe directly preceded the 11th album\"? Yes, no, or maybe?", "doc_id": 264, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [40057, 12957, 19850, 11803], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Game Plan was a pinball manufacturer that produced pinball tables from 1978 to 1985. Game Plan was a subsidiary of AES Technology Systems and was located in Elk Grove Village, Illinois. Game Plan's president was former Chicago Coin table designer Wendell McAdams. Are we justified in saying that \"Game Plan will continue to make pinball machines.\"? Yes, no, or maybe? No\n###\nBernard Taylor, CBE (born 1934 in Wiltshire, England) is a British author of horror, suspense and romantic fiction and of true-crime non-fiction. He has written several plays for the theatre, and has also written for television and radio. He has more recently written novels under the pseudonym Jess Foley. Are we justified in saying that \"Bernard Taylor was an author and businessman\"? Yes, no, or maybe? Maybe\n###\nSwift Rivers is a children's historical novel by Cornelia Meigs. Set initially in 1835 in Minnesota, it is a story of the early days of the logging industry, when logs were floated down the Mississippi to St. Louis. The novel, illustrated by Forrest W. Orr, was first published in 1931 and was a Newbery Honor recipient in 1933. Are we justified in saying that \"Cornelia Meigs illustrated Swift Rivers in 1931.\"? Yes, no, or maybe? No\n###\nJohn Robert Gamble (born 1948) is a former professional baseball shortstop. He played in 13 games in two seasons for the Detroit Tigers of Major League Baseball. He was drafted in the 2nd round of the 1966 Major League Baseball Draft by the Los Angeles Dodgers and acquired by the Tigers in the 1970 Rule V Draft. Are we justified in saying that \"John Robert Gamble no lomger plays\"? Yes, no, or maybe? No\n###\nJunun is a 2015 album by the Israeli composer Shye Ben Tzur, the English composer and Radiohead guitarist Jonny Greenwood, and the Indian ensemble the Rajasthan Express. It was produced by Greenwood and recorded, mixed, and engineered by Radiohead producer Nigel Godrich. Are we justified in saying that \"Individuals from three different musical groups came together and worked on the song June\"? Yes, no, or maybe?", "doc_id": 605, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [20240, 44884, 43480, 14606], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Mean Girls 2 is a 2011 American teen comedy television film directed by Melanie Mayron. It is a stand-alone sequel to the 2004 film \"Mean Girls\". The film premiered on ABC Family on January 23, 2011. The film stars Meaghan Martin, Jennifer Stone, Maiara Walsh, Nicole Gale Anderson, Claire Holt, and Diego Boneta. Tim Meadows reprises his role as Principal Ron Duvall from the original film. Are we justified in saying that \"Mean Girls 2 was well received by critics.\"? Yes, no, or maybe? 
Maybe\n###\nThe Hyundai Xcent is an A-segment sedan by Hyundai Motor Company. Based on the Hyundai Grand i10, the Xcent is manufactured by Hyundai Motor India Limited in Chennai. It made its debut on February 4, 2014, three days ahead of its world premiere at the Auto Expo 2014. Are we justified in saying that \"Two Thousand Sixteen was the year that the Hyundai Xcent debuted. \"? Yes, no, or maybe? No\n###\nThe Sydney/Melbourne Express was an overnight intercapital passenger train service that operated between the Australia's largest two cities, Sydney and Melbourne, between August 1986 and November 1993. Operated jointly by State Rail Authority and V/Line the name depended on the direction of travel, with the train nicknamed the 'Sex' or 'Mex'. Are we justified in saying that \"The Sydney/Melbourne Express was operated by 2 different entities\"? Yes, no, or maybe? Yes\n###\nVirginia's Eleventh Congressional District is a U.S. congressional district in the Commonwealth of Virginia. The district stretches from Herndon to Quantico, comprising most of Fairfax County, all of the city of Fairfax, and part of eastern Prince William County. The residents of the 11th district are represented by Democrat Gerry Connolly. Are we justified in saying that \"Virginia is the eleventh biggest congressional district\"? Yes, no, or maybe? Maybe\n###\nThe 44th Filmfare Awards were held on February 21, 1999, in Mumbai India. Karan Johar's directorial debut \"Kuch Kuch Hota Hai\" dominated the awards, including a sweep of the major acting categories. Ram Gopal Varma's \"Satya and Mani Ratnam's \"Dil Se..\" were the other big winners. Are we justified in saying that \"Kuch Kuch Hota Hai didn't win all the categories in The 44th Filmfare Awards.\"? Yes, no, or maybe?", "doc_id": 589, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38045, 40221, 23330, 26101], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Frank Viola is an American author, speaker, and blogger on Christian topics. His work focuses on Jesus studies and biblical narrative, with a strong emphasis on helping the poor and the oppressed. He is most noted for his emphasis on the centrality and supremacy of Jesus Christ. Are we justified in saying that \"His work's concentrates on Mary.\"? Yes, no, or maybe? No\n###\nThe Winter Hill air disaster occurred on 27 February 1958 when the Silver City Airways Bristol 170 Freighter \"G-AICS\", traveling from the Isle of Man to Manchester, England, crashed into Winter Hill (also known as Rivington Moor) several hundred yards away from the Independent Television Authority's Winter Hill transmitting station. Are we justified in saying that \"The disaster occured in February\"? Yes, no, or maybe? Yes\n###\nMarks was a manor house located near Marks Gate at the northern tip of the London Borough of Barking and Dagenham in London, England, the house standing on what is now Warren Hall Farm. The name Marks (historically Markes) is believed to have been derived from the \"de Merk\" family who build the original manor in the 14th Century. 
The manor house was demolished in 1808 Are we justified in saying that \"Marks has gold .\"? Yes, no, or maybe? Maybe\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys. Are we justified in saying that \"Scott Edward Morriss was born in 1979\"? Yes, no, or maybe? No\n###\n\"Eternally\" is a song with music by Charles Chaplin, and words by the English lyricists Geoff Parsons and John Turner. The music was initially composed for Charles Chaplin's film \"Limelight\" (1952) titled \"Terry's Theme\"; the film won an Oscar for \"Best Original Dramatic Score\" at the Are we justified in saying that \"The words to Eternally were written partially by Geoff Parsons\"? Yes, no, or maybe?", "doc_id": 2, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [44906, 42817, 37592, 25674], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Strangers is an American country band best known as the back-up band for singer-songwriter Merle Haggard. Formed in 1965 in Bakersfield, California, United States, the band continued to tour with original co-founding member Norman Hamlet, as well as Haggard's children Dana and Ben. Are we justified in saying that \"Merle Haggard had two daughters.\"? Yes, no, or maybe? No\n###\nThe Outsiders are a professional wrestling tag team consisting of Kevin Nash and Scott Hall, best known for their first appearances in World Championship Wrestling (WCW) in 1996. They later teamed also in the World Wrestling Federation (WWF), Total Nonstop Action Wrestling (TNA), and Pro Wrestling ZERO1-MAX. Are we justified in saying that \"The Outsiders don't get paid\"? Yes, no, or maybe? Maybe\n###\nCorn smut is a plant disease caused by the pathogenic fungus Ustilago maydis that causes smut on maize and teosinte. The fungus forms galls on all above-ground parts of corn species, and is known in Mexico as the delicacy huitlacoche; which is eaten, usually as a filling, in quesadillas and other tortilla-based foods, and soups. Are we justified in saying that \"Corn smut is something fed to toddlers and kids\"? Yes, no, or maybe? Maybe\n###\nSilent Scream (originally known as The Retreat) is an independent, horror film directed by Matt Cantu and Lance Kawas and starring Scott Vickaryous, Melissa Schuman and Shanti Lowry. It premiered at the Chicago Horror Film Festival on October 28, 2005 and was released on DVD on December 5, 2006. Are we justified in saying that \"Silent Scream was too scary for kids\"? Yes, no, or maybe? Maybe\n###\nWooden Leather is the second studio album by Kentucky-based rap sextet band Nappy Roots, to their first album \"Watermelon, Chicken & Gritz\". It was released on August 26, 2003 and featured singles \"Roun' the Globe\", and \"Sick and Tired\" (featuring Anthony Hamilton). Are we justified in saying that \"The first single features Anthony Hamilton.\"? 
Yes, no, or maybe?", "doc_id": 759, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [2081, 5897, 20361, 8937], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Whitechapel murders were committed in or near the impoverished Whitechapel district in the East End of London between 3 April 1888 and 13 February 1891. At various points some or all of these eleven unsolved murders of women have been ascribed to the notorious unidentified serial killer known as Jack the Ripper. Are we justified in saying that \"Nobody died in East End in 1888\"? Yes, no, or maybe? No\n###\nThe Magic Roundabout (known in the original French as \"Le Man\u00e8ge enchant\u00e9\") was a French-British children's television programme created in France in 1963 by Serge Danot, with the help of Ivor Wood and Wood's French wife, Josiane. The series was originally broadcast from 1964 to 1974 on ORTF (Office de Radiodiffusion T\u00e9l\u00e9vision Fran\u00e7aise). Are we justified in saying that \"Danot and Wood begged for the help of Josiane to create The Magic Roundabout.\"? Yes, no, or maybe? Maybe\n###\nRudbeckia hirta, commonly called black-eyed-Susan, is a North American flowering plant in the sunflower family, native to Eastern and Central North America and naturalized in the Western part of the continent as well as in China. It has now been found in all 10 Canadian Provinces and all 48 of the states in the contiguous United States. Are we justified in saying that \"Rudbeckia hirta is a very popular plant with generation Z\"? Yes, no, or maybe? Maybe\n###\nPavel Sergeyevich Alexandrov (Russian: \u041f\u0430\u0301\u0432\u0435\u043b \u0421\u0435\u0440\u0433\u0435\u0301\u0435\u0432\u0438\u0447 \u0410\u043b\u0435\u043a\u0441\u0430\u0301\u043d\u0434\u0440\u043e\u0432 ), sometimes romanized Paul Alexandroff or Aleksandrov (7 May 1896 \u2013 16 November 1982), was a Soviet mathematician. He wrote about three hundred papers, making important contributions to set theory and topology. Are we justified in saying that \"Pavel Sergeyevich Alexandrov wrote mostly about topology and set theory.\"? Yes, no, or maybe? Maybe\n###\nThe Substitute is a 1993 American television film directed by Martin Donovan, written by David S. Goyer under his pseudonym Cynthia Verlaine, and is also Mark Wahlberg's first acting role and credited as \"Marky Mark\", due to his successful hip hop career. Are we justified in saying that \"David S. Goyer is married to Cynthia Verlaine\"? Yes, no, or maybe?", "doc_id": 816, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [6277, 18348, 17319, 36884], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Resorts Casino Tunica, formerly Southern Belle Casino and Harrah's Tunica Mardi Gras Casino, is a 201-room hotel and a 35000 sqft casino located in Tunica Resorts, Mississippi. It is one of three casinos located in the \"Casino Strip\" area, along with Sam's Town and Hollywood Casino. Resorts is owned by Gaming & Leisure Properties and operated by Penn National Gaming. Are we justified in saying that \"Sam's Town is located in the Casino Strip.\"? Yes, no, or maybe? Yes\n###\nStanley Frederick Steele (born 5 January 1937) is an English former footballer. A half-back and inside-forward, he scored 97 goals in 370 league and cup games for Port Vale between 1955 and 1968. His parents named him Stanley Frederick in honour of Stanley Matthews and Freddie Steele. Are we justified in saying that \"Stanley Frederick Steele was born in an english hospital\"? Yes, no, or maybe? Maybe\n###\nHabib (Habibollah) Elghanian (Persian: \u062d\u0628\u06cc\u0628 (\u062d\u0628\u06cc\u0628\u200c\u0627\u0644\u0644\u0647) \u0627\u0644\u0642\u0627\u0646\u06cc\u0627\u0646\u200e \u200e , 5 April 1912 \u2013 9 May 1979) was a prominent Iranian Jewish businessman and philanthropist who served as the president of the Tehran Jewish Society and acted as the symbolic head of the Iranian Jewish community in the 1970s. Are we justified in saying that \"Habib studied business administration while in college\"? Yes, no, or maybe? Maybe\n###\nThe Santa Cova Funicular (Catalan: \"Funicular de la Santa Cova\" ) is a funicular railway at Montserrat, near Barcelona in Catalonia, Spain. The line descends from the monastery, and the upper terminus of the Montserrat Rack Railway, on a continuous curve to a lower station that gives access, via a path, to Santa Cova, a shrine lower down the mountain. Are we justified in saying that \"The Santa Cova Funicular is a very unpopular railway\"? Yes, no, or maybe? Maybe\n###\nLexington County is a county located in the U.S. state of South Carolina. As of the 2010 census, the population was 262,391, and the 2016 population estimate was 286,186. Its county seat and largest town is Lexington. The county was created in 1785. Its name commemorates the Battle of Lexington in the American Revolutionary War. Are we justified in saying that \"Lexington County's population grew between 2010 and 2016.\"? Yes, no, or maybe?", "doc_id": 438, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14296, 35768, 18261, 35656], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Julia Goldani Telles (born March 18, 1995) is an American actress and ballet dancer. She is best known for her supporting role as Whitney Solloway on the Showtime original series \"The Affair\" and as Sasha Torres on the short-lived ABC Family series \"Bunheads\". 
Are we justified in saying that \"Julia Goldani Telles was not in the \"Bunheads\" for a long time.\"? Yes, no, or maybe? Yes\n###\nLeventhorpe Academy is a mixed, 11-19 secondary school and sixth form in the historic market town of Sawbridgeworth, Hertfordshire. The school became a business and Enterprise Academy in August 2011. The intake at age 11 is drawn mainly from the pleasant and prosperous towns of Sawbridgeworth and Bishop's Stortford and from the surrounding villages. Are we justified in saying that \"Leventhorpe Academy is an 11-19 secondary school.\"? Yes, no, or maybe? Yes\n###\nAlix Bancourt is a fashion blogger who goes by the name The Cherry Blossom Girl. She is based in Paris, France. The title of Alix's blog, The Cherry Blossom Girl, comes from the French band Air's song \"Cherry Blossom Girl\". Her blog is written in French and translated to English by Victoria Morrison. Are we justified in saying that \"Alix Bancourt doesn't have nicknames.\"? Yes, no, or maybe? No\n###\nAnime Speed is a megamix compilation album of \"Dancemania\"'s \"Speed\" series, released by EMI Music Japan in 2005. The album features uptempo cover remixes of popular theme songs for various anime works such as \"Dragon Ball Z\", \"Slam Dunk\" and \"Neon Genesis Evangelion\". The successor, \"Anime Speed Newtype Edition\", was released in 2006. Are we justified in saying that \"Anime Speed and Anime Speed Newtype Edition are the only two albums to have featured anime music in 2005 and 2006.\"? Yes, no, or maybe? Maybe\n###\n\"Look at My Dab\" (originally titled \"Bitch Dab\") is a song by American hip hop group Migos. It was released as a single on October 30, 2015 by Quality Control Entertainment and 300 Entertainment. The song was included on their mixtape \"Back to the Bando\" (2015) and was produced by Drumma Boy. It peaked at number 87 on the US \"Billboard\" Hot 100 chart. Are we justified in saying that \"Migos released a song on the eve of Halloween\"? Yes, no, or maybe?", "doc_id": 763, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24130, 11680, 3279, 5838], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Recently extinct mammals are any mammal that went extinct since the year 1500 C. E., as defined by the International Union for Conservation of Nature (IUCN). Strong scientific records show that since the year 1500 roughly 80 mammal species have become extinct. Are we justified in saying that \"Zero mammals have gone extinct since the 1500s\"? Yes, no, or maybe? No\n###\nCarl Filip Anton Forsberg (] ; born 13 August 1994) is a Swedish professional ice hockey player. He is an alternate captain for the Nashville Predators of the National Hockey League (NHL). Forsberg was selected by the Washington Capitals in the first round (11th overall) of the 2012 NHL Entry Draft. Are we justified in saying that \"Filip Forsberg is a Swedish professional ice hockey player that has been the best Swede player in the NHL.\"? Yes, no, or maybe? 
Maybe\n###\nAmor a la Mexicana (English: \"Mexican-style love\") is the fifth studio album by Mexican singer Thal\u00eda. The album has been regarded as one of the singer's best releases to date, especially for the album's first single \"Amor a la Mexicana\", which showcased the evolution of Thal\u00eda as a musician. Are we justified in saying that \"Thal\u00eda received much praise for her fifth studio album.\"? Yes, no, or maybe? Yes\n###\nThe History Boys is a 2006 British comedy-drama film adapted by Alan Bennett from his play of the same name, which won the 2005 Olivier Award for Best New Play and the 2006 Tony Award for Best Play. It was directed by Nicholas Hytner, who directed the original production at the Royal National Theatre in London, and features the original cast of the play. Are we justified in saying that \"The History Boys in a 2006 American comedy-drama.\"? Yes, no, or maybe? No\n###\nUnlike a charitable foundation, a private foundation does not generally solicit funds from the public. And a private foundation does not have the legal requirements and reporting responsibilities of a registered, non-profit or charitable foundation. Not all foundations engage in philanthropy: some private foundations are used for estate planning purposes. Are we justified in saying that \"Foundations follow legal requirements.\"? Yes, no, or maybe?", "doc_id": 444, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [34022, 22917, 4206, 22425], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Jos\u00e9 Celestino Mutis botanical garden is Colombia's biggest botanical garden. It serves both as a recreation and research center with an emphasis on Andean and P\u00e1ramo ecosystems. The garden is located in Bogot\u00e1 and features plants from every Colombian altitude, climate and region. It was founded in 1955, in honor of botanist and astronomer Jose Celestino Mutis. Are we justified in saying that \"The garden was tributed in the memory of a botanist.\"? Yes, no, or maybe? Yes\n###\n54-40 is a Canadian alternative rock group from Vancouver, British Columbia, who take their name from the slogan Fifty-Four Forty or Fight!, coined to express the unsuccessful expansionist agenda of James K. Polk's presidency, intent upon controlling a contested U.S.-Canada border area in the Oregon boundary dispute. Are we justified in saying that \"The group writes disco songs\"? Yes, no, or maybe? No\n###\nClub Deportivo D\u00e9nia is a Spanish football team based in D\u00e9nia, in the autonomous community of Valencia. Founded in 1927 it plays in Divisiones Regionales de F\u00fatbol in the Valencian Community, holding home games at \"Estadio Diego Mena Cuesta\", with a capacity of 3.000 seats. Are we justified in saying that \"The club was founded in Spain.\"? Yes, no, or maybe? Yes\n###\nHundreds of ancient stone religious monuments lie on the island of Java. Known as \"candi\" in Indonesian, they date from the early classical period of Javanese civilisation, beginning in the first part of the 8th century CE and ending after 900 CE. 
The majority were built between 780 CE and 860 CE, even though the civilisation that created them existed for many centuries. Are we justified in saying that \"Hundreds of ancient stone religious monuments lie on the island of Java. There were too many of them.\"? Yes, no, or maybe? Maybe\n###\nMaastricht (] ; Limburgish : \"Mestreech\" ; French: \"Maestricht\" ; Spanish: \"Mastrique\" ) is a city and a municipality in the southeast of the Netherlands. It is the capital and largest city of the province of Limburg, as well as the largest city in the historical duchy of Limburg, that today spans the Netherlands and Belgium. Are we justified in saying that \"Maastricht is the capital city of the Netherlands.\"? Yes, no, or maybe?", "doc_id": 134, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [31445, 43709, 15059, 16829], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "\"Sunny Sundae Smile\" is a song by the alternative rock band My Bloody Valentine. It was released as a non-album single in February 1987 on Lazy Records. Recorded at Alaska Studios in London, \"Sunny Sundae Smile\" was the band's first release on Lazy Records and the final release to feature original vocalist David Conway. Are we justified in saying that \"The song Sunny Sundae Smile was not apart of an album\"? Yes, no, or maybe? Yes\n###\nDobbs Ferry is a village in Westchester County, New York. The population was 11,093 at the 2016 census. The Village of Dobbs Ferry is located in, and is a part of, the town of Greenburgh. The village ZIP code is 10522. Most of the Village falls into the boundaries of the Dobbs Ferry Union Free School District. Are we justified in saying that \"Some of Dobbs Ferry is not in the boundaries of Dobbs Ferry Union Free School District\"? Yes, no, or maybe? Yes\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress. Are we justified in saying that \"He has held the position of the Tibetan Youth Congress since September 2007.\"? Yes, no, or maybe? Yes\n###\nOak Furniture Land is a privately owned British furniture retailer of fully assembled hardwood cabinetry furniture, sofas, beds and mattresses for bedrooms, living rooms, dining rooms, nurseries and small office/home offices. The company has 74 stores across the UK, and its headquarters in Swindon in Wiltshire, England. Are we justified in saying that \"Oak Furniture Land has 70 stores across the UK\"? Yes, no, or maybe? No\n###\nRobert Cary Blanchard (November 5, 1968 \u2013 September 6, 2016) was an American football placekicker in the National Football League. He played eight years for five teams: the New York Jets for his first two years, the Indianapolis Colts after taking 1994 off, the Washington Redskins in 1998, the New York Giants in 1999, and the Arizona Cardinals in his final season. 
Are we justified in saying that \"Blanchard made field goals to multiple teams\"? Yes, no, or maybe?", "doc_id": 667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [42666, 20501, 35409, 730], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Inferno (also released with the title, Operation Cobra) is a 1997 feature film directed by Fred Olen Ray starring Don Wilson, Deepti Bhatnagar and R. Madhavan. Evan Lurie, Michael Cavanaugh and Tan\u00e9 McClure appear in other pivotal roles. Wilson plays the role of Interpol agent Kyle Connors on a mission set in India. Are we justified in saying that \"The setting of Inferno takes place in India \"? Yes, no, or maybe? Yes\n###\nGeorge White's Scandals is a 1934 American musical film directed by George White and written by Jack Yellen. The film stars Rudy Vall\u00e9e, Jimmy Durante, Alice Faye, Adrienne Ames, Gregory Ratoff, Cliff Edwards and Dixie Dunbar. The film was released on March 16, 1934, by Fox Film Corporation. Are we justified in saying that \"George White is a slow man\"? Yes, no, or maybe? Maybe\n###\nHealth For All is a programming goal of the World Health Organization (WHO), which envisions securing the health and well being of people around the world that has been popularized since the 1970s. It is the basis for the World Health Organization's primary health care strategy to promote health, human dignity, and enhanced quality of life. Are we justified in saying that \"The WHO is a wealthy organization\"? Yes, no, or maybe? Maybe\n###\nThe Emperor: Owner of the Mask () is a South Korean television series starring Yoo Seung-ho, Kim So-hyun, Kim Myung-soo, Yoon So-hee, Heo Joon-ho and Park Chul-min. It aired on MBC every Wednesday and Thursday at 22:00 (KST) from May 10, 2017 for 40 episodes. Are we justified in saying that \"The Emperor: Owner of the Mask stars went on to other projects after the series was over.\"? Yes, no, or maybe? Maybe\n###\nYahy\u0101 ibn Kh\u0101lid (Arabic: \u064a\u062d\u064a\u0649 \u0628\u0646 \u062e\u0627\u0644\u062f\u200e \u200e ) (died 806\u00a0CE ) was a member of the powerful Persian Barmakids family, son of Khalid ibn Barmak. Around 765, he was appointed to Azerbaijan by the Caliph Al-Mansur. Yahya's son Fadl ibn Yahya was born at Ar-Reiy, at the same time as Caliph al-Mahdi's son Harun. Al-Mahdi entrusted Yahya in 778 with Harun's education. Are we justified in saying that \"Khalid was born in 730 CE. \"? Yes, no, or maybe?", "doc_id": 338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [16516, 185, 5412, 18337], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Sky Television plc was a public limited company which operated a nine-channel satellite television service, launched by Rupert Murdoch's News International on 5 February 1989. Sky Television and its rival British Satellite Broadcasting suffered large financial losses and merged on 2 November 1990 to form British Sky Broadcasting (BSkyB). Programming merger took effect on 1 December 1990. Are we justified in saying that \"Sky Television was seen by Obama.\"? Yes, no, or maybe? Maybe\n###\nThe Little League World Series took place between August 22 and August 27 in Williamsport, Pennsylvania. Westbury American Little League of Houston, Texas defeated American Little League of West New York, New Jersey in the championship game of the 20th Little League World Series. Are we justified in saying that \"THe Little League World series takes place end of August.\"? Yes, no, or maybe? Yes\n###\nSing A to Z is the tenth album by popular children's entertainers Sharon, Lois & Bram, originally released in 1990. This album, like many other Sharon, Lois & Bram albums has been re-released many times. It is rumored that the idea for this album came from Lois when she and Sharon were window shopping and came across an alphabet quilt on display. Are we justified in saying that \"Sing A to Z was released by Metallica\"? Yes, no, or maybe? No\n###\nAlice Sue Claeys (born February 24, 1975) is a former competitive figure skater. Representing Belgium, she won silver at the 1992 Skate Canada International and finished in the top ten at three ISU Championships \u2014 the 1992 World Junior Championships (4th), the 1992 World Championships (7th), and the 1993 European Championships (8th). Are we justified in saying that \"Alice Sue Claeys enjoys skiing\"? Yes, no, or maybe? Maybe\n###\nLeavitt Peak is located in the Emigrant Wilderness near Sonora Pass in the eastern Sierra Nevada range of California. Leavitt Peak is located on the Tuolumne County - Mono County line. The Pacific Crest Trail runs close to the east of Leavitt Peak, at an elevation of about 10800 ft elevation. The peak offers views south to Yosemite National Park and north towards South Lake Tahoe. Are we justified in saying that \"Leavitt Peak is not covered by vegetation\"? Yes, no, or maybe?", "doc_id": 391, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [41492, 36632, 18423, 44194], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "International Cycling Classic, also known as the Point Premium Root Beer or simply SuperWeek, was a 17-race series over 17 days open to licensed amateur and professional cyclists. The series took place primarily in the area surrounding Milwaukee, Wisconsin. Are we justified in saying that \"All 17 days had one race take place.\"? Yes, no, or maybe? 
Yes\n###\nGettin' Out the Good Stuff is the second album released by American country music artist David Lee Murphy. The tracks \"Every Time I Get Around You\" and \"The Road You Leave Behind\" were both Top 5 hits on the U.S. \"Billboard\" Hot Country Singles & Tracks charts in 1996. \"Genuine Rednecks\" and \"Breakfast in Birmingham\" were released as well, although neither reached Top 40 on the country charts. Are we justified in saying that \"In 1996 David Lee Murphy had 2 tracks that landed on the Top 5 hits in the US Billboard.\"? Yes, no, or maybe? Yes\n###\nZale Dalen is a Canadian film and television director. He is best known for the 1980 film \"The Hounds of Notre Dame\", for which he garnered a Genie Award nomination for Best Director at the 2nd Genie Awards in 1981, and the cult films \"Skip Tracer\" (1977) and \"Terminal City Ricochet\" (1990). Are we justified in saying that \"Zale Dalen is a Canadian television director\"? Yes, no, or maybe? Yes\n###\nPort Melbourne is an inner suburb of Melbourne, Australia, 5\u00a0km south-west from Melbourne's Central Business District. It is split between the local government areas of Melbourne and Port Phillip. The area to the north of the West Gate Freeway is in the City of Melbourne. The area to the south is in the City of Port Phillip. At the 2011 Census, Port Melbourne had a population of 14,521. Are we justified in saying that \"At the 2010 Census, Port Melbourne had a population of 14,521.\n\"? Yes, no, or maybe? Maybe\n###\nTom\u00e1s Nistal Fern\u00e1ndez (born 31 August 1948) is a former road cyclist from Spain. He was a professional cyclist from 1969 to 1977. He represented his native country at the 1972 Summer Olympics in Munich, West Germany, where he finished in 54th place in the men's individual road race. Are we justified in saying that \"Tom\u00e1s Nistal Fern\u00e1ndez has won races.\"? Yes, no, or maybe?", "doc_id": 543, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14292, 28059, 33778, 25156], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sound and the Fury is an American drama film directed by James Franco. It is the second film version of the novel of the same name by William Faulkner. The previous adaptation, directed by Martin Ritt, was released in 1959. The film was released in a limited release and through video on demand on October 23, 2015, by New Films International. Are we justified in saying that \"The film was available worldwide.\"? Yes, no, or maybe? Maybe\n###\nKnightriders, also known as George A. Romero's Knightriders, is a 1981 American drama film written and directed by George A. Romero and starring Ed Harris, Gary Lahti, Tom Savini, Amy Ingersoll, Patricia Tallman and Ken Foree. It was filmed entirely on location in the Pittsburgh metro area, with major scenes in suburban Fawn Township and Natrona. Are we justified in saying that \"Knightriders was filmed on location in Pittsburgh metro area and debuted in 1981.\"? Yes, no, or maybe? 
Yes\n###\nScott Edward Morriss (born 10 October 1973 in Wandsworth, London) is an English bass player and illustrator, best known as a member of The Bluetones. He is the younger brother of frontman Mark Morriss, with whom he also played with as backing group, The Mummys. Are we justified in saying that \"Mark Morriss, Scott's older brother, was born before the year of nineteen hundred and seventy three.\"? Yes, no, or maybe? Yes\n###\nThe Cuban Embassy in Washington, DC, is the diplomatic mission of Cuba to the United States of America. It is located at 2630 16th Street Northwest, Washington, D.C., in the Adams Morgan neighborhood. The building was originally constructed in 1917 as the Cuban embassy, and served in that capacity until the United States severed relations with Cuba in 1961. Are we justified in saying that \"The Cuban Embassy held a staff of 20 during its operations.\"? Yes, no, or maybe? Maybe\n###\nClearance Giddens is an African American Elvis impersonator from Melfa, Virginia, who has been billed as the \"Black Elvis\". He has appeared on the \"The Arsenio Hall Show\" and the \"Geraldo Show\", and in the film \"Honeymoon in Vegas\". In the early 1990s, he also sang on stage in a duet with Jimmy Buffett singing \"Jailhouse Rock\". He is listed in the book \"I Am Elvis: A Guide to Elvis Impersonators\". Are we justified in saying that \"Clearance Giddens has a q.\"? Yes, no, or maybe?", "doc_id": 762, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Yes", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [24192, 35558, 1578, 25869], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Gary Lynn \"Sudsy\" Sutherland (born September 27, 1944) is a former American baseball player. He played college baseball at the University of Southern California and later played 13 seasons in Major League Baseball, principally as a second baseman (717 games) and shortstop (164 games), from 1966 to 1978. Are we justified in saying that \"Gary Sutherland has 2 sons who play professional sports\"? Yes, no, or maybe? Maybe\n###\nGabriel Julio Fern\u00e1ndez Capello (born (1964--) 24, 1964 in Buenos Aires, Argentina) is a musician and composer better known by his stage name Vicentico. Co-founder and vocalist of the band Los Fabulosos Cadillacs along with Flavio Cianciarulo. He was part of the group since its creation in 1984 to the year 2001, when he began a solo career as a singer. Are we justified in saying that \"The group broke up when he left to go solo\"? Yes, no, or maybe? Maybe\n###\nDavid Tench Tonight was a short-lived television talk show created for Network Ten in Australia. The series featured David Tench, an animated fictional character, as host. The name \"Tench\" is a partial anagram created from the name Channel Ten. The actor behind the digital Tench was Australian actor Drew Forsythe. Are we justified in saying that \"The featured actor was not australian\"? Yes, no, or maybe? No\n###\nThe 2015\u201316 Dartmouth Big Green men's basketball team represented Dartmouth College during the 2015\u201316 NCAA Division I men's basketball season. 
The Big Green, led by sixth-year head coach Paul Cormier, played their home games at Leede Arena in Hanover, New Hampshire and were members of the Ivy League. The Big Green finished the season 10\u201318, 4\u201310 in Ivy League play to finish in sixth place. Are we justified in saying that \"The players of The Big Green had terrible grades in college\"? Yes, no, or maybe? Maybe\n###\nHigh Noon Toons was a 3-hour programming block of cartoons hosted by two cowboy hand puppets named Haas and Lil' Jo (a Bonanza pun) shown on Cartoon Network in the mid-1990s. The series was made by Matt Thompson and Adam Reed, who later went on to create adult-themed cartoon series such as \"Sealab 2021\" and \"Frisky Dingo\" for Adult Swim and \"Archer\" for FX. Are we justified in saying that \"Cartoon Network first started in nineteen hundred eighty five.\"? Yes, no, or maybe?", "doc_id": 70, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [7314, 531, 18185, 23787], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Phu Quoc Ridgeback is a breed of dog from Ph\u00fa Qu\u1ed1c Island in Vietnam's southern Ki\u00ean Giang Province. The Phu Quoc Ridgeback is one of only three breeds that has a ridge of hair that runs along its back in the opposite direction from the rest of the coat (the others are Rhodesian Ridgeback and Thai Ridgeback). The Phu Quoc is the smallest of the three ridgeback breeds. Are we justified in saying that \"The Phu Quoc Ridgeback is a popular dog in Vietnam\"? Yes, no, or maybe? Maybe\n###\nThe 1902\u201303 Ottawa Hockey Club season was the club's 18th season of play. The club would win the CAHL championship in a playoff with the Montreal Victorias to win the Club's first Stanley Cup. For their win, the players would each be given a silver nugget. From that day forward, the club was nicknamed the \"Silver Seven.\" Are we justified in saying that \"Winners of the Stanley Cup in in 1903 were give a nickname related to a metal.\"? Yes, no, or maybe? Yes\n###\nThe 23rd Infantry Brigade was an infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II. In the Second World War the brigade saw active service in the Syria-Lebanon Campaign, the Western Desert Campaign and the Burma Campaign. Are we justified in saying that \"The 23rd Infantry Brigade was the only infantry brigade of the British Army that saw active service in both World War I, mainly on the Western Front, and World War II\"? Yes, no, or maybe? Maybe\n###\nThe Original Rude Girl is the second studio album by Puerto Rican reggaeton recording artist Ivy Queen released on December 15, 1998 by Sony Discos. It is the follow up studio album to Queen's debut effort \"En Mi Imperio\" released in 1997. The album includes Queen's debut single \"In The Zone\" featuring Wyclef Jean, which helped to increase the album and Queen's exposure to American audiences. Are we justified in saying that \"Wyclef Jean will perform \"In The Zone\" with Ivy Queen in December of 2019.\"? Yes, no, or maybe? 
Maybe\n###\nAniket Vishwasrao is an Indian film actor. He is best known for his work in Marathi cinema. He made his screen debut in Sudhir Mishra's \"Chameli\" and first appeared in Marathi cinema with \"Lapoon Chhapoon\" (2007). In 2011, he achieved popularity with the release of \"Fakt Ladh Mhana\". Are we justified in saying that \"Vishwasrao's first movie was in MArathi cinema\"? Yes, no, or maybe?", "doc_id": 644, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "No", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [33427, 32346, 7793, 1092], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Dostluk Spor Kul\u00fcb\u00fc is a multi-sport club established 1973 as a women's football club in Istanbul, Turkey. It is known as the country's first ever women's football club. The club also maintains the branches of basketball, swimming, tennis and volleyball to its activities. The club's colors are orange and black. \"Dostluk\" is the Turkish word for \"Friendship\". Are we justified in saying that \"Friendship translated to Turkish means Kulubu\"? Yes, no, or maybe? No\n###\nUtamaro and His Five Women or Five Women Around Utamaro (Japanese: \u6b4c\u9ebf\u3092\u3081\u3050\u308b\u4e94\u4eba\u306e\u5973 , Hepburn: Utamaro o meguru gonin no onna ) is a 1946 Japanese film directed by Kenji Mizoguchi. It is based on the novel of the same title by Kanji Kunieda, itself a fictionalized account of the life of printmaker Kitagawa Utamaro. It was Mizoguchi's first film made under the American occupation. Are we justified in saying that \"Kanji Kunieda novel was released in 1940\"? Yes, no, or maybe? Maybe\n###\nGloria Stavers (October 3, 1927 \u2013 April 1, 1983) was the editor in chief of \"16 Magazine\". Her personality gave this teen celebrity magazine its stamp for many years. Stavers is credited with being one of the first women rock and roll journalists, but male editors, detractors and those who scoffed at teen or celebrity magazines sometimes called her \"Mother Superior of the Inferior\". Are we justified in saying that \"Gloria Stavers had a bad time at work due to discrimination\"? Yes, no, or maybe? Maybe\n###\nNorth High Bridge Park is a 0.85 acre city park located on the east bank bluffs above the Mississippi River in Saint Paul, Minnesota, United States. The park is adjacent to the High Bridge and was created when the new High Bridge was finished in 1987. The park includes gardens, sculptures and an overlook of the Mississippi River. Are we justified in saying that \"North High Bridge Park was planned before the High Bridge was completed.\"? Yes, no, or maybe? Maybe\n###\nVincent Edward \"Bo\" Jackson (born November 30, 1962) is a former baseball and American football player. He is one of the few athletes to be named an All-Star in two major sports, and the only one to do so in both baseball and football. He is widely considered one of the greatest athletes of all time. Are we justified in saying that \"Many professional sports players have been named All-Star in separate sports.\"? 
Yes, no, or maybe?", "doc_id": 594, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [38616, 34468, 22319, 14944], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Donaldson Center Airport (IATA: GDC,\u00a0ICAO: KGYH,\u00a0FAA LID: GYH) is a public airport six miles (10\u00a0km) south of the central business district of Greenville, a city in Greenville County, South Carolina, United States. It located at the Donaldson Center Industrial Air Park and is owned by the City and County of Greenville. Are we justified in saying that \"Donaldson Center Airport has an area available for private planes.\"? Yes, no, or maybe? Maybe\n###\nThe Consolidated Tape Association (CTA) oversees the dissemination of real-time trade and quote information (market data) in New York Stock Exchange (NYSE) and American Stock Exchange (AMEX) listed securities (stocks and bonds). It is currently chaired by Emily Kasparov of the Chicago Stock Exchange, the first woman and the youngest chair elected to the position. Are we justified in saying that \"Emily Kasparov was not the first woman elected to chair the CTA.\"? Yes, no, or maybe? No\n###\nOn 10 September 2016, a man armed with a knife attacked another man walking his dog in Minto, a suburb of Sydney, Australia. As he stabbed the victim the accused allegedly shouted \"someone is going to die today.\" The perpetrator subsequently sought to attack police, but was arrested a short time later. Are we justified in saying that \"The man had serious mental problems.\"? Yes, no, or maybe? Maybe\n###\nTsewang Rigzin is the current president of the Tibetan Youth Congress. He has held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013. Prior to attaining his current position he served as the president of the Portland/Vancouver regional chapter of the Tibetan Youth Congress. Are we justified in saying that \"Tsewang Rigzin held the position since September 2007, and on August 8, 2008 he was re-elected to serve through August 2013\"? Yes, no, or maybe? Yes\n###\nThe Puppet Master is an album by King Diamond released in 2003. A limited edition includes a DVD on which King Diamond is telling the story of The Puppet Master. It is a concept album with a storyline telling the tale of a young couple who go to watch a puppet show in Budapest in the 1700s, and end up being turned into undead puppets by the Puppet Master and his wife. Are we justified in saying that \"King Diamond has never been to the circus.\"? Yes, no, or maybe?", "doc_id": 654, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [27119, 25883, 13751, 36050], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The X-Files Game is an interactive movie point-and-click adventure video game developed by HyperBole Studios and first published by Fox Interactive. The game was released for Microsoft Windows, Mac OS and PlayStation in 1998, and is based on the television series \"The X-Files\". A second, but unrelated game, \"\", was released for PlayStation 2 in 2004. Are we justified in saying that \"A remake of both games will be produced soon.\"? Yes, no, or maybe? Maybe\n###\nWonder Woman is a never-aired television pilot produced by Warner Bros. Television and DC Entertainment for NBC, based on the DC Comics character of the same name. David E. Kelley wrote the pilot, which was directed by Jeffrey Reiner. Adrianne Palicki starred as the main character. Are we justified in saying that \"Jeffrey Reiner's Wonder Woman was the first attempt at transitioning a DC comic to TV.\"? Yes, no, or maybe? Maybe\n###\nEldrid Nordb\u00f8 (born 12 August 1942) is a Norwegian politician for the Labour Party. She was personal secretary to the Minister of Social Affairs in 1971, state secretary to the prime minister (1986-89), and Minister of Trade and Shipping (1990-91). She is married to economist and politician Bj\u00f8rn Skogstad Aamo. Are we justified in saying that \"Eldrid Nordb\u00f8 was personal secretary to Bj\u00f8rn Skogstad Aamo.\"? Yes, no, or maybe? No\n###\nHenry Pelham Fiennes Pelham-Clinton, 4th Duke of Newcastle-under-Lyne {'1': \", '2': \", '3': \", '4': \"} (31 January 1785 \u2013 12 January 1851) was a British nobleman and politician who played a leading part in British politics in the late 1820s and early 1830s. He was styled Lord Clinton from birth until 1794 and Earl of Lincoln between 1794 and 1795. Are we justified in saying that \"Henry Pelham Fiennes Pelham-Clinton died alone\"? Yes, no, or maybe? Maybe\n###\nValan is a small coastal village on the island of Mager\u00f8ya in Nordkapp Municipality in Finnmark county in far northern Norway. Honningsv\u00e5g Airport, the local airport for the town of Honningsv\u00e5g is located in Valan. The town lies a few kilometres south of Valan. Are we justified in saying that \"Valan is inaccessible by road\"? Yes, no, or maybe?", "doc_id": 380, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [28253, 18783, 14610, 13398], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "In The Wet is a novel by Nevil Shute that was first published in the United Kingdom in 1953. It contains many of the typical elements of a hearty and adventurous Shute yarn such as flying, the future, mystic states, and ordinary people doing extraordinary things. Are we justified in saying that \"The Wet is about deep sea diving.\"? Yes, no, or maybe? 
No\n###\nJohn Wellborn Root (January 10, 1850 \u2013 January 15, 1891) was an American architect who was based in Chicago with Daniel Burnham. He was one of the founders of the Chicago School style. Two of his buildings have been designated a National Historic Landmark; others have been designated Chicago landmarks and listed on the National Register of Historic Places. In 1958, he received the AIA Gold Medal. Are we justified in saying that \"John Wellborn Root was 27when he died.\"? Yes, no, or maybe? No\n###\nColorz of Rage is a 1999 debut feature film for Dale Resteghini. The independent urban drama features Debbie (Nicki Richards) and Tony Mespelli (Dale Resteghini) trying to make it in New York City despite great adversity. It also features hip-hop star Redman and R&B singer Cheryl \"Pepsii\" Riley. Are we justified in saying that \"The film was in English.\"? Yes, no, or maybe? Maybe\n###\nThe Path Between the Seas: The Creation of the Panama Canal, 1870\u20131914 (1977) is a book by the American historian David McCullough, published by Simon & Schuster. It won the U.S. National Book Award in History, the Francis Parkman Prize, the Samuel Eliot Morison Award and the Cornelius Ryan Award. Are we justified in saying that \"The author of The Path Between the Seas was an American historian \"? Yes, no, or maybe? Yes\n###\nWCBC is an AM radio station that serves the greater area of Cumberland, Maryland. Founded in April, 1976, WCBC provides news coverage: locally, regionally, and nationally; weather forecasts; participation in major community events to promote the area and its organizations by way of remote broadcasts and community service announcements. Are we justified in saying that \"The radio station has many listeners.\"? Yes, no, or maybe?", "doc_id": 180, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "No", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [26089, 32315, 17773, 904], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The 1997 Porsche Tennis Grand Prix was a women's tennis tournament played on indoor hard courts at the Filderstadt Tennis Club in Filderstadt in Germany that was part of Tier II of the 1997 WTA Tour. It was the 20th edition of the tournament and was held from 6 October through 12 October 1997. First-seeded Martina Hingis won the singles title, her second consecutive at the event. Are we justified in saying that \"The 1997 Porsche Tennis Grand Prix took place in 1980\"? Yes, no, or maybe? No\n###\nBridge Mountain is a mountain located in the Spring Mountain range of southern Nevada. It is located on land managed by the United States Bureau of Land Management as the Red Rock Canyon National Conservation Area, part of the Rainbow Mountain Wilderness. Bridge Mountain is named for the natural feature of a bridge-like natural arch of sandstone near the summit. Are we justified in saying that \"Bridge Mountain is located in the United States.\"? Yes, no, or maybe? Yes\n###\nAsana ( ) is a web and mobile application designed to help teams track their work. 
It was founded in 2008 by Facebook co-founder Dustin Moskovitz and ex-engineer Justin Rosenstein, who both worked on improving the productivity of employees at Facebook. Are we justified in saying that \"Asana was built in 2016.\"? Yes, no, or maybe? No\n###\nWaking Up is Hard to Do is the second studio album by the American indie rock band Giant Drag, released on March 5, 2013 on Full Psycho Records, the band's own label. It is the band's first full-length release of original material since \"Hearts and Unicorns\" (2005) and was released as a digital download on Giant Drag's Bandcamp music store. Are we justified in saying that \"Giant Drag released 2 albums in 2013.\"? Yes, no, or maybe? Maybe\n###\nDavid Halberstam (April 10, 1934 \u2013 April 23, 2007) was an American journalist and historian, known for his work on the Vietnam War, politics, history, the Civil Rights Movement, business, media, American culture, and later, sports journalism. He won a Pulitzer Prize for International Reporting in 1964. In 2007, while doing research for a book, Halberstam was killed in a car crash. Are we justified in saying that \"The reporter was popular for his pieces on history\"? Yes, no, or maybe?", "doc_id": 503, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [22662, 8957, 13474, 13497], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "The Sherbino Mesa Wind Farm is located in Pecos County in west Texas. The first 150 megawatts (MW) of the project, which has a potential capacity of 750 MW, is in operation. Phase I utilizes 50 Vestas V-90 Mk.5 wind turbine generators, each with a rated capacity of 3 MW. BP will operate phase I of the project. Are we justified in saying that \"The Sherbino Mesa Wind Farm is located in a rural part of Texas\"? Yes, no, or maybe? Maybe\n###\nThe Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis. Are we justified in saying that \"The Copenhagen Consensus Center is a very effective conference.\"? Yes, no, or maybe? Maybe\n###\nGlobacom Limited (or GLO) is a Nigerian multinational telecommunications company headquartered in Lagos. GLO is a privately owned telecommunications carrier that started operations on 29 August 2003. It currently operates in four countries in West Africa, namely Nigeria, Republic of Benin, Ghana and C\u00f4te d'Ivoire. As of June 2009, the company has employed more than 2,500 people worldwide. Are we justified in saying that \"GLO has had over a thousand employees.\"? Yes, no, or maybe? Yes\n###\nNydala Abbey (Swedish: \"Nydala kloster\" ) was a Cistercian monastery in the province of Sm\u00e5land, Sweden, near the lake Rusken. Although the abbey ceased to operate in the 16th century, its church was renovated and converted into a Protestant church during the 17th century and is still in use. 
The church belongs to the Church of Sweden and is part of the Diocese of V\u00e4xj\u00f6. Are we justified in saying that \"Nydala Abbey was recobstructed in the 16th century.\"? Yes, no, or maybe? No\n###\n\"Up All Night\" is an American television sitcom created by Emily Spivey that airs on NBC. The series stars Christina Applegate and Will Arnett as Regan and Chris Brinkley, a couple who struggle to balance their home lives (especially with their newborn child, Amy) and their work lives. Are we justified in saying that \"\"Up All Night\" is a sitcom from a country that was previously a British colony.\"? Yes, no, or maybe?", "doc_id": 129, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Maybe", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [14646, 16233, 35206, 45322], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Virginia's Eleventh Congressional District is a U.S. congressional district in the Commonwealth of Virginia. The district stretches from Herndon to Quantico, comprising most of Fairfax County, all of the city of Fairfax, and part of eastern Prince William County. The residents of the 11th district are represented by Democrat Gerry Connolly. Are we justified in saying that \"All democrats in Virginia's Eleventh Congressional District voted for Gerry Connolly\"? Yes, no, or maybe? Maybe\n###\nRoss Dawson (born 1962) is an Australian author, futurist, entrepreneur and former stockbroker. Best known for his 2002 book 'Living Networks', Dawson founded the futures think tank Future Exploration Network and consults on digital futures to various big organisations such as Ernst & Young, Macquarie Bank, Microsoft and News Corp. Are we justified in saying that \"Ross Dawson ends with a N.\"? Yes, no, or maybe? Yes\n###\n\"The Candidate\" is the 14th episode of the American Broadcasting Company's sixth season of the serial drama television series \"Lost\" and 117th episode overall. The episode aired on May 4, 2010, on ABC in the United States. The episode was written by Elizabeth Sarnoff and Jim Galasso and directed by Jack Bender. The episode is centered on Jack Shephard and John Locke. Are we justified in saying that \"Elizabeth Sarnoff likes to be called Beth.\"? Yes, no, or maybe? Maybe\n###\nIn field hockey, a penalty stroke, sometimes known as a penalty flick, is the most severe penalty given. It is predominantly awarded when a foul has prevented a certain goal from being scored or for a deliberate infringement by a defender in the penalty circle. Are we justified in saying that \"There is a penalty box in field hockey.\"? Yes, no, or maybe? No\n###\nIn ancient Roman religion, Antevorta was a goddess of the future, also known as Porrima. She and her sister Postverta (or Postvorta) were described as companions or siblings of the goddess Carmenta, sometimes referred to as \"the Carmentae\". They may have originally been two aspects of Carmenta, namely those of her knowledge of the future and the past (compare the two-faced Janus). Are we justified in saying that \"Antevorta and Postverta were sibling rivals.\"? 
Yes, no, or maybe?", "doc_id": 755, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} +{"pred": "Maybe", "target": "Yes", "answer_choices_list": ["Yes", "Maybe", "No"], "fewshot_idx": [18475, 42371, 8995, 1998], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train_r2", "fewshot_num": 4, "ctx": "Conoclinium coelestinum, the blue mistflower, is a North American species of herbaceous perennial flowering plant in the sunflower family. It was formerly classified in the genus \"Eupatorium\", but phylogenetic analyses in the late 20th century research indicated that that genus should be split, and the species was reclassified in \"Conoclinium\". Are we justified in saying that \"Conoclinium coelestirum is a blue mistflower of North American species of hebaceous perennial flowering plant in the Daisy flower family.\"? Yes, no, or maybe? No\n###\nMy Cat Is An Alien (MCIAA) is the name of the Italian musical duo and outsider audiovisual artists consisting of brothers Maurizio and Roberto Opalio, formed in Torino, Italy, in late 1997. They release avant garde /experimental music in a peculiar form of improvisation that MCIAA themselves define 'instantaneous composition'. Are we justified in saying that \"The brother worked on another musical after My Cat is an alien. \"? Yes, no, or maybe? Maybe\n###\nMars Audiac Quintet is the third album by the band Stereolab, released in August 1994. Initial releases of the CD came with bonus two-track disk, the double vinyl album came with a 7\". During the recording of the album, guitarist Sean O'Hagan left as a full-time member to form his own group, while keyboardist Katharine Gifford was added. Are we justified in saying that \"Stereolab has released at least four albums.\"? Yes, no, or maybe? Maybe\n###\nSir Christopher Edward Wollaston MacKenzie Geidt {'1': \", '2': \", '3': \", '4': \"} (born 17 August 1961) was the private secretary to Queen Elizabeth II from September 2007 to 2017. As of July 2016, Geidt also serves as the Chairman of the Council of King's College London, succeeding the Duke of Wellington. Are we justified in saying that \"Christopher Edward Wollaston MacKenzie Geidt was born in the late summer.\"? Yes, no, or maybe? Yes\n###\nThe Tampere Floral Festival is an annual summer festival held in Tampere, Southern Finland in July and/or August. During the festival the centre of the city is decorated by flower arrangements and about 150 events such as concerts, parades, a \"wine village\" and a children's day take place. The festival lasts approximately one week and attracts thousands of local residents and visitors. Are we justified in saying that \"Tampere, Finland is a festive place in late summer.\"? Yes, no, or maybe?", "doc_id": 954, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_name": "justified in saying", "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": ""} diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be366ac21e25a2d58230e1ae4b4697a9bc53da14 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de2edb93d987999f6ba31b7ef08afc7fa1c9757811dbf170ad9bb4a6f1001e62 +size 1169220 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44af54cab521edecb73344a531c9436971bfb45e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a19af33eb3f3b40aa6e6196d0f2d5f4593d252c2e4e92cc9cfdd94879a3049e3 +size 1699210 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5250ebb283bbfc128eb276cf7ee26f44efc69377 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98d140d4944defe4d97a8b6665e36dd3e59b1be78f3625c9e9ea8caff730515 +size 2218283 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..471f7d1e10028141ed30bbbaf88313c71af29551 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a002f08f3f2ea87cd884b010817998704c85d8a5be8606dfa3b12b9597f430dc +size 2731721 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7334455ecd79cbaa14af1f27ec30d1b3e5bcff8e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd39167294d27a4700a5d0262f4f31d2d593f3955ce6fca33e6ae043c4f3ccb8 +size 1421162 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ad529be223f1b73f1925dc03db4317fd929b9b0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:097b98144f057156ae8add6701f7f4d6a9dc1239c2b8207a69c2a36505eb304e +size 2061765 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..3153438ca8f20be3861fa0e55e5357dd1192e25f --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939e75980d9c35f6a211651290c4c20cb5a57fd9bcab95c4dc42ef21386498e2 +size 2689889 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b536068cfe4ba87830415d1af68e6929582b6a9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24f52fca61ad5c00f4355d2bbe5045e4b8f584c2e113ad1748aa244dcec62b65 +size 3312527 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e78d34a1210ca186c8ccc19b77d595565792d5cc --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93f9f36ca5be1a77a1d8057af1269fd9c00581ff04e3ba6e7a4a190d4612a1f2 +size 1187166 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3841b8123917ee48f60896d6b6e1ced327fc0b42 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc9fa290c2e6b67304c45a18474b77935bb71b8767ee8918d7de0da71d9fe5a +size 1730739 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2025c3b1dcf2915711a35de85a4277977669c507 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5fc0e8cb900c2b6a42c7cf00c92fbe78f9c86195a4243243da2e16a15d0d32 +size 2264399 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec84165200c1d166d0e61d21b022acde5015543b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33436ed50fcd59b701ee02ae2f487d8eca034636a803070cd89c8f65e2259f57 +size 2792196 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..eb57ce41653cadc6f47b907a1a44fc8ae870f0ee --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58c5e82bf203cf910f4055a0be53434368233d28ab1f894ae8b245284a211242 +size 1370645 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d6350c3c037eadbad97ea6a2698f83d3e3e9f20 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe709edafc14e80bc973713053c9e14337943a2d4e6748a077f646830a2a33c +size 1959145 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31f1815714271c395d6669b7f389ba7a0effeaa7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e1c0bb20cacb3a134de73b4331ce34ed971fa61a425e80fa97ca2ed6b37041d +size 2537674 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17dedd34de72248df1151504616c861fc46c2776 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:528e2a789bf1cfb80b7f5d22e0204dac8444cbedfa9646506bb72700e63e15f1 +size 3110473 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2883f5590f5c24bd2c6db7defaa92ea7c6ea8312 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3d5a1337148c2603db3d08bd724a3329b3919b37e424787c21ae4d49f2c5599 +size 1209618 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bf01877183d6bda1f6d756953e46c26097741dd --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408c0bae580fb1304bc3b2e4ab9b4fdef5baf31f1d36b2ac1bcf481f4b791399 +size 1760712 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..862a1ed12cf52e7b5c4f8aaeee6cb70d7723ea3a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0641e496eda7b0e76745e4ed39863acb5821e1c5537b9d33bce0b21495a3208 +size 2301640 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd1515e0ce1b09a929658ee9e8f2fe7a42aa1447 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bea87e9e2923a8fbec8ab8dd16512569169d117ba2ff056225686a93933166f +size 2836507 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27fafd0a788cc8750c3be8bf4e685c702b9479bd --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f5bbcb0da161536276b960d6416e6990858bf67ee4d6239915c6a603a56de71 +size 1216777 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e95e79da459f29a9c771b2d9325cabf6ff038095 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e0f949786024fa0d5be423506146f17df986579e183e61b9fc4b6d19fd291cf +size 1670632 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fa52103be02a9d45140ceaf82674d789f5ec13f --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381f1bfccc8bb3bab46de90651059c02ff7465e9093526230675fc32a6c9b051 +size 2120084 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba58ca971f44de56e476cde2290b6d3c807b35ab --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4140b1913f34a33599a36a1bdbd05db7c638659cca23b272b78626ac489e033a +size 2577615 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e64173afc0fd233a41f624073566c7725efa6ff --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6bdeaed212f2ef1bdd1ece53653f29ea22d6b9c5cdc78f56cfd1bc6b99e657 +size 1458339 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00c891ba9ddee72394b599dee9d8d048794991ed --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0302526c4b093e4506bb29f28e2b9372904f055c2b064cf9c8fd5e181ba2f941 +size 1961035 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b40cc69d7d2ecea99ab303fc4741dcd77b8142f4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abf01d8e365269b732c11a2bc3e57fdd3106d74c801b320763a809e06c0c233d +size 2457770 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9301985a69a19181ee891e5c4e80a8cfe36cf16 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63cbe8411d04a590562237d4c2df78d1e61e700a605eee93df7175c276c6776b +size 2963567 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74d38dc844f3aefb363085537aa13baa69e75bec --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:760af3c0f15476c5d3a9d37cdec96f8588f57b36f3686923a09c88d6f5d1648d +size 1505932 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5caa1476b66be043cad2659a78f0e4de59f29ec --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2c901992f5779400bca8838545410a57c2c52e6bfef9230e5d1af018ef0e1d +size 2032925 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..262587bfbfafde86c3d43e3989251ea693815f3d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc2400b78fcbd4f7c9848cf3438dedabe660bd05c5690bf5b940c95029e987c +size 2553908 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..fd68f6e21ffb6e8a8df0ddd23cb0b519324c82ac --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43be9b6b42e0e695a72f6ace8ddc66e9619182b22d764eadee1b1ec9f3fb57c +size 3084213 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b89826f9631a031d2015e0762324fe6c1e39bacb --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24e1ef95a2c0e7e3e132cfaba2320aa7462cd479c245d1ef71549a26fb25c0f4 +size 1202714 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bae337cadff629e9bdfe510a7c025a515b87c7b0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da44068e7c637ff547f25c3d20db9183aa84c6822aede7a8a82c55a97b182d6 +size 1638992 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bff1a6e0a45eeee60f7658da9687ce98b4c104de --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acacb01733c0ce2c135b9706071077db2b82d119f57e7bb0e8f9c1398941a8ba +size 2070864 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be7a6db5b18ad99ce8f39defbbf2a86e431d0a2b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17c1f481f218f0d877d635e7063d1963ed958f5656c1c4ee09008b2ab205eed2 +size 2510815 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b9af3ffc76ae89b369822a77d8cc35fcd19c80c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2cd4ef704756b4bdaaca96d8f49dd6ca311cc19f197cb2d703a6a5671e5b3c1 +size 1187313 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..504327ad9b6e2ec44f53f26d2ef2b7df1849b018 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:92fe7550cbdfa90a622a02b4fb695438934170a0784d512207b44b298732336c +size 1558012 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e0e76b91529da3abc9414328c1dbf0a310511dc --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5734559ac95ca9b7b61a9cb8dd884b48eab53d7df7f878f4a60cd4ddc2285271 +size 1922349 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f077aefb404f53b1ab44052fd35846f48374554d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:194745866191d3a5abbd3120144bd90759729e8dd9f7877d05f6f34cee6c0c8f +size 2295616 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fbeb236d5759b8bae2ff51c4e0895e5c59d47f3 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d354c0046fb6c26cf31c0a56bd52157bf6a9b18c653546303f7a0309145ef222 +size 2351241 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..570e703c4984efeeeb7d5d2e9d4f4aec3ca864e8 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d66ac0d2d171dacf1ca33779534b3979000917fcd7d7b1dd897de68f88449d +size 3174164 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddcd956ed5879478381b0e03f597bb9e4596452b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7729e5eee5a2d7bea8ad58035f568fcb588cf3d8c708c8441e6a540b7cdd5fde +size 4008946 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dce9dac25a542a59f542871dfb9c996e79547af9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91ca4e36d50c86585e7a2b7f3114e7d7fcd85f5e0fe7f8b36e1ce57fe685e93c +size 4831612 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bffc127a07778f6e3815b57483bfd98a23d546d9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a21ad6f02e9a8ff4d0fa25bfefa74703bcb153eb3cd1d70c94394e70830cb964 +size 2746095 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89cb12ff69fe9aebba76a9c9928f0fa4844c025b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c3b0c59846636735f056ca9de340f8799f63390780064654ea543f89d44d77 +size 3649418 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8baf9153c2959f33b1f3699e03b4ff58e0115feb --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dae19bbcfaf692d81a45324e5c83a2844e3964a4adeca58156a73cc28953757 +size 4566969 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..108b508e6144f97cb5d35e66088e1e5807800e89 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e4bdfb1f9c92967e58b33e072971fb2c3ca60d04ebde01f9370f1fe787a1cec +size 5470304 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e256ef548d17f3a34c3bd41e41801682071606e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a97fd07ae950876a26a1efd51e48d750deba2ca83205e5a8a5fa8869ea5ae6 +size 2843317 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4829472430922307248578f6755304232a97170 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b40b3cc7089082a06c85be0615f76689b67367b39203b3d2bdaf3547918744e4 +size 3796084 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efce0acbaeb7ecfaef36516d84cd87367d57381b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00fb77cd7bc57711ee90f21d0597d185b8882f63b7b72eac7a69672812095487 +size 4762891 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4348147992c0dc038d09c3dacb9f5a59ad9f46b5 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:ba2bee34f5c946a53051e5e91067c6788c42a040b5feda3aea39d448e92a4ca4 +size 5715815 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc2946cfe4059539092b59e3579465592e603c2d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:601a826dfab374ebd130bb0c746b486057de6f48b36c65d435fc231a918f5edf +size 2322732 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed232ca041e4f96714bc22de7e8bd70d6fb1dffb --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:102bfec0c312372c89788e0b8dc26ae09595ac97b2ba08f4a010bbcac1196d79 +size 3110021 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bc9bdeff32066c7485721cf7bc93af4cbc20edc --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc41d920ed4b4b88794f8fea243cc47a6859a87068cdefd24f31ff69b386e274 +size 3909160 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05eecdd8704098d3dc11fd1b8b4e428f083f816d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f376b753c5e5c1e6accc09d5b4d81b5e4fdecae817f726b466c62ffcf5373a +size 4696184 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6699a6e9c52b2822058e309d947ae8a899b08f0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a5e95f032273377e731bda24f04664b555f5076ed4b6a1dcb69903ad4f542b1 +size 2197573 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1796151d65da231d83ecce4659a6ee38c1c631d5 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f45bd961f4e08f81514b90abe4782d3f8ca0d47e8264dc721b5f7a67269360e6 +size 2832589 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b78ad286817de627a124677ee1e5fd17ce70a809 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4157d553096634769135a36bf8b4539d3c729deb46982eee7b98d0f0fc5e587 +size 3481212 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ea7c3bc74fef872fa1806d4da64e6e912c3ae8f --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb4d2bf082245d57c86bc40efc248e6356187b5d12c8cd2b0f26a76440a20ecd +size 4116183 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f654a9915123f42e822144d6aca7848e3ba8884 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3535062ac853058d2d529aec3fabab2a822abb65d9454dbc5d5cac14bfb1049c +size 3641827 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8641ddcc283951531763a867e8f618b2f1792a45 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9cc7bd03855c945b11c428ff81fb1638bd2ccaea81eff6399d063a85ca98ab +size 5656992 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..960c428bb32c36138315a89f7dade031653d0e07 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c087bc41dcf73c06c7b600c1a61d56329e387c0e6bfc71b4f56cc641e987f975 +size 7693977 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77aa0142e34d11f03fa2dbce6c24168ebf556c40 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961defa3fb339bfd758055538887075fd2908359666c50ebfd5d9384a76cbc43 +size 9728047 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..780ee6c7114690f4cd912028d8a2f58186708592 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d565655047ef455dc7505f04d62d1ed815fd00f009793607f3b1f184b2ad41f5 +size 3984810 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4907fa0f6fad03f585a5184f73500f6d6354a72 --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e632237bb0a865bc37e5603264f03718b2e31e6cdd9270a4cecbaa6bf71d9c4e +size 6168138 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45c46073c13945fb1293d928b5a808c372c2779c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f06841eb7b4af64cafd20196158daf0f23f51252c5c82476df6530f4462d46a +size 8371708 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c72965a43d08e794b7791e0809d32e0ab68d677f --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a3fb4dad5dc58e301c0dd68ecaca1db7afb552045d241ab2c7ea52c721a242 +size 10572838 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b65fb7ea4476ed23e0f10e41a367f9516bc0964 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861ef059d9c526ade022f7a3b412335799f138c4105e017277e143b4b2184515 +size 4041660 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07b38b84fdce7bd6a77c26f21f63d5bc681543b6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b308469c973553d2b4ccc11d4f3a7d6376679075a17e28669d3878bc806c5a34 +size 6261132 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e727d7fd1c628e1aa6f157d6e76b84c0720f7d0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05c30007e3da43dfd7075818fe1c543e7f8931eca04fc6d9e52ccf0ca799441a +size 8501475 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3fa82a2158583f59078d1dfc2b4e5dd109bc853e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec1912658f5291c2ef3132ee342eb5781d5d3a245d79c90446bfa75ba4a49d9b +size 10738578 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0cacfd872b589f26a58b7af7671db4f3c539fb91 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5f303fcc24958fad1ab8cc3be490c82c0a7a5a7f8e11cce45379d18ee6684558 +size 3663836 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c6a78a6e7839d69b6c614821198c3463a690658 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a7760ea963e7b9f29778e6313b40f5211e4e9df120f875e9e721d375eac6c9a +size 5687994 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f5e9b3d656687a6a420666f71c1a640aecd45d0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ddac81c06314d57ceebb519c74332df3665a17ed4ca7542b624fc75113789d +size 7732808 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a720d159c9d5f5f75cb9cf31e79045ed1d5f326 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0375e92785f9dc3e961051333c21eb6fb3e845eefb499585151ecd370a2af3d8 +size 9775089 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..227a9aa6501f989d03e7d5a8cf8ccb7922f7b030 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771d42e0756e8f54db34930b683a2f31474bed648bd3c51005a16c58f420e870 +size 3862313 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88995e5e64f4371be20d54c6795a4d8a823672a6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51da751223591e6ea8b6b03dae9f79a1915187aa33f8c10fef121cead4ff81e9 +size 8142021 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83baa87d3a4d8fc6c26aaa081fd811f08cb74b27 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1d0fb51f534b9c1a337ef79b8285c5e379b316442487db8012c8e2ee91a644 +size 
10290000 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b40a8ee95efd72dfc2b904276880aca89375d732 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c140a7e5435fe38cb3fe5daacc2ec82418a51b5a64c7d14c6ebc8b952bb661e +size 55152 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af87a90a716809dfdf60cf1c9b91f5da041ae726 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2726b26d77be6b2e080b20ecfe8c418e391648633ed79810f035b89274369aa3 +size 77969 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4b6c8e12895b1a810c8d84db23f57c43dfb04ad --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba12a302cb537a1a19e6e34fd8953d97b4890db9e856c14bf48a27ab78e77974 +size 99584 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6bec13ea6d3d6fa0ae3bf8d5af78943ed9fce697 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944c98e63e9f403f326b6b43d226e812beacb6e0a1139d12a918965203eedac1 +size 120740 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a64c7150e5078131c07ed8b5dce4397c352fa2a4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb081ac8d5fba0fc825849dd6492b28d8a27c5e869ac8115bcadeb44aad35b4c +size 66238 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cad6ffefb978f04a01a57cfb725d853010cd2c05 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed26e386023ca0c8e8b7f924dcc74e77497958defa40408ec342883c93d2564b +size 94139 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcb8e5cfe4586dd1a20169c5f49492c1da4a942d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242e6969b9864670915df69e6df6cd563110c02e1e663baa24a57829921c06f4 +size 120826 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_3.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..8e40a1dd92c58da688703db40c32fc03401f975e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab1f622ac2115ff107b3721be6dee196628ed16607d4d2746d1e18a78bb753b +size 147057 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c37be375e96eec6f9003dc50699fc8c08ff9b9ae --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,56 @@ +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [224, 152, 13, 207], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: Yeah. The radio doesn't really have much news sometimes. The stations I listen to are just mainly music. B: Yeah, I think you pretty much have to listen to all news station to get any news at all. A: Yeah. Do you think that TV is, uh, pretty accurate. Using only the above description and what you know about the world, \"TV is pretty accurate\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nB: That might be kind of tough, huh. A: It really would, yes, yes, and like I said, my sister's still in it, and I really don't think my mother'd want to be there, either. Using only the above description and what you know about the world, \"his mother would want to be there\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nIt was Alan's idea. He made a sour kind of joke out of it, that they must wait until their wedding night. Carolyn agreed because she could see he meant it although she didn't understand why. Using only the above description and what you know about the world, \"Alan meant it\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: you know, sometimes I would go over, but you know, it wouldn't hit me in a big way because I knew that, uh, I would have it covered in that respect. A: Right. Right. That's good. I don't think we've gone that far, to pay it you know, in advance before we spend it, Using only the above description and what you know about the world, \"they've gone that far\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: Yeah. I understood that. you know, I work full time and I have two kids so my spare time usually involves something with the kids. A: Yeah. B: You know, hobbies, I can't really say that we have hobbies. Using only the above description and what you know about the world, \"they have hobbies\" is definitely correct, incorrect, or inconclusive?", "doc_id": 9, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [34, 209, 137, 151], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Jane ate without pausing. 
Hunger was an unknown experience. She had never imagined it could actually hurt. Using only the above description and what you know about the world, \"hunger could actually hurt\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: It was just a side benefit. B: Yeah, yeah, because, I'm not big or anything, but I'm not in great shape, But when I worked out, I got in pretty good shape. I didn't build up muscle, though, I just got real good and toned. A: Yeah. B: I don't think women look good with muscles. Using only the above description and what you know about the world, \"women look good with muscles\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Yeah, that's crazy. B: and then you come here in the Dallas area, um, I don't believe that people should be allowed to carry guns in their vehicles. Using only the above description and what you know about the world, \"people should be allowed to carry guns in their vehicles\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Well, presumably those who find out such information, if they are doing it, I would prefer to not to be known, and, I mean, you know, the classic, oh, I don't know C I conspiracy theories or whatever, would have such parties trying to do it without your knowledge. So there's,, things that invade that second type of privacy where you do know about them and possibly things that invade that second type of privacy without you knowing about it, and I can't talk about the second one other than to generate paranoia. It's a surmise and, I'd like to think that it's quite low, at least in this country. B: to surmise. It is there. A: I don't think I'd like the KGB monitoring my phone or anything like that. Using only the above description and what you know about the world, \"he would like the KGB monitoring his phone\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Now the part about where you said the apartment complex puts up signs that says no soliciting, I've even gone so far as to put that, I've got a storm door on the front of the house and I've put, in, I don't know how much clearer it can be, it's a red sign with silver letters saying no soliciting. I guess I should make another one that says religious or otherwise, cause I still get, B: Yeah, yeah, that's true, yeah. No I didn't go that far but, uh, yeah I probably could do the same thing, uh, you know, I don't have a storm door, but I'm sure I could rig up something. But you know I don't think that that would stop people. Using only the above description and what you know about the world, \"a no soliciting sign would stop people\" is definitely correct, incorrect, or inconclusive?", "doc_id": 42, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [213, 69, 43, 71], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: Um, yeah, I guess that's not an easy solution. there's no easy solution for that. B: Uh-huh. 
I don't know that there is an easy solution, but if you could find a way to prevent some of it, and I'm not sure what it would be. It would be money better spent than, A: Uh-huh. B: do you know it costs more to keep an inmate on death row than it does to send a kid to Harvard? Using only the above description and what you know about the world, \"it costs more to keep an inmate on death row that it does to send a kid of Harvard\" is definitely correct, incorrect, or inconclusive? Correct\n###\nI ducked so fast I wasn't sure whether he 'd seen me or not, but it gave me a prickly feeling just to imagine it, so I scuttled for the door and legged it up the spiral stairway three steps at a time, just in case. As I ran, I remember thinking stupid thoughts like. How did he know I was up here looking down? Using only the above description and what you know about the world, \"he was up there looking down\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``Oh, my poor Folly... We 've been together for five years, Lexy and I - she practically holds that company together. Of course I gave her an ``A''. But that doesn't mean I'm having an affair with her. Using only the above description and what you know about the world, \"he is having an affair with Lexy\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``I wanted to tell you. But the Bookman asked me to keep our meeting a secret.'' How did you know I 'd met him? Using only the above description and what you know about the world, \"he had met the Bookman\" is definitely correct, incorrect, or inconclusive? Correct\n###\nObeying his instruction, I proffered my hand, open palm upwards, towards the animal. The ratbird climbed on and began to preen its fur unconcernedly. Nobody will blame me if I say that in the circumstances I became very uneasy. Using only the above description and what you know about the world, \"in the circumstances she became very uneasy\" is definitely correct, incorrect, or inconclusive?", "doc_id": 34, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [21, 2, 156, 112], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "It seemed impossible that anyone could endure such pain for so long, but at last the doors of the Renault slammed and there was comparative silence. The engine was started up, revving violently as the car was turned round on the narrow road. John could tell that it was being driven back up the hill towards Putna. Using only the above description and what you know about the world, \"the car was being driven back up the hill towards Putna\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThe Paris to Rouen railway was being extended to Le Havre, and the line cut straight through Dr Flaubert's land. Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. 
Using only the above description and what you know about the world, \"Gustave was shepherded into creative retreat at Croisset by epilepsy\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: That's fairly interesting. B: I bet that would be, rather interesting. Uh, that's, uh, self improvement, well, that's kind of a hobby but it is self improvement from the standpoint of probably relaxing, uh. A: Yeah, I don't know that I read anything strictly labeled self improvement. Using only the above description and what you know about the world, \"she reads anything strictly labeled self improvement\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nFor a while the notion gripped him, and he prowled the many floors of Hamleys looking for toys. He bought a magic set for Sebastian (although his ideal present for the kid would have been a brand-new name) and a marionette for Louise. He could remember that there was an age for puppets and magic just as he could remember the time that he 'd spent trying to fan a deck of cards or sitting in front of a mirror trying to get the hard consonants down like a real ventriloquist. Using only the above description and what you know about the world, \"there was an age for puppets and magic\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: uh I picked up a bunch of Craftsman tools from the forties that my wife's father owned when he was alive B: Uh-huh. A: and so I do have a band saw and a router and, uh, things like that out in the garage. But I can't say I use them very often. Using only the above description and what you know about the world, \"he uses them very often\" is definitely correct, incorrect, or inconclusive?", "doc_id": 53, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [133, 145, 109, 141], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: But, uh, if the wind comes basically from the south it can be really bad. A: Uh-huh. B: Uh, the State of Wisconsin, as a matter of fact, uh, started some litigation against Illinois because of the air pollution we were getting. A: Uh-huh. B: Uh, I don't think it's going to go very far, Using only the above description and what you know about the world, \"it's going to go very far\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. Using only the above description and what you know about the world, \"she wants to see that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nAnd I was excited by my new importance. Proud and pleased to be playing opposite Frank Donovan who had once stood in for Hayden Coffin. Occasionally perhaps I should notice that he was not the jovial easy-going character I remembered from my humble place in the chorus. 
Using only the above description and what you know about the world, \"Frank Donovan was not the jovial easy-going character she remembered\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: That was kind of a funny movie with, uh, Richard Dreyfuss and Bill Murray. A: Uh-huh. B: That was fun. A: Golly, I don't think that I've ever heard of that movie. Using only the above description and what you know about the world, \"he has heard of that movie\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nFirstly, I didn't know about the SAS soldiers in the British Embassy, and I am very surprised about it. Very surprised indeed, Ambassador. Secondly I do not think it is a good idea to attack a plane with a hundred and seven passengers in it and ``take it apart'' as you say. Using only the above description and what you know about the world, \"it is a good idea to attack a plane with a hundred and seven passengers in it and 'take it apart'\" is definitely correct, incorrect, or inconclusive?", "doc_id": 10, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [20, 32, 205, 195], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Chopra stood unsteadily on his feet. The shapechanger bounded around with excitement. Chopra could tell something had happened. Using only the above description and what you know about the world, \"something had happened\" is definitely correct, incorrect, or inconclusive? Correct\n###\nHis mother driving the car, so happy, young-looking and fashionably dressed and his father, a big, confident man in a smart suit, smiling and turning round to say something to Simon in the back seat. Marie thought of her own mother with her frumpy clothes and ageing, lined face. No one would have guessed that she was only forty-two. Using only the above description and what you know about the world, \"Marie's mother was only forty-two\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: So, let's talk about the, uh, wonderful abuses in the State of Pennsylvania of personal property taxes whereby you can purchase something mail order and after the fact, the State of Pennsylvania can find out about it and send you a bill for the sales tax appropriate to that item that you purchased as well as interest and penalties from the time that you bought it. What do you think? Is Pennsylvania kind of out of line there? A: Well, actually, I do n't think they're out of line. Using only the above description and what you know about the world, \"they're out of line\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: I look at our big green containers, and I say, well, they work fine and I keep mine outside the garage so that I don't have any odors but it's clearly a place where, uh, uh, A: Oh, right. B: it will be interesting to see how well that works and I'm glad the community is doing it. Uh, it's one of those things that kind of has to be forced on people. 
Uh, I don't know what you saw back, uh, years ago, but for me the thing that strikes me is uh, growing up in rural South Dakota where, hey the farmers brought their eggs to town and the local hatchery would candle them and package them is that, uh, in the fifties, uh, you could say we had the recycling going on then that we should have now. Which was all the milk bottles were glass Using only the above description and what you know about the world, \"they had the recycling going on then that they should have now\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThe assassin's tone and bearing were completely confident. If he noticed that Zukov was now edging further to the side widening the arc of fire he did not appear to be troubled. Using only the above description and what you know about the world, \"Zukov was edging further to the side\" is definitely correct, incorrect, or inconclusive?", "doc_id": 47, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [212, 145, 46, 65], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: So again, it never really penalizes, the company doing the wrong. A: Right. That will, B: They can go right on doing the same old thing they always used to. A: Huh. B: And if they know some practice is wrong, you know, Using only the above description and what you know about the world, \"some practice is wrong\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. Using only the above description and what you know about the world, \"she wants to see that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nStrasbourg, Vienna, Bucharest, Istanbul, not stopping, not looking back. I saw her tossing newly gauffred curls as the open roadster headed east, away from Ollie... Temporarily I managed to re-erect my jocular facade, but inside I was panicking. He could take her away I thought he could just do that he has such power to hurt me this little furry creature who hasn't even noticed that I 've given up the weed. Using only the above description and what you know about the world, \"he has given up the weed\" is definitely correct, incorrect, or inconclusive? Correct\n###\nIf there are spirits at work at the time, they come only from yourself, not from the fume of the incense. Why should spirits aid living beings? What arrogance is it that drives people to believe they can have power over them? Using only the above description and what you know about the world, \"people can have power over spirits\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThere was a group of curious onlookers... Marie felt her legs give way beneath her, she sat down on the edge of the pavement, feet in the gutter, doubled-up, sick and winded as if someone had punched her in the stomach. She lifted up her head and looked again. 
She had watched scenes like this so often in detective films and police series on television that she could hardly believe that this was real life. Using only the above description and what you know about the world, \"this was real life\" is definitely correct, incorrect, or inconclusive?", "doc_id": 45, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [144, 145, 220, 127], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: I am right outside Baltimore. I am less than a mile from the Baltimore line. B: Um. A: And I go to a campus of the University of Maryland that is just, less than a mile from my house. So I'm actually in Baltimore, yeah, you could say I'm in Baltimore. Using only the above description and what you know about the world, \"he is in Baltimore\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. Using only the above description and what you know about the world, \"she wants to see that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: So, we're comparable. B: Yeah. A: As a matter of fact, I just paid my Richardson taxes because I live in Richardson and supplemented the Robin Hoods very thoroughly, I think. B: Yeah, I think Yeah, we have got it on the line, don't we. Using only the above description and what you know about the world, \"they have got it on the line\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Uh, well then you must know a lot more about this than I do. B: Uh, I think, uh, the system right now, you know, you know, is fine. I think it should be by a jury. I don't think the judge should have, I mean he's just there kind of like the referee. A: Uh-huh, Uh-huh. B: Uh, I don't even think that it should be unanimous either. Uh, Using only the above description and what you know about the world, \"it should be unanimous\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: If, uh, you've some kid uh, who's from a broken family, the schools are supposed to fill that void. It's gotten way beyond uh, you know, teaching classes and maybe providing extracurricular sports activities or something like, oh, were kind of traditionally the school's roles. A: Yeah. Yeah, it's interesting because, uh, we're just having conversation on this, uh, with a couple of people yesterday. And I was expressing my frustrations that, uh, so many problems, I work in a high school, are that kids don't have a degree of self-discipline which may be reflected in society at large. Uh, and you can't expect in a classroom for a particular course an hour a day to counteract, uh, sixteen or seventeen years of influence at home. B: Right. 
A: Um, and, it's seen more so because when you call parents up, many parents won't even recognize that there is a problem Using only the above description and what you know about the world, \"there is a problem\" is definitely correct, incorrect, or inconclusive?", "doc_id": 35, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [83, 45, 131, 158], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "This was a sheer waste of time. He would probably land and then tell them to walk back. When she glanced at him again he looked very grim and she wondered if she should have told Mitch that he might well have a lot of opportunity to photograph Spain - on foot as he walked back to Malaga. Using only the above description and what you know about the world, \"Mitch might well have a lot of opportunity to photograph Spain\" is definitely correct, incorrect, or inconclusive? Correct\n###\nWhat must it be like to be imprisoned here, day after day, month after month? I wonder, does he keep them chained and manacled, thought Fenella, or does he use sorcery? And so utterly immersed was she in this strange blue and green land that was not feeling strange any more that she did not even notice that she was weighing sorcery against steel chains and seriously considering the likely outcome. Using only the above description and what you know about the world, \"Fenella was weighing sorcery against steel chains\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: I did, too. A: I mean, it was just more for my money. B: Yeah. I didn't think it was too long at all. Using only the above description and what you know about the world, \"it was too long\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: But, uh, B: Okay. Uh, uh, I've had one or two American cars I think, and they were okay. I had a Pontiac once and I never had a problem with it, but, uh, my mother had a Dodge at one point and I had driven it a few times and I really did not feel that I would buy a Dodge just from, Using only the above description and what you know about the world, \"she would buy a Dodge\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Okay. So Frank, what, uh, type of, uh, budget do you or your family have? B: Well, uh I don't know that we really have a budget. Using only the above description and what you know about the world, \"he and his family really have a budget\" is definitely correct, incorrect, or inconclusive?", "doc_id": 21, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [108, 63, 22, 137], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "I didn't really like the way the other boys treated him. I was new at the school and still observing, still beginning friendships. Perhaps Alec noticed that I did not ridicule him as the others did. Using only the above description and what you know about the world, \"he did not ridicule Alec as the others did\" is definitely correct, incorrect, or inconclusive? Correct\n###\nIt's where the bands practise. I can't remember what band Petra's in, but I seen them practise once. They were OK but I didn't think they was brilliant. Using only the above description and what you know about the world, \"Petra's band was brilliant\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nJust when you think you 've got it straight, along comes the Fool with his pig's bladder and whops you on the nose. By the way, I'm no idiot. I could tell Gillian and Stuart weren't thrilled to see me at the airport. Using only the above description and what you know about the world, \"Gillian and Stuart weren't thrilled to see her at the airport\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Yeah, that's crazy. B: and then you come here in the Dallas area, um, I don't believe that people should be allowed to carry guns in their vehicles. Using only the above description and what you know about the world, \"people should be allowed to carry guns in their vehicles\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nBut what we may not know is just what makes somebody a sucker. What makes people blurt out their credit-card numbers to a caller they 've never heard of? Do they really believe that the number is just for verification and is simply a formality on the road to being a grand-prize winner? Using only the above description and what you know about the world, \"the number is just for verification and is simply a formality on the road to being a grand-prize winner\" is definitely correct, incorrect, or inconclusive?", "doc_id": 24, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [89, 206, 60, 181], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "``Molly likes having an audience for her tales and it passes the hours for them.'' When Miss Louisa had a second more severe stroke at the end of August, and Miss Ellen another heart attack, both old ladies died within a few days of each other. Their friends could only feel that death was merciful in the circumstances especially with war imminent and that Molly had made the closing months of their lives very happy. 
Using only the above description and what you know about the world, \"death was merciful in the circumstances\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: I do too. I believe about ten years ago that we went through a terrible time, but I don't, I believe that they're better now, you know, wh-, B: I think so. I don't think they're shoddy Using only the above description and what you know about the world, \"they're shoddy\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nI'm sorry, I 've put you in an invidious position. If you're being run by Morton, he 'll want to hear all this. It won't do any harm but I 'd rather not give him food for thought because I consider him an idiot and I don't think he's capable of interpreting it correctly. Using only the above description and what you know about the world, \"Morton is capable of interpreting this food for thought correctly\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: but, uh, I can definitely, uh, see on down the road, you know, where we do have kids and are getting to that age, that's going to be a definite concern. A: Yeah, you talked before, about the school funding. I think there's only going to be one solution to school funding which I don't think will be necessarily the best way Using only the above description and what you know about the world, \"the one solution to school funding will be necessarily the best way\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nModify the arachnids, said the researchers. Change their bodies and conditions, and you could get fibres like glass, still monofilament, but with logarithmic progressions of possibilities of strength and flexibility, and the ability to resonate light-particles or sound-waves undistorted, scarcely weakened over thousands of miles. Who said the arachnids had to be totally organic? Using only the above description and what you know about the world, \"arachnids had to be totally organic\" is definitely correct, incorrect, or inconclusive?", "doc_id": 46, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [40, 157, 240, 97], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Did he intend everyone in the castle to know he did not want the wife he had married in such a hurry? Did he intend to ignore her completely? Then Isabel saw Ellen's stunned face and realised that her maid at least did not know she had spent the night alone. Using only the above description and what you know about the world, \"Isabel had spent the night alone\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: They might be, but not at the human factors level. they're, B: Well, I heard it on the news today, I could swear it was IBM. Using only the above description and what you know about the world, \"it was IBM\" is definitely correct, incorrect, or inconclusive? 
Correct\n###\nRobert Erwin, president of Biosource, called Plant Genetic's approach ``interesting'' and ``novel,'' and ``complementary rather than competitive.'' ``There is a large market out there hungry for hybrid seeds,'' he said. Mr. Robinson of Delta & Pine, the seed producer in Scott, Miss., said Plant Genetic's success in creating genetically engineered male steriles doesn't automatically mean it would be simple to create hybrids in all crops. Using only the above description and what you know about the world, \"it would be simple to create hybrids in all crops\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nMatthew rode on feeling a little more at peace with himself. He skirted the spruce plantation and supposed that at some point he should tell Sara about it. He could imagine that she might be interested in its money-making propensity at the end of the year. Using only the above description and what you know about the world, \"Sara might be interested in its money-making propensity at the end of the year\" is definitely correct, incorrect, or inconclusive? Correct\n###\nShe hated to think of his sister lying in hospital waiting for her husband to come to her while all the time he was with Dana. She gripped her hands tightly together. Dana didn't know Berenice was in danger of losing her child. Using only the above description and what you know about the world, \"Berenice was in danger of losing her child\" is definitely correct, incorrect, or inconclusive?", "doc_id": 38, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [107, 84, 228, 204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "There was no answer. Moving carefully, Benny stepped around the edges of the room, and opened the window shutters. Her momentary horror at seeing the unmistakable form of General Etienne was only slightly dulled by the realization that the stiff posture he was in could only mean he was dead. Using only the above description and what you know about the world, \"General Etienne was dead\" is definitely correct, incorrect, or inconclusive? Correct\n###\nNicky approached her with the assumption that men are naturally right and it is the role of women to follow their lead. Constance, whose confidence was growing daily, was not prepared to give in to Nicky's wishes merely because of his sex. If she felt he was right then she agreed with him. Using only the above description and what you know about the world, \"Nicky was right\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nB: so there's only been really one working. A: Uh-huh, same here. Uh-huh. B: And, uh, it works for me but I can't see that it would work for probably the majority of people. Using only the above description and what you know about the world, \"it would work for probably the majority of people\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: How did Radio Shack work? 
B: If you go in and buy anything they want your phone number. And I don't think they're going to call me and ask me how it's functioning, Using only the above description and what you know about the world, \"they're going to call him\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nWhat had brought Gharr and Ten-huc and Pulvidon to the planet at the same time? Why were all of them so interested in why I was there? And if they somehow suspected that I was picking up something valuable why would any of them try to kill me before the pick-up? Using only the above description and what you know about the world, \"she was picking up something valuable\" is definitely correct, incorrect, or inconclusive?", "doc_id": 40, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [17, 147, 119, 203], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "She might have sat all afternoon, nibbling and stuporous, exhausted but not sleepy. But the glazier finally came down from the upper floor, cheerfully announcing that all was now right and tight and he would be on his way. Maggie could tell that he would have liked to stop for a chat that he felt sorry for her left on her own but she lacked either her grandmother's grace or her mother's energy so she did not offer him tea. Using only the above description and what you know about the world, \"the glazier would have liked to stop for a chat\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: I wouldn't be surprised. A: You know, because they don't want to send them to daycare. B: I doubt if they would say it was too long. Using only the above description and what you know about the world, \"it was too long\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Well, how do you feel about the immigration laws? B: At, currently, I think they are a little restrictive. Uh, particularly for, uh, certain ethnic groups or from certain countries. Um, I think we should permit, uh, more immigration from eastern Europe, for example, uh, particularly uh, the Jewish, uh, uh, people from Russia. I think we could permit more of them in than we have permitted in the last, uh, several years. And, I think we have, uh, uh, too much restriction uh, on the Orientals also, but, of course, that's just my opinion. A: Yeah, well, I'm not real sure why I got this topic, because I don't think I checked it off on the list because I know very little about the current immigration laws. Using only the above description and what you know about the world, \"he checked the topic off on the list\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Yeah, you're probably right, two years might be a little too long. B: Yeah, and there will be a lot of rebellion in that and when you get people who have no desire to be there in the first place, I don't think that they're going to be serving anybody. 
Using only the above description and what you know about the world, \"they're going to be serving somebody\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nDuring that pause I realized that Mala had not been offered a seat, nor any food or drink. The Emissary and his people were behaving as if she simply wasn't there. I could see that she was scowling and stiffening into a Mark 2 temper so I gave her an encouraging smile - which raised her as I expected to a Mark 3. Using only the above description and what you know about the world, \"Mala was scowling and stiffening into a Mark 2 temper\" is definitely correct, incorrect, or inconclusive?", "doc_id": 19, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [71, 68, 161, 162], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "``I wanted to tell you. But the Bookman asked me to keep our meeting a secret.'' How did you know I 'd met him? Using only the above description and what you know about the world, \"he had met the Bookman\" is definitely correct, incorrect, or inconclusive? Correct\n###\nBut the horror of losing was as much to do with money as with pride. Biddy had never let them down, come without fail all through the bad weather, and now was giving Nails an intensive course on her own horse which - in terms of money - was worth another couple of hundred pounds. Yet surely she knew they had no way of paying should she demand it? Using only the above description and what you know about the world, \"they had no way of paying\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: It's divided, yeah. B: Wow! A: It really is, so we've got our Cowboys here and, uh, I don't think anybody roots differently Using only the above description and what you know about the world, \"somebody roots differently\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: if you get it on sale, A: Yeah, yeah, so we bought that or we bought the filets, and then the chicken, or turkey nuggets, and I don't think anybody in my house knows the difference, unless you tell them. Using only the above description and what you know about the world, \"someone in his house knows the difference\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Well, got any problems on Mockingbird with crime or is that a crime free zone there? B: No, I don't think there is any such thing, as a crime free zone any longer. 
Using only the above description and what you know about the world, \"there is some such thing as a crime free zone\" is definitely correct, incorrect, or inconclusive?", "doc_id": 36, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [183, 62, 3, 139], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. Using only the above description and what you know about the world, \"the Soviet Union itself is a threat still\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nYou really don't know anything about me, do you, despite all that wallowing in my mind? As it happens I don't think I'm the right person to lead humanity into the future no. Using only the above description and what you know about the world, \"she is the right person to lead humanity into the future\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nPart of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. You could also say he was driven there by the railway. Using only the above description and what you know about the world, \"Gustave was driven to creative retreat in Croisset by the railway\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: And, uh, I got to stay home with my kids, which I really wanted to do, but now I could not go back and do it. B: Yeah. A: I really couldn't, I don't think I could stay home all the time and do nothing. Using only the above description and what you know about the world, \"he could stay home all the time and do nothing\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: I do too, so she couldn't possibly turn them out like some of these popular writers, B: Huh-uh. A: but oh, her books are just incredible. I don't think they've ever made a movie, do you? Using only the above description and what you know about the world, \"they've ever made a movie\" is definitely correct, incorrect, or inconclusive?", "doc_id": 12, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [43, 58, 141, 107], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "``Oh, my poor Folly... 
We 've been together for five years, Lexy and I - she practically holds that company together. Of course I gave her an ``A''. But that doesn't mean I'm having an affair with her. Using only the above description and what you know about the world, \"he is having an affair with Lexy\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nBut he ended up eating it himself. I was reluctant to kiss my mother, afraid that somehow her weakness and unhappiness would infect me. Naturally I didn't think for a minute that my life and spirit could stimulate her. Using only the above description and what you know about the world, \"her life and spirit could stimulate her mother\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: That was kind of a funny movie with, uh, Richard Dreyfuss and Bill Murray. A: Uh-huh. B: That was fun. A: Golly, I don't think that I've ever heard of that movie. Using only the above description and what you know about the world, \"he has heard of that movie\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThere was no answer. Moving carefully, Benny stepped around the edges of the room, and opened the window shutters. Her momentary horror at seeing the unmistakable form of General Etienne was only slightly dulled by the realization that the stiff posture he was in could only mean he was dead. Using only the above description and what you know about the world, \"General Etienne was dead\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: And she got kind of fearful of being on her own. She never really ate very well. It was one thing. She hardly ever took care of herself A: Yeah. B: and she didn't eat. She ate very poor so I think she was, you know, bad, uh, nutrition on top of it. And, uh, she got to the point she didn't want to alone anymore. So, A: So often I think though, elderly people don't realize that their diet is that bad. Using only the above description and what you know about the world, \"elderly people's diet is that bad\" is definitely correct, incorrect, or inconclusive?", "doc_id": 48, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [182, 196, 183, 226], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: I think so, I think, B: I really do. Oh, yeah, it's going to take, uh, you know, the police, I don't think can do it alone, you know. Using only the above description and what you know about the world, \"the police can do it alone\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: And they go down the line ten years and then on some little technicality they get out and on the streets again doing the same they did before. A: Uh-huh. B: And, you know, that's about the only thing. Like for theft and stuff like that or manslaughter, you know, I don't think they should do that. 
Using only the above description and what you know about the world, \"they should do that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. Using only the above description and what you know about the world, \"the Soviet Union itself is a threat still\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Yeah, it's interesting because, uh, we're just having conversation on this, uh, with a couple of people yesterday. And I was expressing my frustrations that, uh, so many problems, I work in a high school, are that kids don't have a degree of self-discipline which may be reflected in society at large. Uh, and you can't expect in a classroom for a particular course an hour a day to counteract, uh, sixteen or seventeen years of influence at home. B: Right. A: Um, and, it's seen more so because when you call parents up, many parents won't even recognize that there is a problem and they'll say, oh, well, my kid, I've never heard anything about this before. This is the first time there have been problems. and, you wonder, don't these parents know that teachers talk, Using only the above description and what you know about the world, \"teachers talk\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: And other than that I do not think it should be allowable. I think it should be illegal for them to want to do that. it's kind of the big brother syndrome, I mean, I just, anything like that just kind of scares me. A: I tend to view it, even though I don't think I'd work for a company that did that, I sort of want to defend an employer's rights uh, in addition to an individual's rights, Using only the above description and what you know about the world, \"she would work for a company that did that\" is definitely correct, incorrect, or inconclusive?", "doc_id": 16, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [79, 249, 98, 193], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "She didn't know if they had given themselves sufficient time to think things over before they married - that was the kind of question her sister Louise asked. Edward stayed in the Engineers for a bit, then came out and was not very successful in finding a job to suit him. That wasn't his fault and if anyone said that it was Nenna would still feel like poking a hole in them. Using only the above description and what you know about the world, \"it was Edward's fault\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nUnder the Racketeer Influenced and Corrupt Organizations law, or RICO, the government has the authority to seek to freeze or seize a defendant's assets before trial. According to individuals familiar with Mr. 
Antar's case, prosecutors issued their warning this week after one of Mr. Antar's attorneys asked whether legal fees might be subject to seizure. In a letter, prosecutors told Mr. Antar's lawyers that because of the recent Supreme Court rulings, they could expect that any fees collected from Mr. Antar may be seized. Using only the above description and what you know about the world, \"any fees collected from Mr. Antar may be seized\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``That's good.'' Ruth stood looking at her. Rachaela could imagine Emma would have been all congratulations and the joys of womanhood. Using only the above description and what you know about the world, \"Emma would have been all congratulations and the joys of womanhood\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: but that is one of my solutions. Uh... B: I know here in Dallas that they have just instituted in the last couple of years, uh, a real long period of time that you can absentee vote before the elections. And I do not think they have seen a really high improvement. Using only the above description and what you know about the world, \"they have seen a really high improvement\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nMr. Steinberg made a $59.7 million profit on the sale to Disney of his investment in the company in 1984. But lawyers said Mr. Steinberg probably faced much more potential liability because, when he sued Disney during his takeover battle, he filed on behalf of all shareholders. When Disney offered to pay Mr. Steinberg a premium for his shares, the New York investor didn't demand the company also pay a premium to other shareholders. Using only the above description and what you know about the world, \"the company would also pay a premium to other shareholders\" is definitely correct, incorrect, or inconclusive?", "doc_id": 26, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [238, 222, 104, 215], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: Well, I was never there for any sentencing. Uh, I finally got empaneled on one case, uh, on my next to the last day, and, uh, we got into the, uh, jury room to, uh, decide the case, and there was one guy on the jury who announced to everybody that he didn't need to deliberate, because he'd already decided that the guy was, uh, not guilty, and he would never vote for guilty. A: Huh. B: So, uh, they appointed me jury foreman and I, uh, didn't think that, uh, going in without deliberating allowed us to reach a verdict, Using only the above description and what you know about the world, \"going in without deliberating allowed them to reach a verdict\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: when you've lost something or uh, uh, don't have what other people have that's when you tend to realize, you know, what's out there and you know, what you have and what you don't have. 
A: Yeah I agree. B: So the original question, do we think they're you know, a security threat? Using only the above description and what you know about the world, \"they're a security threat\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\n``For such a person, finding a protector might not be so difficult, even in Edinburgh.'' Jean smiled. He might have known that even someone as sensible as Miss van Wiliamsburgh would try to make a play of this sort. Using only the above description and what you know about the world, \"even someone as sensible as Miss van Williamsburgh would try to make a play of this sort\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: I thought Carter was good too, and that was, yeah, B: Did you? I always liked him, I thought he was great at the time and I just couldn't get over the fact that Reagan beat him. you know, that I just couldn't believe that he got voted out. Using only the above description and what you know about the world, \"Carter got voted out\" is definitely correct, incorrect, or inconclusive? Correct\n###\nBut there was little chance of discovering who had killed him without help. Kelly decided that she had to talk to Annie, even if there was a risk that she would tell her husband. Bill would have a fit if he knew his apprentice was turning supersleuth. Using only the above description and what you know about the world, \"Bill's apprentice was turning supersleuth\" is definitely correct, incorrect, or inconclusive?", "doc_id": 43, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Inconclusive", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [191, 114, 20, 133], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: You know, back around, you know, in the twenties and thirties when they were growing up, uh, you know, they were all located together, in one small community. A: Right, right. Right. B: And I mean when time went on the family grew and moved away and so forth. And now when they come together it's generally, you know, like say the kids of those people who are not, you know, anywhere near one another and I do not think they feel the closeness that they used to be there. Which is a shame Using only the above description and what you know about the world, \"they feel the closeness that they used to be there\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThe Deputy Under Secretary could talk of his host's prospects and disappointments, he could learn of the problems of digging out foreign exchange and hard currency in the Third World, the tribulations over the renewal of Residence Permits, the difficulties of keeping reliable servants, but of his own world he must remain silent. The Deputy Under Secretary headed the Secret Intelligence Service of the United Kingdom, and that was not a subject matter for gossip and conversation on a bougainvillaea-fringed veranda as the lights of the fishermen's dug-outs floated inside the coral reef... No bloody way. 
He was a man who could be honest with himself and in honesty he could say that he was both pleased and relieved to be back at his desk on a grey Monday morning in London. Using only the above description and what you know about the world, \"the Deputy Under Secretary was both pleased and relieved to be back at his desk on a grey Monday morning in London\" is definitely correct, incorrect, or inconclusive? Correct\n###\nChopra stood unsteadily on his feet. The shapechanger bounded around with excitement. Chopra could tell something had happened. Using only the above description and what you know about the world, \"something had happened\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: But, uh, if the wind comes basically from the south it can be really bad. A: Uh-huh. B: Uh, the State of Wisconsin, as a matter of fact, uh, started some litigation against Illinois because of the air pollution we were getting. A: Uh-huh. B: Uh, I don't think it's going to go very far, Using only the above description and what you know about the world, \"it's going to go very far\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: how'd you like to own a piece of property where your lake is going sour because of acid rain. A: Right. Right. B: It's, uh, really a serious issue for those of us up in this, uh, sector up here. A: um, or do you hypothesize that most of the, uh, smog or air pollution comes from vehicles Using only the above description and what you know about the world, \"most of the smog or air pollution comes from vehicles\" is definitely correct, incorrect, or inconclusive?", "doc_id": 25, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [140, 74, 193, 230], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: nanny, sort of? Uh-huh. Uh-huh. B: and you know, I could envision a society where that would happen and make an interesting, uh, uh, story or whatever. A: Yeah. B: I don't think I have a philosophical problem with that. Using only the above description and what you know about the world, \"she has a philosophical problem with that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nAnna looked at Peter again and said to herself in a guilty whisper, ``Will he become even more difficult?'' She wondered if a stranger could tell that he was difficult, just by looking at him. Would such a person watching Peter now reading the prayers of Rite B in his level pleasant voice notice that resentment lay like his blood just under his skin because the life he had chosen had not turned out as he had expected it to? Using only the above description and what you know about the world, \"resentment lay just under Peter's skin\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: but that is one of my solutions. Uh... 
B: I know here in Dallas that they have just instituted in the last couple of years, uh, a real long period of time that you can absentee vote before the elections. And I do not think they have seen a really high improvement. Using only the above description and what you know about the world, \"they have seen a really high improvement\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: so I watch the fish, you know. Whatever I can do to keep myself occupied. I like to have the TV on, because that usually keeps me, um, more occupied. It kind of takes the time away and I don't realize, that's really the only time I ever watch TV, is when I'm on the bike. and then usually after I'm done riding the bike, just to cool myself down, I usually take a walk, you know, and that just kind of uh, gets me, you know, to where I'm not quite as tired I guess. But it's definitely a task. B: You think so? A: I can't say that I really enjoy it. Using only the above description and what you know about the world, \"she really enjoys it\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``There's still room for boutique properties,'' says James Barrett, president of MarCor Resorts Inc. Off the Strip, MarCor is building the Rio, a hotel-casino with a Brazilian theme and only 430 rooms -- all of them suites. Despite the proliferation of tourist distractions, Las Vegans haven't forgot that gambling is still what the town is all about. Using only the above description and what you know about the world, \"gambling is still what the town is all about\" is definitely correct, incorrect, or inconclusive?", "doc_id": 22, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [142, 89, 114, 131], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: so it's nice to get away. It's just amazing, how much you miss. B: Yeah, it, Yeah, it, yeah, it really is. I mean, I don't think I ever see the Little Dipper, Using only the above description and what you know about the world, \"she has seen the Little Dipper\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``Molly likes having an audience for her tales and it passes the hours for them.'' When Miss Louisa had a second more severe stroke at the end of August, and Miss Ellen another heart attack, both old ladies died within a few days of each other. Their friends could only feel that death was merciful in the circumstances especially with war imminent and that Molly had made the closing months of their lives very happy. Using only the above description and what you know about the world, \"death was merciful in the circumstances\" is definitely correct, incorrect, or inconclusive? 
Correct\n###\nThe Deputy Under Secretary could talk of his host's prospects and disappointments, he could learn of the problems of digging out foreign exchange and hard currency in the Third World, the tribulations over the renewal of Residence Permits, the difficulties of keeping reliable servants, but of his own world he must remain silent. The Deputy Under Secretary headed the Secret Intelligence Service of the United Kingdom, and that was not a subject matter for gossip and conversation on a bougainvillaea-fringed veranda as the lights of the fishermen's dug-outs floated inside the coral reef... No bloody way. He was a man who could be honest with himself and in honesty he could say that he was both pleased and relieved to be back at his desk on a grey Monday morning in London. Using only the above description and what you know about the world, \"the Deputy Under Secretary was both pleased and relieved to be back at his desk on a grey Monday morning in London\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: I did, too. A: I mean, it was just more for my money. B: Yeah. I didn't think it was too long at all. Using only the above description and what you know about the world, \"it was too long\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: So, I don't know I'm looking for a good year. I guess we're always looking for a good year. B: So, obviously though, do you think they're going to do anything in the playoffs to make it to the Super Bowl this year or who do you like to do that this year? A: Uh, no I don't think the Cowboys have got a chance. Using only the above description and what you know about the world, \"the Cowboys have got a chance\" is definitely correct, incorrect, or inconclusive?", "doc_id": 37, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Inconclusive", "pred_idx": 1, "target_idx": 2, "fewshot_idx": [139, 155, 66, 129], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: And, uh, I got to stay home with my kids, which I really wanted to do, but now I could not go back and do it. B: Yeah. A: I really couldn't, I don't think I could stay home all the time and do nothing. Using only the above description and what you know about the world, \"he could stay home all the time and do nothing\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: What am I afraid of? A: Yes. B: Um, I don't know if I'm really afraid of spending too much. I just, uh, don't think that I need them, you know. Using only the above description and what you know about the world, \"she needs them\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nWhy should this topic matter? You talked about everything else as you usually do. Why should I feel Maelmuire is important? Using only the above description and what you know about the world, \"Maelmuire is important\" is definitely correct, incorrect, or inconclusive? 
Incorrect\n###\nB: but I found that, uh, it was made of some material which actually ended up rusting uh, after, A: Oh. B: even, despite, you know, diligent washing, it got rusty after about, uh, three weeks of use. And I don't think it was my fault because you know, I had made a point of like drying it off and cleaning it Using only the above description and what you know about the world, \"it was his fault\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: And yet, uh, I we-, I hope to see employer based, you know, helping out. You know, child, uh, care centers at the place of employment and things like that, that will help out. A: Uh-huh. B: What do you think, do you think we are, setting a trend? Using only the above description and what you know about the world, \"they are setting a trend\" is definitely correct, incorrect, or inconclusive?", "doc_id": 15, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [51, 168, 147, 123], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Ockleton, Morpurgo, Cornelius, Dysart and half a dozen others too drunk to mention. But there was so much coming and going that any one of us could have slipped out, pushed Everett through the window and slipped back again without being noticed. Damn it all we didn't even notice Everett was missing until a porter tripped over him in the quad so anything's theoretically possible. Using only the above description and what you know about the world, \"Everett was missing\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: They have to for international trade. B: Yeah. A: But, I guess it's easier to switch back and forth than it used to be, uh, because uh, of computers coming into everything. B: Uh-huh. Yeah, I don't think switching back and forth is that big a deal. Using only the above description and what you know about the world, \"switching back and forth is that big a deal\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: I wouldn't be surprised. A: You know, because they don't want to send them to daycare. B: I doubt if they would say it was too long. Using only the above description and what you know about the world, \"it was too long\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: And I don't think that rehabilitation is effective. A: Right. Have to agree with you, and I'm kind of in favor of capital punishment also. I just don't think that it acts much as a deterrent to these people because, uh, you still see them committing the same crimes, Using only the above description and what you know about the world, \"it acts much as a deterrent to these people\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nWhether the relationship had gone beyond friendship Dalgliesh would now never know. 
She had, apparently, spent little of the money on herself, had been a dependable benefactress of the few eccentric charities of which she approved, had remembered them in her will, but without egregious generosity, and had left the residue of her estate to him without explanation, admonition or peculiar protestations of affection, although he had no doubt that the words ``my dearly beloved nephew'' meant exactly what they said. He had liked her respected her had always been at ease in her company but he had never thought that he really knew her and now he never would. Using only the above description and what you know about the world, \"Dalgliesh really knew his aunt\" is definitely correct, incorrect, or inconclusive?", "doc_id": 5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [70, 102, 138, 82], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "At the heart of the universe there is cruelty. We are predators and are preyed upon, every living thing. Did you know that wasps lay their eggs in ladybirds piercing the weak spot in their armour? Using only the above description and what you know about the world, \"wasps lay their eggs in ladybirds\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``Oh, very well,'' he said wearily. He might have known that it was useless to argue with McAllister - her tongue was as long as her will was strong. Using only the above description and what you know about the world, \"it was useless to argue with McAllister\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: but if they get the little tiny kids saving it now, in five years, when they get bigger, it'll work a little bit more, too. A: Yeah. True. B: Because it's, we've all got to do it right now. I just, I really amazed to find out that, eighty per cent are filled now, in garbage fills. In five years we're supposed to be at max. A: Uh-huh. B: I don't think I can keep my own garbage. Using only the above description and what you know about the world, \"she can keep her own garbage\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nHowever, I will let the gynandrous renegade stay here under one condition. If Penumbra questions either of us, we will not lie. If she suspects Crevecoeur is here and asks we let him go back. Using only the above description and what you know about the world, \"Crevecoeur is here\" is definitely correct, incorrect, or inconclusive? Correct\n###\nAnd I resent what happened to my flat. A couple of guys think they can stake it out and wait for me, rub their filthy fingers on my clothes, piss in my bathroom, and I'm supposed to ignore it. I know what I said about possessions being like leeches but that don't mean I 'll surrender them to a pair of punks. 
Using only the above description and what you know about the world, \"he will surrender his possessions to a pair of punks\" is definitely correct, incorrect, or inconclusive?", "doc_id": 44, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [93, 183, 121, 85], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "O listened. As he listened he could smell the man. And he could hear that the song was almost turning into a sob. Using only the above description and what you know about the world, \"the song was almost turning into a sob\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. Using only the above description and what you know about the world, \"the Soviet Union itself is a threat still\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: and then once they do get elected, they don't have the power or the authority or the willingness to do those things that they promised, you know, beforehand. B: Right. A: You know, maybe it just wasn't possible at all in the first place, you know, like the no new taxes thing. You know, that's, uh, with the economy going the way it is and everything, that was nearly ridiculous thing to, even try to do. B: Yeah. Yeah. Well, I don't think he's going to have to worry about that next year. Using only the above description and what you know about the world, \"he's going to have to worry about that next year\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nYour honesty shines out of your face, my darling. It isn't your fault that cynical men like myself won't let themselves believe what they see! I just wish you could believe that Eddie's death was an accident and nothing to do with me. Using only the above description and what you know about the world, \"Eddie's death was an accident and nothing to do with him\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: And you also get a lot of, uh, juries are extremely, uh, and from what I hear, I have some friends who do expert witness testimony and they say that, uh, juries are extremely vulnerable to, uh, sort of emotional pitches, you know, the prosecutor will want to, oh, I don't know show the mugging victim, you know, show the nice person he was and what a family life, and basically get the jury to be very sympathetic with the victim, or, uh, if it's a corporation, that was, uh, you know, harming some individual or something like that, they get very much, well, you know, it's just a big faceless corporation. let's make them pay as much as possible. Things like that. B: Uh-huh. 
A: So, not, I mean, I'm, the problem is I can't guarantee that a judge would necessarily be much better than a jury, but I'd be real nervous having a jury not at least fully agree on what the settlements would be, things like that. B: Ri-, I don't think the judge should just make the decision alone. Using only the above description and what you know about the world, \"the judge should just make the decision alone\" is definitely correct, incorrect, or inconclusive?", "doc_id": 50, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [145, 229, 91, 4], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. Using only the above description and what you know about the world, \"she wants to see that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: Well, that's kind of the way I feel about rock and roll sometimes, too, I guess. They don't really, has kind of the same sound over and over, and the other thing I don't like about it is they have a tendency to play the instrumental so loud that you can't understand what the lyrics are A: Um. Right. B: you can't understand what they're saying on some of those songs which probably is just as well on some of them, too. A: Yeah. And I can't say that I like a lot of the very modern, uh, rock and roll, Using only the above description and what you know about the world, \"she likes a lot of the very modern rock and roll\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nHe also liked swimming and cycling. He said that he wrote stories, though he had to admit that he had never got further than the first two pages. Willie meanwhile not only remained silent during these conversations but picked his berries slowly so that they might forget that he was there but he reckoned without Zach. Using only the above description and what you know about the world, \"Willie was there\" is definitely correct, incorrect, or inconclusive? Correct\n###\nSome of them, like for instance the farm in Connecticut, are quite small. If I like a place I buy it. I guess you could say it's a hobby. Using only the above description and what you know about the world, \"buying places is a hobby\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: I've never heard that one, that's very nice. Oh, so I'm all for the metric system and converting over and I think, I guess, my feeling is the way to do it is to just start giving weights, you know, have a very brief transition period and then just start giving weights and kilometers, er, just as in kilometers and weights and kilograms and everything like that and, uh, just have people start using it rather than having people constantly trying to convert. 
Remember me getting a package of something that said one pound, this is a package of dates mind you, was, presumably something you weigh fairly precisely, it said one pound and then in parenthesis it said four hundred fifty-four point six grams. B: Right, right. A: And, as near as I could tell, seeing that was basically anti-metric propaganda cause anyone who would say, well look I can either buy a pound of something at four hundred sixty-four point six grams which, of course, they couldn't weigh it out accurately anyway, um, every time I see something like that I think, well, that's an anti-metric argument. B: Yeah. Well, uh, I don't think it could ever happen with a quick transition. Using only the above description and what you know about the world, \"it could happen with a quick transition\" is definitely correct, incorrect, or inconclusive?", "doc_id": 23, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [148, 222, 216, 78], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: But, uh, uh, I don't understand, I guess, why the schools seem to have such a high dropout rate in the big cities. B: Uh, well, I don't pretend to understand that either. Uh, but I'm not quite sure that it's the kind of thing that ought to be blamed on the schools. But then, again, I'm not quite sure where the blame ought to be put. Uh, because the dropout rate is, in those areas, w-, it's high in those areas where also there's poverty and crime. And they all seem to go together. And it seems like if you could eliminate one of the parts of that circle, where you have the dropout rate and crime and, you know, general poverty kind of conditions, that things ought to get better. So, uh, the other two a-, they're all three social issues and could be addressed by the government in any ways. And clearly, to me, is a kind of government thing to fix but it's just like, I don't expect them to know which part is best to fix just like I don't know. it's a complicated issue. I still don't think I would blame it directly on the school. Using only the above description and what you know about the world, \"he would blame it directly on the school\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: when you've lost something or uh, uh, don't have what other people have that's when you tend to realize, you know, what's out there and you know, what you have and what you don't have. A: Yeah I agree. B: So the original question, do we think they're you know, a security threat? Using only the above description and what you know about the world, \"they're a security threat\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nA: Well I, uh, when is your next one, uh, scheduled now. B: Well it's like, the last one was my high school graduation the next one was when I graduated from college, so I guess about two more years. A: Yes, well, and do you think you'll have a baby to take back with you. 
Using only the above description and what you know about the world, \"speaker B will have a baby to take back with her\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nHow do you know? she was going to ask, but his smile was answer enough. If DeVore said there was going to be a vacancy there would be a vacancy. Using only the above description and what you know about the world, \"there was going to be a vacancy\" is definitely correct, incorrect, or inconclusive? Correct\n###\n'Very well, I'll go. But I pick my own men, and if we ever have to fight, you obey my word.'' Jehan did not think that Sidacai was in any position to impose conditions but he sat back in his chair considering. Using only the above description and what you know about the world, \"Sidacai was in a position to impose conditions\" is definitely correct, incorrect, or inconclusive?", "doc_id": 54, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [233, 198, 0, 189], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: That is the reason, I don't play over there. B: Yeah. A: I like the course, but I don't play over there because, they don't, uh, you know don't allow you to pull a cart. B: Right. A: And, I don't think a cart damages the turf. Using only the above description and what you know about the world, \"a cart damages the turf\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: Uh, I have, uh, I guess a lot of thoughts about the Vietnam War, um, I guess I feel like I was pretty young while it was going on and so there's probably a lot of things I remember and a lot of things that I really didn't have a clue as to what was happening. B: Yeah. A: Um, looking back, like maybe some of the things that I know now, I'm not sure I do believe it was worth the cost in dollars and lives. That was one of the questions that she asked us to think about, because we never went to war. I don't think we were committed to winning it and getting out Using only the above description and what you know about the world, \"they were committed to winning the Vietnam War and getting out\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nIt was a complex language. Not written down but handed down. One might say it was peeled down. Using only the above description and what you know about the world, \"the language was peeled down\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: The one thing I sometimes wonder about, um, in civil cases is, uh, whether, especially sort of in, uh, maybe like product liability, or medical malpractice, where there's, um, sort of a very technical decision to be made sometimes B: Yes. 
A: you know, it's not just a matter um, of, you know, did this guy rip off this guy, and it's just a matter of interpreting a contract, it's sort of a matter of, um, you know, sometimes getting into very technical issues, and I wonder um, if the system works adequately in educating the jurors about, uh, whatever, um, you know, issue is under discussion. B: I don't think that they educate them enough to really know what's going on. Using only the above description and what you know about the world, \"they educate the jurors enough to really know what's going on\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: Because too often, there can be extremism that hurts from any direction, regardless of whatever you're arguing or concerned about. A: Yeah. Right. Yeah, I know, you're right, they would lobby that and I see that, and that's why, you know, I'm like, okay, what's my role in this thing,, you know, what's my part, B: Yeah. A: because I don't think the system is going to get fixed. Using only the above description and what you know about the world, \"the system is going to get fixed\" is definitely correct, incorrect, or inconclusive?", "doc_id": 33, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [115, 113, 166, 90], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "I WAS really only a bystander in the tragedy of young Mr and Mrs McLeod. It was not really my business although it could be said that I had known them both - had seen them about - for most of their lives. Using only the above description and what you know about the world, \"she had known Mr. and Mrs. McLeod for most of their lives\" is definitely correct, incorrect, or inconclusive? Correct\n###\nI spent just over an hour with Patterson which, I found out later, turned out to be another first. Time is money in the City and few people are worth an hour unless it's over lunch and only then if you're involved in a takeover bid. It was also I learned one of the few occasions anyone at PKB could remember that Patterson had a meeting with his door shut and nobody got fired. Using only the above description and what you know about the world, \"Patterson had a meeting with his door shut and nobody got fired\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: but at the same time I think it would do them a world of good. B: Yeah. A: But there's a, B: I don't know that you could require everyone yeah, to do it for a whole year, or two years or something like that, Using only the above description and what you know about the world, \"speaker A could require everyone to do it for a whole year\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nJoseph spat and spluttered blood. He had lost the two centre top teeth and with the tip of his tongue he could feel that the two on either side were also loose. 
Using only the above description and what you know about the world, \"the two teeth on either side were also loose\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: I suppose so. Let me push the button. B: All right. A: Okay, uh, I guess I'm supposed to be all for switching to the metric system, but, uh, I sense that it's not going to happen anytime soon. B: Yeah, I don't think it's going to happen either, Using only the above description and what you know about the world, \"it's going to happen\" is definitely correct, incorrect, or inconclusive?", "doc_id": 32, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [207, 96, 109, 174], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: you know, sometimes I would go over, but you know, it wouldn't hit me in a big way because I knew that, uh, I would have it covered in that respect. A: Right. Right. That's good. I don't think we've gone that far, to pay it you know, in advance before we spend it, Using only the above description and what you know about the world, \"they've gone that far\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThen it cried. It was another girl. I was a little disappointed but I could only hope that Celia was still a bit hazy from the drugs. Using only the above description and what you know about the world, \"Celia was still a bit hazy from the drugs\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nAnd I was excited by my new importance. Proud and pleased to be playing opposite Frank Donovan who had once stood in for Hayden Coffin. Occasionally perhaps I should notice that he was not the jovial easy-going character I remembered from my humble place in the chorus. Using only the above description and what you know about the world, \"Frank Donovan was not the jovial easy-going character she remembered\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: I think in s-, and it, just would depend upon the circumstances and the extent of the abuse and if another alternative was available. A: Uh-huh. Uh-huh. Um. Uh-huh. You know, now, I wonder what you think about this and, uh, unfortunately, we don't get to do it, but, uh, it used to be a long time ago, I guess in Biblical times when they had punishment, if somebody did something, for example, to your family, then you had the right to administer the punishment. So if somebody killed somebody in your family, then uh, if that person was caught and found guilty, you had the right to, uh, execute that person. And I know that, uh, if somebody had done something to my family, I would feel that I had the right to get revenge on them uh, but, I don't think that's done much anywhere. Using only the above description and what you know about the world, \"that's done much anywhere\" is definitely correct, incorrect, or inconclusive? 
Incorrect\n###\nThen they would awake, terrified and sweating, to find themselves in white starched linen, in a comfortable bed, in peaceful England. And all would be well. It may be said that although he survived it the siege nevertheless had a bad effect on the Collector. Using only the above description and what you know about the world, \"the siege nevertheless had a bad effect on the Collector\" is definitely correct, incorrect, or inconclusive?", "doc_id": 30, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [148, 1, 47, 41], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: But, uh, uh, I don't understand, I guess, why the schools seem to have such a high dropout rate in the big cities. B: Uh, well, I don't pretend to understand that either. Uh, but I'm not quite sure that it's the kind of thing that ought to be blamed on the schools. But then, again, I'm not quite sure where the blame ought to be put. Uh, because the dropout rate is, in those areas, w-, it's high in those areas where also there's poverty and crime. And they all seem to go together. And it seems like if you could eliminate one of the parts of that circle, where you have the dropout rate and crime and, you know, general poverty kind of conditions, that things ought to get better. So, uh, the other two a-, they're all three social issues and could be addressed by the government in any ways. And clearly, to me, is a kind of government thing to fix but it's just like, I don't expect them to know which part is best to fix just like I don't know. it's a complicated issue. I still don't think I would blame it directly on the school. Using only the above description and what you know about the world, \"he would blame it directly on the school\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nIt is part of their religion, a religion I do not scoff at as it holds many elements which match our own even though it lacks the truth of ours. At one of their great festivals they have the ritual of driving out the devils from their bodies. First the drummers come on - I may say that no women are allowed to take part in this ritual and the ladies here will perhaps agree with me that they are fortunate in that omission. Using only the above description and what you know about the world, \"no women are allowed to take part in this ritual\" is definitely correct, incorrect, or inconclusive? Correct\n###\nIt - the tractor, the boys and the bulbs in the earth - knew she had chosen for them and was coming back to them. Of course there was still love, there was healthy, growing love and its name was called Work. She had fallen in love with it so slowly and gently and sweetly that she had never noticed it had happened. Using only the above description and what you know about the world, \"falling in love had happened\" is definitely correct, incorrect, or inconclusive? 
Correct\n###\nMost of them young, about his age, stood and talked and drank and laughed. The two girls he had noticed earlier were standing talking to some other girls. Graham hoped they all realised that just because he was standing talking to Slater that didn't mean he was gay too. Using only the above description and what you know about the world, \"Graham was gay too\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nValence the void-brain, Valence the virtuous valet. Why couldn't the figger choose his own portion of titanic anatomy to shaft? Did he think he was helping? Using only the above description and what you know about the world, \"Valence was helping\" is definitely correct, incorrect, or inconclusive?", "doc_id": 0, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [230, 91, 215, 39], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: so I watch the fish, you know. Whatever I can do to keep myself occupied. I like to have the TV on, because that usually keeps me, um, more occupied. It kind of takes the time away and I don't realize, that's really the only time I ever watch TV, is when I'm on the bike. and then usually after I'm done riding the bike, just to cool myself down, I usually take a walk, you know, and that just kind of uh, gets me, you know, to where I'm not quite as tired I guess. But it's definitely a task. B: You think so? A: I can't say that I really enjoy it. Using only the above description and what you know about the world, \"she really enjoys it\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nHe also liked swimming and cycling. He said that he wrote stories, though he had to admit that he had never got further than the first two pages. Willie meanwhile not only remained silent during these conversations but picked his berries slowly so that they might forget that he was there but he reckoned without Zach. Using only the above description and what you know about the world, \"Willie was there\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: I thought Carter was good too, and that was, yeah, B: Did you? I always liked him, I thought he was great at the time and I just couldn't get over the fact that Reagan beat him. you know, that I just couldn't believe that he got voted out. Using only the above description and what you know about the world, \"Carter got voted out\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThey 'd seen Miss Lavant on the promenade and about the town, always walking slowly, sometimes with a neat wicker basket. Kate had often thought she was beautiful. She hadn't known she was in love with Dr Greenslade who had a wife already and three children. Using only the above description and what you know about the world, \"Miss Lavant was in love with Dr Greenslade\" is definitely correct, incorrect, or inconclusive? 
Correct\n###\nA: I'm like, I'll get a job some day and my boss will pay for it, I'll be needed. B: Yeah. A: Because, um, I didn't want to go do it myself because I didn't think I was really going to use it. Using only the above description and what you know about the world, \"he was really going to use it\" is definitely correct, incorrect, or inconclusive?", "doc_id": 11, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [190, 56, 233, 39], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: The kind you have sounds very interesting, though. A: Well, um, they're really sweet dogs. uh, we thought probably our neighborhood in Houston had more of this breed than any other place just because of, um, the family that had them and bred them. Using only the above description and what you know about the world, \"their neighborhood in Houston had more of this breed than any other place because of the family that had them and bred them\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``It's not your day, is it, dear?'' ``No, but they 've got to be done, and Shirley's making the tea.'' Ianthe had not told her mother that she sometimes had to dust the books in the library. Using only the above description and what you know about the world, \"Ianthe sometimes had to dust the books in the library\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: That is the reason, I don't play over there. B: Yeah. A: I like the course, but I don't play over there because, they don't, uh, you know don't allow you to pull a cart. B: Right. A: And, I don't think a cart damages the turf. Using only the above description and what you know about the world, \"a cart damages the turf\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThey 'd seen Miss Lavant on the promenade and about the town, always walking slowly, sometimes with a neat wicker basket. Kate had often thought she was beautiful. She hadn't known she was in love with Dr Greenslade who had a wife already and three children. Using only the above description and what you know about the world, \"Miss Lavant was in love with Dr Greenslade\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Well, actually, uh, A: I don't think I'm in the, uh, majority in Texas Using only the above description and what you know about the world, \"she is in the majority in Texas\" is definitely correct, incorrect, or inconclusive?", "doc_id": 39, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [86, 214, 239, 222], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "I can't afford to get bogged down in the weeds. But at least you know she did leave. Maybe a coincidence maybe the two girls talked on the phone decided they 'd both had enough. Using only the above description and what you know about the world, \"the two girls had both had enough\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nB: So am I. A: Are you, B: You know, I think it's kind of coming back around to that, don't you, Using only the above description and what you know about the world, \"it's kind of coming back around to that\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: And the tanks came in and, you know, pretty much took care of that. A: Exactly. B: And, A: Yeah, uh, that, personally I don't see as Gorbachev as being maybe a threat, and I think he's actually, honestly trying to do some change. B: Uh-huh. A: But I don't believe that he, in this first pass around, you know, being the first one to really turn things around or attempt to is going to be allowed to get away with it either. Using only the above description and what you know about the world, \"Gorbachev is going to be allowed to get away with doing some change\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: when you've lost something or uh, uh, don't have what other people have that's when you tend to realize, you know, what's out there and you know, what you have and what you don't have. A: Yeah I agree. B: So the original question, do we think they're you know, a security threat? Using only the above description and what you know about the world, \"they're a security threat\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nThe other set being in the pocket of Giovanna's overall from which she now drew them and held them up making it clear that they would be relinquished only upon her death and then only into the hands of Signor Kettering. What had occurred was quite contrary to the wishes of the padrone who would be outraged if he ever got to hear of it. Despite this disastrous beginning however Giovanna would be there in the morning her own family circumstances permitting and she would be much obliged if the Signora would make sure that her children were up and dressed and the breakfast eaten so that she could see that the house was returned to something like the order which Signor Kettering expected of it. Using only the above description and what you know about the world, \"the house was returned to something like the order which Signor Kettering expected of it\" is definitely correct, incorrect, or inconclusive?", "doc_id": 20, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [17, 123, 85, 11], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "She might have sat all afternoon, nibbling and stuporous, exhausted but not sleepy. But the glazier finally came down from the upper floor, cheerfully announcing that all was now right and tight and he would be on his way. Maggie could tell that he would have liked to stop for a chat that he felt sorry for her left on her own but she lacked either her grandmother's grace or her mother's energy so she did not offer him tea. Using only the above description and what you know about the world, \"the glazier would have liked to stop for a chat\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: And I don't think that rehabilitation is effective. A: Right. Have to agree with you, and I'm kind of in favor of capital punishment also. I just don't think that it acts much as a deterrent to these people because, uh, you still see them committing the same crimes, Using only the above description and what you know about the world, \"it acts much as a deterrent to these people\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nYour honesty shines out of your face, my darling. It isn't your fault that cynical men like myself won't let themselves believe what they see! I just wish you could believe that Eddie's death was an accident and nothing to do with me. Using only the above description and what you know about the world, \"Eddie's death was an accident and nothing to do with him\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``Look, lady, give me a break. I just deliver the stuff, I don't interview it for the Sunday papers.'' He waved the paper at her and even at a distance she could see that it said very little. Using only the above description and what you know about the world, \"the paper said very little\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Really. If they were to take half of what they spend on that and put it on some economic, you know, intergovernmental or inter United States like programs that one really might. B: Yeah. A: I believe in paying my share, and I don't mind, uh, paying for some of these fringe benefits that people are entitled to. But I just sometimes feel like I'm being used. But, uh, again I don't think we'll be able to do anything about it, Using only the above description and what you know about the world, \"they'll be able to do anything about it\" is definitely correct, incorrect, or inconclusive?", "doc_id": 17, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [126, 230, 59, 236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: Yeah, they did. 
They put a lot of pressure on him from the outside and from the inside. Uh, it's funny watching them play, he's probably like a lot of quarterbacks, uh, when the pressure is really on when it's down to the last few minutes of the game for the season is when the guys seem to really do their best. B: Uh-huh. A: And I haven't quite figured that out, if they figure they have got it won or if there's no real hurry because the first three quarters or, uh, uh, if something happens that that adrenalin starts flowing. Using only the above description and what you know about the world, \"they have got it won\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nA: so I watch the fish, you know. Whatever I can do to keep myself occupied. I like to have the TV on, because that usually keeps me, um, more occupied. It kind of takes the time away and I don't realize, that's really the only time I ever watch TV, is when I'm on the bike. and then usually after I'm done riding the bike, just to cool myself down, I usually take a walk, you know, and that just kind of uh, gets me, you know, to where I'm not quite as tired I guess. But it's definitely a task. B: You think so? A: I can't say that I really enjoy it. Using only the above description and what you know about the world, \"she really enjoys it\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``Ely,'' I said (that was her name and the first time I 'd ever used it), ``I want to be free.'' She looked stunned. I don't think she 'd considered this. Using only the above description and what you know about the world, \"Ely had considered him wanting to be free\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Yeah, I think that's what aggravates a lot of people, is somebody does get a life sentence in place of the death penalty, and they wind up back on the streets after five years or six years or like the kid on the news tonight out in Mesquite who was out in six months.. B: Uh-huh. Yeah, it's just our criminal system is just so, I guess, overloaded, but the problem is not so much with the prison system, you know, I mean, because the cops are out there doing their job enforcing the laws, and the prison system are just, you know, they're trying to cope with them, but, you know, the thing about capital punishment I, you know, a lot of people don't think it would be a deterrent, uh, to future crime, Using only the above description and what you know about the world, \"capital punishment would be a deterrent to future crimes\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nLouise was equally anxious to see this man who had had the power to persuade her niece to go against her upbringing and character and behave so recklessly after such a brief acquaintance. Waiting in the airport she was suddenly aware of Nora striding towards her. She was impressed by how elegant she looked and could tell by her cousin's walk that Nora also felt that she was looking good. 
Using only the above description and what you know about the world, \"that Nora also felt that she was looking good\" is definitely correct, incorrect, or inconclusive?", "doc_id": 41, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [18, 112, 243, 33], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Jim waited. He waited a long time, and when the young doctor finally came out, it was almost dark. Jim could nonetheless tell by his anxious face that something was wrong. Using only the above description and what you know about the world, \"something was wrong\" is definitely correct, incorrect, or inconclusive? Correct\n###\nFor a while the notion gripped him, and he prowled the many floors of Hamleys looking for toys. He bought a magic set for Sebastian (although his ideal present for the kid would have been a brand-new name) and a marionette for Louise. He could remember that there was an age for puppets and magic just as he could remember the time that he 'd spent trying to fan a deck of cards or sitting in front of a mirror trying to get the hard consonants down like a real ventriloquist. Using only the above description and what you know about the world, \"there was an age for puppets and magic\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThe South Korean government is signing a protocol today establishing formal diplomatic relations with Poland. The two are also signing a trade agreement. South Korean government officials said they don't expect that Seoul can loan money to Warsaw, but it can ``offer experience.'' Using only the above description and what you know about the world, \"Seoul can loan money to Warsaw\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``I hope you are settling down and the cat is well.'' This was a lie. She did not hope the cat was well. Using only the above description and what you know about the world, \"the cat was well\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nA: so I don't know if I wasn't drug tested based on that or because the man who hired me didn't request the drug test, because I know that my company does drug testing on occasion. B: Right. Well, for instance, does the company you worked for before have the right or do they have the ability to say, hey, we've already drug tested her and she came up negative. 
A: Well, no, I don't think they can force another company to not drug test me just by saying that I didn't, I mean, Using only the above description and what you know about the world, \"they can force another company to not drug test her\" is definitely correct, incorrect, or inconclusive?", "doc_id": 3, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Inconclusive", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [231, 49, 26, 152], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: No, it was, I didn't like the way it ended. A: I know, well the only reason I know why it ended is on Arsenio Hall one night, Christopher Reeves told, that, you know, B: Uh-huh. A: I can't believe they killed them. Using only the above description and what you know about the world, \"they killed them\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``His name is Matthew Blake,'' Mandy informed Charity as they descended the steps from their cabin on to the paved pathway that led to the lodge. Thankfully she hadn't even noticed that Charity had changed from the blue wrap-around skirt and was now wearing red shorts with her white silk blouse. Using only the above description and what you know about the world, \"Charity had changed from the blue wrap-around skirt\" is definitely correct, incorrect, or inconclusive? Correct\n###\nI should dearly have liked to know whether they were Europeans or Americans, but I couldn't hear the accents. They appeared to be arguing. I hoped the white men weren't telling him to eliminate all witnesses because I don't believe it would have needed much persuasion. Using only the above description and what you know about the world, \"eliminating all witnesses would have needed much persuasion\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: That might be kind of tough, huh. A: It really would, yes, yes, and like I said, my sister's still in it, and I really don't think my mother'd want to be there, either. Using only the above description and what you know about the world, \"his mother would want to be there\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: And I haven't quite figured that out, if they figure they have got it won or if there's no real hurry because the first three quarters or, uh, uh, if something happens that that adrenalin starts flowing. They say, hey, we got to do something now. And then start playing the game the way the game should be played toward the last few minutes. B: Yeah. A: So, I don't know I'm looking for a good year. I guess we're always looking for a good year. 
B: So, obviously though, do you think they're going to do anything in the playoffs to make it to the Super Bowl this year Using only the above description and what you know about the world, \"they're going to do anything in the playoffs to make it to the Super Bowl this year\" is definitely correct, incorrect, or inconclusive?", "doc_id": 8, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [14, 75, 15, 8], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "And why bother to write anyway? What was there to say? Mary had some vague idea that Adam's parents might suspect he was down here and come to see him. Using only the above description and what you know about the world, \"Adam was down here\" is definitely correct, incorrect, or inconclusive? Correct\n###\nJean was tough and liked to drink. She would endure for a long while yet. But what would she do when she realized that with things as they were she was on a life sentence not just a temporary suspension of essential pleasure? Using only the above description and what you know about the world, \"Jean was on a life sentence\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``And you're not having this dress,'' Nora said, bending down to look at the price tag. ``It's two and a half guineas!'' she hissed at Louise who could tell that she was genuinely appalled. Using only the above description and what you know about the world, \"Nora was genuinely appalled\" is definitely correct, incorrect, or inconclusive? Correct\n###\nLike now. The Community in Knockglen would defend Eve vociferously. Even some of the Sisters here in Dublin might see that the girl had a point. Using only the above description and what you know about the world, \"the girl had a point\" is definitely correct, incorrect, or inconclusive? Correct\n###\nHe left his own number, calling himself Alain - the name of her Malaysian-French ``business manager''. The next day Gina rang. She had obviously not noticed that it was her own number. Using only the above description and what you know about the world, \"it was Gina's own number\" is definitely correct, incorrect, or inconclusive?", "doc_id": 29, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [31, 99, 45, 21], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Nora calculated that there must be lots of single men up there so she decided it was ideal for the ``manhunt'', as we called it, even though the train fare was a serious consideration. I remember we went up to Euston together one Saturday morning, very excited, to buy the ticket in advance. It was a secret - if my parents had known she was going away alone they would have soon put the kybosh on it. Using only the above description and what you know about the world, \"Nora was going away alone\" is definitely correct, incorrect, or inconclusive? Correct\n###\nMeh ' Lindi did not develop the lower set of arms nor the bony, sinuous tail. Too much to expect a new pair of arms to grow out of her ribs, or her coccyx to elongate so enormously. Nor could Jaq imagine that she could attain the full strength of a purestrain Stealer - though her own strength was formidable even when unenhanced. Using only the above description and what you know about the world, \"Meh ' Lindi could attain the full strength of a purestrain Stealer\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nWhat must it be like to be imprisoned here, day after day, month after month? I wonder, does he keep them chained and manacled, thought Fenella, or does he use sorcery? And so utterly immersed was she in this strange blue and green land that was not feeling strange any more that she did not even notice that she was weighing sorcery against steel chains and seriously considering the likely outcome. Using only the above description and what you know about the world, \"Fenella was weighing sorcery against steel chains\" is definitely correct, incorrect, or inconclusive? Correct\n###\nIt seemed impossible that anyone could endure such pain for so long, but at last the doors of the Renault slammed and there was comparative silence. The engine was started up, revving violently as the car was turned round on the narrow road. John could tell that it was being driven back up the hill towards Putna. Using only the above description and what you know about the world, \"the car was being driven back up the hill towards Putna\" is definitely correct, incorrect, or inconclusive? Correct\n###\nAnd I don't want to have to lie to them. The kidnappers have given us until October the eleventh to deliver the document and I haven't despaired of finding it before then. But if the police learn I 've been to America they 'll ask why. Using only the above description and what you know about the world, \"he's been to America\" is definitely correct, incorrect, or inconclusive?", "doc_id": 13, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [49, 88, 236, 174], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "``His name is Matthew Blake,'' Mandy informed Charity as they descended the steps from their cabin on to the paved pathway that led to the lodge. Thankfully she hadn't even noticed that Charity had changed from the blue wrap-around skirt and was now wearing red shorts with her white silk blouse. Using only the above description and what you know about the world, \"Charity had changed from the blue wrap-around skirt\" is definitely correct, incorrect, or inconclusive? Correct\n###\nShe longed for a weapon, for even a hairpin, and knowing that she did not have one, she knew too that she was totally defenceless, unarmed and alone. She could feel the great flight of the dragon and sensed that she was high in air and travelling fast towards the sunset. She could feel the great muscles of the dragon's wings send ripplings down the stomach walls and she gave herself over to death. Using only the above description and what you know about the world, \"the great muscles of the dragon's wings sent ripplings down the stomach walls\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Yeah, I think that's what aggravates a lot of people, is somebody does get a life sentence in place of the death penalty, and they wind up back on the streets after five years or six years or like the kid on the news tonight out in Mesquite who was out in six months.. B: Uh-huh. Yeah, it's just our criminal system is just so, I guess, overloaded, but the problem is not so much with the prison system, you know, I mean, because the cops are out there doing their job enforcing the laws, and the prison system are just, you know, they're trying to cope with them, but, you know, the thing about capital punishment I, you know, a lot of people don't think it would be a deterrent, uh, to future crime, Using only the above description and what you know about the world, \"capital punishment would be a deterrent to future crimes\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: I think in s-, and it, just would depend upon the circumstances and the extent of the abuse and if another alternative was available. A: Uh-huh. Uh-huh. Um. Uh-huh. You know, now, I wonder what you think about this and, uh, unfortunately, we don't get to do it, but, uh, it used to be a long time ago, I guess in Biblical times when they had punishment, if somebody did something, for example, to your family, then you had the right to administer the punishment. So if somebody killed somebody in your family, then uh, if that person was caught and found guilty, you had the right to, uh, execute that person. And I know that, uh, if somebody had done something to my family, I would feel that I had the right to get revenge on them uh, but, I don't think that's done much anywhere. Using only the above description and what you know about the world, \"that's done much anywhere\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``But my father always taught me never to be afraid of pointing out the obvious. 
I'm sure you have noticed the implication of the letter, that the writer has in fact observed Jenny undressing for bed?'' I just wondered if you also knew as I'm sure you do that her bedroom's at the rear of the house? Using only the above description and what you know about the world, \"Jenny's bedroom's at the rear of the house\" is definitely correct, incorrect, or inconclusive?", "doc_id": 2, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [31, 54, 39, 125], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Nora calculated that there must be lots of single men up there so she decided it was ideal for the ``manhunt'', as we called it, even though the train fare was a serious consideration. I remember we went up to Euston together one Saturday morning, very excited, to buy the ticket in advance. It was a secret - if my parents had known she was going away alone they would have soon put the kybosh on it. Using only the above description and what you know about the world, \"Nora was going away alone\" is definitely correct, incorrect, or inconclusive? Correct\n###\nHer priggishness. I admire it. I know she does wrong things she tries to organize other people's lives she can't see Mr Knightley is a man in a million. Using only the above description and what you know about the world, \"Mr Knightley is a man in a million\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThey 'd seen Miss Lavant on the promenade and about the town, always walking slowly, sometimes with a neat wicker basket. Kate had often thought she was beautiful. She hadn't known she was in love with Dr Greenslade who had a wife already and three children. Using only the above description and what you know about the world, \"Miss Lavant was in love with Dr Greenslade\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: And I've worked in the hospital for fifteen years and I've taken care of a few AIDS patients. A: Uh-huh. B: Uh, when they asked us did we want to, uh, keep it the same or, uh, spend more, spend less, uh, I think right now what they're spending is adequate. Uh, for my personal opinion. Uh, because I think it's something that's going to take them a while to come up with a, uh, vaccine for. A: Yeah. Uh-huh. Uh-huh. B: I don't think it's going to be that easy to come up with Using only the above description and what you know about the world, \"it is going to be that easy to come up with\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: You never see them on nationally basketball. A: You know, that's true. I don't think I've ever seen them nationally on basketball. 
Using only the above description and what you know about the world, \"he has seen them nationally on basketball\" is definitely correct, incorrect, or inconclusive?", "doc_id": 28, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [86, 29, 170, 110], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "I can't afford to get bogged down in the weeds. But at least you know she did leave. Maybe a coincidence maybe the two girls talked on the phone decided they 'd both had enough. Using only the above description and what you know about the world, \"the two girls had both had enough\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nJed wondered. He 'd scarcely set eyes on him since the night they 'd had dinner together at the house in Westwood. Nobody had mentioned him either and Jed didn't feel he should ask. Using only the above description and what you know about the world, \"Jed should ask\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: they did things for people, you know, for their communities, for their, uh, families, for their friends, where now, I'm not sure they really do. A: Yes. Yeah. Well, I think sometimes through groups and organizations, um, uh, when they asked the question I thought, well that sounds wonderful. And then, I wondered if people were unwilling but I think even if you went in with a negative attitude I don't think it would stay negative very long. Using only the above description and what you know about the world, \"that attitude would stay negative very long\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``And you're wrong about us. We're not children and I 'd say we're learning the rules pretty quickly.'' You may have noticed I'm not shaking any more! Using only the above description and what you know about the world, \"he's not shaking any more\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Yes, um, I think that Plano has really done a fantastic job. I mean, at least their plans are good. Um, however, I was, maybe you saw in the paper this morning that, um, they've had some problems with, the recycling on plastic, Using only the above description and what you know about the world, \"they've had some problem with the recycling on plastic\" is definitely correct, incorrect, or inconclusive?", "doc_id": 49, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Incorrect", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [55, 87, 69, 183], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Only she herself knew the evil thoughts she had and how effortlessly they could be translated into action. ``I 'll make a cup of tea.'' No she would not tell Peter that the person he loved most in the world was dead. Using only the above description and what you know about the world, \"the person Peter loved most in the world was dead\" is definitely correct, incorrect, or inconclusive? Correct\n###\nCountry churches were never locked. You could wander in at any time. Perhaps Cromwell when he passed also found the door of Coldingham Priory locked and decided that he would get in anyway even if it meant removing a whole wall in order to do so. Using only the above description and what you know about the world, \"Cromwell would get in anyway\" is definitely correct, incorrect, or inconclusive? Correct\n###\nI ducked so fast I wasn't sure whether he 'd seen me or not, but it gave me a prickly feeling just to imagine it, so I scuttled for the door and legged it up the spiral stairway three steps at a time, just in case. As I ran, I remember thinking stupid thoughts like. How did he know I was up here looking down? Using only the above description and what you know about the world, \"he was up there looking down\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. Using only the above description and what you know about the world, \"the Soviet Union itself is a threat still\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: No, I don't either. B: Uh, I mean it's, you know it, A: I don't think it's going to change very much Using only the above description and what you know about the world, \"it's going to change very much\" is definitely correct, incorrect, or inconclusive?", "doc_id": 27, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [137, 158, 20, 11], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: Yeah, that's crazy. B: and then you come here in the Dallas area, um, I don't believe that people should be allowed to carry guns in their vehicles. Using only the above description and what you know about the world, \"people should be allowed to carry guns in their vehicles\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: But, uh, B: Okay. Uh, uh, I've had one or two American cars I think, and they were okay. 
I had a Pontiac once and I never had a problem with it, but, uh, my mother had a Dodge at one point and I had driven it a few times and I really did not feel that I would buy a Dodge just from, Using only the above description and what you know about the world, \"she would buy a Dodge\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nChopra stood unsteadily on his feet. The shapechanger bounded around with excitement. Chopra could tell something had happened. Using only the above description and what you know about the world, \"something had happened\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``Look, lady, give me a break. I just deliver the stuff, I don't interview it for the Sunday papers.'' He waved the paper at her and even at a distance she could see that it said very little. Using only the above description and what you know about the world, \"the paper said very little\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThe lunch trade had mostly disappeared so he wasn't hard to spot. He was at a window table but he was ignoring the river, being deep in conversation with a middle-aged man wearing a suit and a short sheepskin car coat with matching brown suede shoes. Even from this distance you could guess the guy's tailor was based in Dublin. Using only the above description and what you know about the world, \"the guy's tailor was based in Dublin\" is definitely correct, incorrect, or inconclusive?", "doc_id": 55, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [199, 140, 96, 169], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: um, they try to encourage you to follow a specific curriculum, although you don't have to. A: Uh-huh. B: And then if you have particular religious beliefs they're kind of monitored. You know, they will allow you to, I can't think of any examples but certain religious groups don't want their children in public schools because the influence. And maybe they were a group of Mennonites or something like that. A: Uh-huh. B: I don't think they're were in this area Using only the above description and what you know about the world, \"they were in this area\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: nanny, sort of? Uh-huh. Uh-huh. B: and you know, I could envision a society where that would happen and make an interesting, uh, uh, story or whatever. A: Yeah. B: I don't think I have a philosophical problem with that. Using only the above description and what you know about the world, \"she has a philosophical problem with that\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThen it cried. It was another girl. I was a little disappointed but I could only hope that Celia was still a bit hazy from the drugs. Using only the above description and what you know about the world, \"Celia was still a bit hazy from the drugs\" is definitely correct, incorrect, or inconclusive? 
Inconclusive\n###\nA: And now it's election time again so they're trying to lower them. B: Oh. A: So they're just talk about lowering them but they never do, they just keep raising them. B: I've never seen taxes really go down. Using only the above description and what you know about the world, \"taxes would really go down\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nIt grew bigger with incredible speed, she was whizzing towards it. She must slow down or she 'd miss it. She took her foot off the accelerator and put it on the brake and as the car slowed she could see now that it was a child a toddler with a red woolly hat on. Using only the above description and what you know about the world, \"it was a child\" is definitely correct, incorrect, or inconclusive?", "doc_id": 7, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [104, 188, 43, 178], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "``For such a person, finding a protector might not be so difficult, even in Edinburgh.'' Jean smiled. He might have known that even someone as sensible as Miss van Wiliamsburgh would try to make a play of this sort. Using only the above description and what you know about the world, \"even someone as sensible as Miss van Williamsburgh would try to make a play of this sort\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: Right, you know, like In packaging A: Yeah. B: and, uh, you know, just goodness. A: Yeah, I don't think they do the packaging at this plant, Using only the above description and what you know about the world, \"they do the packaging at this plant\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\n``Oh, my poor Folly... We 've been together for five years, Lexy and I - she practically holds that company together. Of course I gave her an ``A''. But that doesn't mean I'm having an affair with her. Using only the above description and what you know about the world, \"he is having an affair with Lexy\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: Boy that's scary, isn't it. B: Oh, can you imagine, because it happens in the middle of the night, so you know, these parents didn't know the kid was gone until the kid is knocking on the door screaming, let me in. Using only the above description and what you know about the world, \"the kid was gone\" is definitely correct, incorrect, or inconclusive? Correct\n###\nBiddy was right. In London, I did some serious thinking. I could see that my character had not improved since I had heard about my expectations. 
Using only the above description and what you know about the world, \"his character had not improved since he had heard about his expectations\" is definitely correct, incorrect, or inconclusive?", "doc_id": 31, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [189, 70, 210, 155], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: The one thing I sometimes wonder about, um, in civil cases is, uh, whether, especially sort of in, uh, maybe like product liability, or medical malpractice, where there's, um, sort of a very technical decision to be made sometimes B: Yes. A: you know, it's not just a matter um, of, you know, did this guy rip off this guy, and it's just a matter of interpreting a contract, it's sort of a matter of, um, you know, sometimes getting into very technical issues, and I wonder um, if the system works adequately in educating the jurors about, uh, whatever, um, you know, issue is under discussion. B: I don't think that they educate them enough to really know what's going on. Using only the above description and what you know about the world, \"they educate the jurors enough to really know what's going on\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nAt the heart of the universe there is cruelty. We are predators and are preyed upon, every living thing. Did you know that wasps lay their eggs in ladybirds piercing the weak spot in their armour? Using only the above description and what you know about the world, \"wasps lay their eggs in ladybirds\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: All right, well. A: Um, short term, I don't think anything's going to be done about it or probably should be done about it. B: Right. Uh, are you saying you don't think anything should be done in the short term? Using only the above description and what you know about the world, \"anything should be done in the short term\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nB: What am I afraid of? A: Yes. B: Um, I don't know if I'm really afraid of spending too much. I just, uh, don't think that I need them, you know. Using only the above description and what you know about the world, \"she needs them\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: and that rolling kind of, uh, B: Terrain. A: Yeah. is fairly famili-,. The thing that I thought was interesting was that the critics, apparently it's going to win everything. B: Really? 
A: Uh, and I had been told, you know, you wouldn't notice that it was three hours long, and all this, kind of, Using only the above description and what you know about the world, \"it was three hours long\" is definitely correct, incorrect, or inconclusive?", "doc_id": 6, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [184, 118, 6, 188], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: That's what I've heard too. Uh, A: It seems bizarre to me. I don't quite understand it, although I think probably the worst thing that's happening at least the modern world today, is television. Using only the above description and what you know about the world, \"the worst thing that's happening in the modern world is television\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: I don't know how my parents did it. A: Yeah. B: I mean, there were five of us and I don't recall, you know, wanting anything in particular. Uh, but I don't know how my father did it. He worked at a truck line and he just didn't make that kind of money with five children. But we did okay. We had a house and a home and, but now, my wife and I both work and I don't believe we have as much as my parents did. Using only the above description and what you know about the world, \"he and his wife have as much as his parents did\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThen the silence in the Zoo became complete. Woil stared around him and then suddenly with a push of his wings raised himself into the air, turned, and landed ten feet away on the back of a green bench. Creggan could see that he was afraid and that his fear was making him terribly uncertain. Using only the above description and what you know about the world, \"Woil was afraid\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: Right, you know, like In packaging A: Yeah. B: and, uh, you know, just goodness. A: Yeah, I don't think they do the packaging at this plant, Using only the above description and what you know about the world, \"they do the packaging at this plant\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: for the moment, and that's what really is getting me about what George Bush's stand on the budget is right now is that he is saying, I am going to give you this ludicrous little tax cut so that you'll be happy come November, and you'll elect me again B: Uh-huh. Uh-huh. A: and then I'm going to go on and just forget everything that I said B: Uh-huh. A: or you know, it doesn't seem that it's going to make much of a difference. 
Using only the above description and what you know about the world, \"it's going to make much of a difference\" is definitely correct, incorrect, or inconclusive?", "doc_id": 4, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Correct", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [38, 116, 183, 67], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "And what she had said, and went on saying quietly, calmly, efficiently, was that she loved Maggie. She paid attention. At eight Maggie had not known that her grandmother was famous but she had seen that people had something in their manner when they looked at Rachel. Using only the above description and what you know about the world, \"Maggie's grandmother was famous\" is definitely correct, incorrect, or inconclusive? Correct\n###\nPaula could not help herself. It was just the way she was. Others might say they hated her and mean it. Using only the above description and what you know about the world, \"others hated Paula\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. Using only the above description and what you know about the world, \"the Soviet Union itself is a threat still\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nIt is all very well, in these changing times, to adapt one's work to take in duties not traditionally within one's realm. But bantering is of another dimension altogether. For one thing how would one know for sure that at any given moment a response of the bantering sort is truly what is expected? Using only the above description and what you know about the world, \"at any given moment a response of the bantering sort is truly what is expected\" is definitely correct, incorrect, or inconclusive? Inconclusive\n###\nWho knows how many quarrels, false accusations, unnecessary dismissals, how many promising careers cut short can be attributed to a butler's slovenliness at the stage of drawing up the staff plan? Indeed, I can say I am in agreement with those who say that the ability to draw up a good staff plan is the cornerstone of any decent butler's skills. I have myself devised many staff plans over the years and I do not believe I am being unduly boastful if I say that very few ever needed amendment. 
Using only the above description and what you know about the world, \"very few plans ever needed amendment\" is definitely correct, incorrect, or inconclusive?", "doc_id": 18, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Incorrect", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [213, 119, 28, 2], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "A: Um, yeah, I guess that's not an easy solution. there's no easy solution for that. B: Uh-huh. I don't know that there is an easy solution, but if you could find a way to prevent some of it, and I'm not sure what it would be. It would be money better spent than, A: Uh-huh. B: do you know it costs more to keep an inmate on death row than it does to send a kid to Harvard? Using only the above description and what you know about the world, \"it costs more to keep an inmate on death row that it does to send a kid of Harvard\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Well, how do you feel about the immigration laws? B: At, currently, I think they are a little restrictive. Uh, particularly for, uh, certain ethnic groups or from certain countries. Um, I think we should permit, uh, more immigration from eastern Europe, for example, uh, particularly uh, the Jewish, uh, uh, people from Russia. I think we could permit more of them in than we have permitted in the last, uh, several years. And, I think we have, uh, uh, too much restriction uh, on the Orientals also, but, of course, that's just my opinion. A: Yeah, well, I'm not real sure why I got this topic, because I don't think I checked it off on the list because I know very little about the current immigration laws. Using only the above description and what you know about the world, \"he checked the topic off on the list\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nHe had seen something I should have, which was a car turning in from Soho Square and coming up behind me. My right foot hovered over the accelerator pedal and I balanced Armstrong on the clutch. I wasn't as convinced as Malpass that Nevil was out of harm's way. Using only the above description and what you know about the world, \"Nevil was out of harm's way\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThe Paris to Rouen railway was being extended to Le Havre, and the line cut straight through Dr Flaubert's land. Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. Using only the above description and what you know about the world, \"Gustave was shepherded into creative retreat at Croisset by epilepsy\" is definitely correct, incorrect, or inconclusive? Correct\n###\nA: Your turn. B: Okay. Uh, I don't think they should abolish it. 
Using only the above description and what you know about the world, \"they should abolish it\" is definitely correct, incorrect, or inconclusive?", "doc_id": 14, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Inconclusive", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [164, 167, 234, 109], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: All right, well. A: Um, short term, I don't think anything's going to be done about it or probably should be done about it. Using only the above description and what you know about the world, \"something's going to be done about it\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: I got a friend who goes there, by the way. I want to talk to you about that afterward, okay. B: Okay. Uh, I've, the high school I went to uh, was a good one also. And well, I guess you could say one of the problems with the public education system is the disparity between different schools. Using only the above description and what you know about the world, \"one of the problems with the public education system is the disparity between different schools\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: I do not know. I wonder where he gets it? You know, you must, I think TV is bad. Because they, uh, show all sorts of violence on, A: That and I do not think a lot of parents, I mean, I do not know how it is in the Air Force base. But, uh, I just do not think a lot of people, because of the economy, both need to work, you know. I just do not think a lot of parents are that involved any more. Using only the above description and what you know about the world, \"a lot of parents are that involved\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nAnd I was excited by my new importance. Proud and pleased to be playing opposite Frank Donovan who had once stood in for Hayden Coffin. Occasionally perhaps I should notice that he was not the jovial easy-going character I remembered from my humble place in the chorus. Using only the above description and what you know about the world, \"Frank Donovan was not the jovial easy-going character she remembered\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: uh, but it's worked out for my family, to have my cake and eat it too, kind of thing. A: Yeah. Yeah, that's a good deal. Where do you think this is going in the future, I mean, do you think things are going to change, Using only the above description and what you know about the world, \"things are going to change\" is definitely correct, incorrect, or inconclusive?", "doc_id": 52, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Correct", "target": "Inconclusive", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [110, 170, 242, 73], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "``And you're wrong about us. We're not children and I 'd say we're learning the rules pretty quickly.'' You may have noticed I'm not shaking any more! Using only the above description and what you know about the world, \"he's not shaking any more\" is definitely correct, incorrect, or inconclusive? Correct\n###\nB: they did things for people, you know, for their communities, for their, uh, families, for their friends, where now, I'm not sure they really do. A: Yes. Yeah. Well, I think sometimes through groups and organizations, um, uh, when they asked the question I thought, well that sounds wonderful. And then, I wondered if people were unwilling but I think even if you went in with a negative attitude I don't think it would stay negative very long. Using only the above description and what you know about the world, \"that attitude would stay negative very long\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nThe trend toward lower rents may seem surprising given that some communities in New York are bemoaning the loss of favorite local businesses to high rents. But, despite the recent softening, for many of these retailers there's still been too big a jump from the rental rates of the late 1970s, when their leases were signed. Certainly, the recent drop in prices doesn't mean Manhattan comes cheap. Using only the above description and what you know about the world, \"Manhattan comes cheap\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nAt length she decided that there was nothing to be gained by worrying her. Probably there was some quite innocent explanation, which Roger Kenyon would give her when she returned the wallet - if, indeed, it were his. And yet why had his manner changed so abruptly when he learned that the girl whose hat he had rescued was going to live at Sunset Cottage? Using only the above description and what you know about the world, \"the girl whose hat Roger Kenyon had rescued was going to live at Sunset Cottage\" is definitely correct, incorrect, or inconclusive? Correct\n###\n``Who knows? The point is, do we go with it or not?'' Do we assume there is a shipment? Using only the above description and what you know about the world, \"there is a shipment\" is definitely correct, incorrect, or inconclusive?", "doc_id": 1, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Incorrect", "target": "Correct", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [186, 82, 243, 204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "B: Well, you've got, well, any of the big cities you've got the different rival gangs and they're having their little turf wars over their little drug kingdoms and such, A: Uh-huh. 
B: And they get out their little Mac tens, they get out their little uzis and they're going to fight with them. And it doesn't matter what restrictions you put on that type of weapon or a class three firearm. If they want it they'll get it. I don't care if they've got to go down into New Mexico to get it they'll get it and they'll get across the border. Now my position, although, I have absolutely no use for a fully automatic weapon, anyway. A: Uh-huh. B: Since I am a law-abiding citizen and I have never had a felony, if I wanted to buy one, I don't think there should be that big of a restriction on it. Using only the above description and what you know about the world, \"there should be that big of a restriction on it\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nHowever, I will let the gynandrous renegade stay here under one condition. If Penumbra questions either of us, we will not lie. If she suspects Crevecoeur is here and asks we let him go back. Using only the above description and what you know about the world, \"Crevecoeur is here\" is definitely correct, incorrect, or inconclusive? Correct\n###\nThe South Korean government is signing a protocol today establishing formal diplomatic relations with Poland. The two are also signing a trade agreement. South Korean government officials said they don't expect that Seoul can loan money to Warsaw, but it can ``offer experience.'' Using only the above description and what you know about the world, \"Seoul can loan money to Warsaw\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nA: How did Radio Shack work? B: If you go in and buy anything they want your phone number. And I don't think they're going to call me and ask me how it's functioning, Using only the above description and what you know about the world, \"they're going to call him\" is definitely correct, incorrect, or inconclusive? Incorrect\n###\nB: and, you know, they just love kittens. A: Yeah. B: They just are fascinated. A: Oh, yeah. B: So she doesn't know that this is a cat yet. Using only the above description and what you know about the world, \"this is a cat\" is definitely correct, incorrect, or inconclusive?", "doc_id": 51, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "MNLI crowdsource", "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a30a974d063a06539dddec4ec9ed861e918f9392 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e93cdef3f725b08346bb83eecee59b728ea685a71a5a31ecb0b837ef701ba6 +size 56312 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69bf449ef326048f5e5a0e1874a3b02cfe5c5eee --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25076723b87eea3f196aa930dc1da5ee42eb1963640d8932a078dfd63dc420db +size 102063 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..554a029799cdefd636ceb14a56c881fafdf9a923 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e4c8993b1d40953c9446bb83400d6e8bd5ece831ced93f1ef75036425bfcd3 +size 123875 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c4fad532a27b8a9774815c07421a717ad2dd585 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2878ed25b4cfe65bba1cc7b5cb93483aec3d71d37ead0930478c83b28a84469 +size 63931 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..359574cbe38cafff662442ac82adc74eca6d9ab9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae71f03468e292dd477a7b0ae2cbc114794347c14c8b24b2bb874d06a111f761 +size 89654 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee4ecaee1eadc77b12ccc48e9cfb2b0c00f7e562 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:41a86196667de264fbcdf37f8a99e616b809b2dd7e1ea6b49341402b6a489fce +size 114133 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..733b04c078abdd1aff6450486610b0953ecbf115 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dcdbd3a3d9e157a9a14cabea5f398e866902ca2f8e4a31ff6e0d5fac1e02c1 +size 138158 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db8346c1d0cb23f4334d6b5def5cdeef3ef62868 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,56 @@ +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [224, 152, 13, 207], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: Yeah. The radio doesn't really have much news sometimes. The stations I listen to are just mainly music. B: Yeah, I think you pretty much have to listen to all news station to get any news at all. A: Yeah. Do you think that TV is, uh, pretty accurate. \n\nTherefore, \"TV is pretty accurate\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that B: That might be kind of tough, huh. A: It really would, yes, yes, and like I said, my sister's still in it, and I really don't think my mother'd want to be there, either. \n\nTherefore, \"his mother would want to be there\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that It was Alan's idea. He made a sour kind of joke out of it, that they must wait until their wedding night. Carolyn agreed because she could see he meant it although she didn't understand why. \n\nTherefore, \"Alan meant it\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: you know, sometimes I would go over, but you know, it wouldn't hit me in a big way because I knew that, uh, I would have it covered in that respect. A: Right. Right. That's good. I don't think we've gone that far, to pay it you know, in advance before we spend it, \n\nTherefore, \"they've gone that far\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: Yeah. I understood that. you know, I work full time and I have two kids so my spare time usually involves something with the kids. A: Yeah. B: You know, hobbies, I can't really say that we have hobbies. \n\nTherefore, \"they have hobbies\" is guaranteed, possible, or impossible?", "doc_id": 9, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [34, 209, 137, 151], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Jane ate without pausing. Hunger was an unknown experience. She had never imagined it could actually hurt. \n\nTherefore, \"hunger could actually hurt\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: It was just a side benefit. B: Yeah, yeah, because, I'm not big or anything, but I'm not in great shape, But when I worked out, I got in pretty good shape. I didn't build up muscle, though, I just got real good and toned. A: Yeah. B: I don't think women look good with muscles. \n\nTherefore, \"women look good with muscles\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Yeah, that's crazy. B: and then you come here in the Dallas area, um, I don't believe that people should be allowed to carry guns in their vehicles. \n\nTherefore, \"people should be allowed to carry guns in their vehicles\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Well, presumably those who find out such information, if they are doing it, I would prefer to not to be known, and, I mean, you know, the classic, oh, I don't know C I conspiracy theories or whatever, would have such parties trying to do it without your knowledge. So there's,, things that invade that second type of privacy where you do know about them and possibly things that invade that second type of privacy without you knowing about it, and I can't talk about the second one other than to generate paranoia. It's a surmise and, I'd like to think that it's quite low, at least in this country. B: to surmise. It is there. A: I don't think I'd like the KGB monitoring my phone or anything like that. \n\nTherefore, \"he would like the KGB monitoring his phone\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Now the part about where you said the apartment complex puts up signs that says no soliciting, I've even gone so far as to put that, I've got a storm door on the front of the house and I've put, in, I don't know how much clearer it can be, it's a red sign with silver letters saying no soliciting. I guess I should make another one that says religious or otherwise, cause I still get, B: Yeah, yeah, that's true, yeah. No I didn't go that far but, uh, yeah I probably could do the same thing, uh, you know, I don't have a storm door, but I'm sure I could rig up something. But you know I don't think that that would stop people. \n\nTherefore, \"a no soliciting sign would stop people\" is guaranteed, possible, or impossible?", "doc_id": 42, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [213, 69, 43, 71], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: Um, yeah, I guess that's not an easy solution. there's no easy solution for that. B: Uh-huh. I don't know that there is an easy solution, but if you could find a way to prevent some of it, and I'm not sure what it would be. It would be money better spent than, A: Uh-huh. B: do you know it costs more to keep an inmate on death row than it does to send a kid to Harvard? \n\nTherefore, \"it costs more to keep an inmate on death row that it does to send a kid of Harvard\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that I ducked so fast I wasn't sure whether he 'd seen me or not, but it gave me a prickly feeling just to imagine it, so I scuttled for the door and legged it up the spiral stairway three steps at a time, just in case. As I ran, I remember thinking stupid thoughts like. How did he know I was up here looking down? \n\nTherefore, \"he was up there looking down\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``Oh, my poor Folly... We 've been together for five years, Lexy and I - she practically holds that company together. Of course I gave her an ``A''. But that doesn't mean I'm having an affair with her. \n\nTherefore, \"he is having an affair with Lexy\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``I wanted to tell you. But the Bookman asked me to keep our meeting a secret.'' How did you know I 'd met him? \n\nTherefore, \"he had met the Bookman\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Obeying his instruction, I proffered my hand, open palm upwards, towards the animal. The ratbird climbed on and began to preen its fur unconcernedly. Nobody will blame me if I say that in the circumstances I became very uneasy. \n\nTherefore, \"in the circumstances she became very uneasy\" is guaranteed, possible, or impossible?", "doc_id": 34, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [21, 2, 156, 112], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that It seemed impossible that anyone could endure such pain for so long, but at last the doors of the Renault slammed and there was comparative silence. The engine was started up, revving violently as the car was turned round on the narrow road. John could tell that it was being driven back up the hill towards Putna. \n\nTherefore, \"the car was being driven back up the hill towards Putna\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that The Paris to Rouen railway was being extended to Le Havre, and the line cut straight through Dr Flaubert's land. 
Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. \n\nTherefore, \"Gustave was shepherded into creative retreat at Croisset by epilepsy\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: That's fairly interesting. B: I bet that would be, rather interesting. Uh, that's, uh, self improvement, well, that's kind of a hobby but it is self improvement from the standpoint of probably relaxing, uh. A: Yeah, I don't know that I read anything strictly labeled self improvement. \n\nTherefore, \"she reads anything strictly labeled self improvement\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that For a while the notion gripped him, and he prowled the many floors of Hamleys looking for toys. He bought a magic set for Sebastian (although his ideal present for the kid would have been a brand-new name) and a marionette for Louise. He could remember that there was an age for puppets and magic just as he could remember the time that he 'd spent trying to fan a deck of cards or sitting in front of a mirror trying to get the hard consonants down like a real ventriloquist. \n\nTherefore, \"there was an age for puppets and magic\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: uh I picked up a bunch of Craftsman tools from the forties that my wife's father owned when he was alive B: Uh-huh. A: and so I do have a band saw and a router and, uh, things like that out in the garage. But I can't say I use them very often. \n\nTherefore, \"he uses them very often\" is guaranteed, possible, or impossible?", "doc_id": 53, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [133, 145, 109, 141], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: But, uh, if the wind comes basically from the south it can be really bad. A: Uh-huh. B: Uh, the State of Wisconsin, as a matter of fact, uh, started some litigation against Illinois because of the air pollution we were getting. A: Uh-huh. B: Uh, I don't think it's going to go very far, \n\nTherefore, \"it's going to go very far\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. \n\nTherefore, \"she wants to see that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that And I was excited by my new importance. Proud and pleased to be playing opposite Frank Donovan who had once stood in for Hayden Coffin. Occasionally perhaps I should notice that he was not the jovial easy-going character I remembered from my humble place in the chorus. \n\nTherefore, \"Frank Donovan was not the jovial easy-going character she remembered\" is guaranteed, possible, or impossible? 
Guaranteed\n###\nAssume it is true that B: That was kind of a funny movie with, uh, Richard Dreyfuss and Bill Murray. A: Uh-huh. B: That was fun. A: Golly, I don't think that I've ever heard of that movie. \n\nTherefore, \"he has heard of that movie\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Firstly, I didn't know about the SAS soldiers in the British Embassy, and I am very surprised about it. Very surprised indeed, Ambassador. Secondly I do not think it is a good idea to attack a plane with a hundred and seven passengers in it and ``take it apart'' as you say. \n\nTherefore, \"it is a good idea to attack a plane with a hundred and seven passengers in it and 'take it apart'\" is guaranteed, possible, or impossible?", "doc_id": 10, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [20, 32, 205, 195], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Chopra stood unsteadily on his feet. The shapechanger bounded around with excitement. Chopra could tell something had happened. \n\nTherefore, \"something had happened\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that His mother driving the car, so happy, young-looking and fashionably dressed and his father, a big, confident man in a smart suit, smiling and turning round to say something to Simon in the back seat. Marie thought of her own mother with her frumpy clothes and ageing, lined face. No one would have guessed that she was only forty-two. \n\nTherefore, \"Marie's mother was only forty-two\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: So, let's talk about the, uh, wonderful abuses in the State of Pennsylvania of personal property taxes whereby you can purchase something mail order and after the fact, the State of Pennsylvania can find out about it and send you a bill for the sales tax appropriate to that item that you purchased as well as interest and penalties from the time that you bought it. What do you think? Is Pennsylvania kind of out of line there? A: Well, actually, I do n't think they're out of line. \n\nTherefore, \"they're out of line\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: I look at our big green containers, and I say, well, they work fine and I keep mine outside the garage so that I don't have any odors but it's clearly a place where, uh, uh, A: Oh, right. B: it will be interesting to see how well that works and I'm glad the community is doing it. Uh, it's one of those things that kind of has to be forced on people. Uh, I don't know what you saw back, uh, years ago, but for me the thing that strikes me is uh, growing up in rural South Dakota where, hey the farmers brought their eggs to town and the local hatchery would candle them and package them is that, uh, in the fifties, uh, you could say we had the recycling going on then that we should have now. 
Which was all the milk bottles were glass \n\nTherefore, \"they had the recycling going on then that they should have now\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that The assassin's tone and bearing were completely confident. If he noticed that Zukov was now edging further to the side widening the arc of fire he did not appear to be troubled. \n\nTherefore, \"Zukov was edging further to the side\" is guaranteed, possible, or impossible?", "doc_id": 47, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [212, 145, 46, 65], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: So again, it never really penalizes, the company doing the wrong. A: Right. That will, B: They can go right on doing the same old thing they always used to. A: Huh. B: And if they know some practice is wrong, you know, \n\nTherefore, \"some practice is wrong\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. \n\nTherefore, \"she wants to see that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Strasbourg, Vienna, Bucharest, Istanbul, not stopping, not looking back. I saw her tossing newly gauffred curls as the open roadster headed east, away from Ollie... Temporarily I managed to re-erect my jocular facade, but inside I was panicking. He could take her away I thought he could just do that he has such power to hurt me this little furry creature who hasn't even noticed that I 've given up the weed. \n\nTherefore, \"he has given up the weed\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that If there are spirits at work at the time, they come only from yourself, not from the fume of the incense. Why should spirits aid living beings? What arrogance is it that drives people to believe they can have power over them? \n\nTherefore, \"people can have power over spirits\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that There was a group of curious onlookers... Marie felt her legs give way beneath her, she sat down on the edge of the pavement, feet in the gutter, doubled-up, sick and winded as if someone had punched her in the stomach. She lifted up her head and looked again. She had watched scenes like this so often in detective films and police series on television that she could hardly believe that this was real life. 
\n\nTherefore, \"this was real life\" is guaranteed, possible, or impossible?", "doc_id": 45, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [144, 145, 220, 127], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: I am right outside Baltimore. I am less than a mile from the Baltimore line. B: Um. A: And I go to a campus of the University of Maryland that is just, less than a mile from my house. So I'm actually in Baltimore, yeah, you could say I'm in Baltimore. \n\nTherefore, \"he is in Baltimore\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. \n\nTherefore, \"she wants to see that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: So, we're comparable. B: Yeah. A: As a matter of fact, I just paid my Richardson taxes because I live in Richardson and supplemented the Robin Hoods very thoroughly, I think. B: Yeah, I think Yeah, we have got it on the line, don't we. \n\nTherefore, \"they have got it on the line\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Uh, well then you must know a lot more about this than I do. B: Uh, I think, uh, the system right now, you know, you know, is fine. I think it should be by a jury. I don't think the judge should have, I mean he's just there kind of like the referee. A: Uh-huh, Uh-huh. B: Uh, I don't even think that it should be unanimous either. Uh, \n\nTherefore, \"it should be unanimous\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: If, uh, you've some kid uh, who's from a broken family, the schools are supposed to fill that void. It's gotten way beyond uh, you know, teaching classes and maybe providing extracurricular sports activities or something like, oh, were kind of traditionally the school's roles. A: Yeah. Yeah, it's interesting because, uh, we're just having conversation on this, uh, with a couple of people yesterday. And I was expressing my frustrations that, uh, so many problems, I work in a high school, are that kids don't have a degree of self-discipline which may be reflected in society at large. Uh, and you can't expect in a classroom for a particular course an hour a day to counteract, uh, sixteen or seventeen years of influence at home. B: Right. 
A: Um, and, it's seen more so because when you call parents up, many parents won't even recognize that there is a problem \n\nTherefore, \"there is a problem\" is guaranteed, possible, or impossible?", "doc_id": 35, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [83, 45, 131, 158], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that This was a sheer waste of time. He would probably land and then tell them to walk back. When she glanced at him again he looked very grim and she wondered if she should have told Mitch that he might well have a lot of opportunity to photograph Spain - on foot as he walked back to Malaga. \n\nTherefore, \"Mitch might well have a lot of opportunity to photograph Spain\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that What must it be like to be imprisoned here, day after day, month after month? I wonder, does he keep them chained and manacled, thought Fenella, or does he use sorcery? And so utterly immersed was she in this strange blue and green land that was not feeling strange any more that she did not even notice that she was weighing sorcery against steel chains and seriously considering the likely outcome. \n\nTherefore, \"Fenella was weighing sorcery against steel chains\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: I did, too. A: I mean, it was just more for my money. B: Yeah. I didn't think it was too long at all. \n\nTherefore, \"it was too long\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: But, uh, B: Okay. Uh, uh, I've had one or two American cars I think, and they were okay. I had a Pontiac once and I never had a problem with it, but, uh, my mother had a Dodge at one point and I had driven it a few times and I really did not feel that I would buy a Dodge just from, \n\nTherefore, \"she would buy a Dodge\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Okay. So Frank, what, uh, type of, uh, budget do you or your family have? B: Well, uh I don't know that we really have a budget. \n\nTherefore, \"he and his family really have a budget\" is guaranteed, possible, or impossible?", "doc_id": 21, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [108, 63, 22, 137], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that I didn't really like the way the other boys treated him. I was new at the school and still observing, still beginning friendships. Perhaps Alec noticed that I did not ridicule him as the others did. \n\nTherefore, \"he did not ridicule Alec as the others did\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that It's where the bands practise. I can't remember what band Petra's in, but I seen them practise once. They were OK but I didn't think they was brilliant. \n\nTherefore, \"Petra's band was brilliant\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Just when you think you 've got it straight, along comes the Fool with his pig's bladder and whops you on the nose. By the way, I'm no idiot. I could tell Gillian and Stuart weren't thrilled to see me at the airport. \n\nTherefore, \"Gillian and Stuart weren't thrilled to see her at the airport\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Yeah, that's crazy. B: and then you come here in the Dallas area, um, I don't believe that people should be allowed to carry guns in their vehicles. \n\nTherefore, \"people should be allowed to carry guns in their vehicles\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that But what we may not know is just what makes somebody a sucker. What makes people blurt out their credit-card numbers to a caller they 've never heard of? Do they really believe that the number is just for verification and is simply a formality on the road to being a grand-prize winner? \n\nTherefore, \"the number is just for verification and is simply a formality on the road to being a grand-prize winner\" is guaranteed, possible, or impossible?", "doc_id": 24, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [89, 206, 60, 181], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that ``Molly likes having an audience for her tales and it passes the hours for them.'' When Miss Louisa had a second more severe stroke at the end of August, and Miss Ellen another heart attack, both old ladies died within a few days of each other. Their friends could only feel that death was merciful in the circumstances especially with war imminent and that Molly had made the closing months of their lives very happy. \n\nTherefore, \"death was merciful in the circumstances\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: I do too. I believe about ten years ago that we went through a terrible time, but I don't, I believe that they're better now, you know, wh-, B: I think so. 
I don't think they're shoddy \n\nTherefore, \"they're shoddy\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that I'm sorry, I 've put you in an invidious position. If you're being run by Morton, he 'll want to hear all this. It won't do any harm but I 'd rather not give him food for thought because I consider him an idiot and I don't think he's capable of interpreting it correctly. \n\nTherefore, \"Morton is capable of interpreting this food for thought correctly\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: but, uh, I can definitely, uh, see on down the road, you know, where we do have kids and are getting to that age, that's going to be a definite concern. A: Yeah, you talked before, about the school funding. I think there's only going to be one solution to school funding which I don't think will be necessarily the best way \n\nTherefore, \"the one solution to school funding will be necessarily the best way\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Modify the arachnids, said the researchers. Change their bodies and conditions, and you could get fibres like glass, still monofilament, but with logarithmic progressions of possibilities of strength and flexibility, and the ability to resonate light-particles or sound-waves undistorted, scarcely weakened over thousands of miles. Who said the arachnids had to be totally organic? \n\nTherefore, \"arachnids had to be totally organic\" is guaranteed, possible, or impossible?", "doc_id": 46, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [40, 157, 240, 97], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Did he intend everyone in the castle to know he did not want the wife he had married in such a hurry? Did he intend to ignore her completely? Then Isabel saw Ellen's stunned face and realised that her maid at least did not know she had spent the night alone. \n\nTherefore, \"Isabel had spent the night alone\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: They might be, but not at the human factors level. they're, B: Well, I heard it on the news today, I could swear it was IBM. \n\nTherefore, \"it was IBM\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Robert Erwin, president of Biosource, called Plant Genetic's approach ``interesting'' and ``novel,'' and ``complementary rather than competitive.'' ``There is a large market out there hungry for hybrid seeds,'' he said. Mr. Robinson of Delta & Pine, the seed producer in Scott, Miss., said Plant Genetic's success in creating genetically engineered male steriles doesn't automatically mean it would be simple to create hybrids in all crops. \n\nTherefore, \"it would be simple to create hybrids in all crops\" is guaranteed, possible, or impossible? 
Impossible\n###\nAssume it is true that Matthew rode on feeling a little more at peace with himself. He skirted the spruce plantation and supposed that at some point he should tell Sara about it. He could imagine that she might be interested in its money-making propensity at the end of the year. \n\nTherefore, \"Sara might be interested in its money-making propensity at the end of the year\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that She hated to think of his sister lying in hospital waiting for her husband to come to her while all the time he was with Dana. She gripped her hands tightly together. Dana didn't know Berenice was in danger of losing her child. \n\nTherefore, \"Berenice was in danger of losing her child\" is guaranteed, possible, or impossible?", "doc_id": 38, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [107, 84, 228, 204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that There was no answer. Moving carefully, Benny stepped around the edges of the room, and opened the window shutters. Her momentary horror at seeing the unmistakable form of General Etienne was only slightly dulled by the realization that the stiff posture he was in could only mean he was dead. \n\nTherefore, \"General Etienne was dead\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Nicky approached her with the assumption that men are naturally right and it is the role of women to follow their lead. Constance, whose confidence was growing daily, was not prepared to give in to Nicky's wishes merely because of his sex. If she felt he was right then she agreed with him. \n\nTherefore, \"Nicky was right\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that B: so there's only been really one working. A: Uh-huh, same here. Uh-huh. B: And, uh, it works for me but I can't see that it would work for probably the majority of people. \n\nTherefore, \"it would work for probably the majority of people\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: How did Radio Shack work? B: If you go in and buy anything they want your phone number. And I don't think they're going to call me and ask me how it's functioning, \n\nTherefore, \"they're going to call him\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that What had brought Gharr and Ten-huc and Pulvidon to the planet at the same time? Why were all of them so interested in why I was there? And if they somehow suspected that I was picking up something valuable why would any of them try to kill me before the pick-up? 
\n\nTherefore, \"she was picking up something valuable\" is guaranteed, possible, or impossible?", "doc_id": 40, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [17, 147, 119, 203], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that She might have sat all afternoon, nibbling and stuporous, exhausted but not sleepy. But the glazier finally came down from the upper floor, cheerfully announcing that all was now right and tight and he would be on his way. Maggie could tell that he would have liked to stop for a chat that he felt sorry for her left on her own but she lacked either her grandmother's grace or her mother's energy so she did not offer him tea. \n\nTherefore, \"the glazier would have liked to stop for a chat\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: I wouldn't be surprised. A: You know, because they don't want to send them to daycare. B: I doubt if they would say it was too long. \n\nTherefore, \"it was too long\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Well, how do you feel about the immigration laws? B: At, currently, I think they are a little restrictive. Uh, particularly for, uh, certain ethnic groups or from certain countries. Um, I think we should permit, uh, more immigration from eastern Europe, for example, uh, particularly uh, the Jewish, uh, uh, people from Russia. I think we could permit more of them in than we have permitted in the last, uh, several years. And, I think we have, uh, uh, too much restriction uh, on the Orientals also, but, of course, that's just my opinion. A: Yeah, well, I'm not real sure why I got this topic, because I don't think I checked it off on the list because I know very little about the current immigration laws. \n\nTherefore, \"he checked the topic off on the list\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Yeah, you're probably right, two years might be a little too long. B: Yeah, and there will be a lot of rebellion in that and when you get people who have no desire to be there in the first place, I don't think that they're going to be serving anybody. \n\nTherefore, \"they're going to be serving somebody\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that During that pause I realized that Mala had not been offered a seat, nor any food or drink. The Emissary and his people were behaving as if she simply wasn't there. I could see that she was scowling and stiffening into a Mark 2 temper so I gave her an encouraging smile - which raised her as I expected to a Mark 3. 
\n\nTherefore, \"Mala was scowling and stiffening into a Mark 2 temper\" is guaranteed, possible, or impossible?", "doc_id": 19, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [71, 68, 161, 162], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that ``I wanted to tell you. But the Bookman asked me to keep our meeting a secret.'' How did you know I 'd met him? \n\nTherefore, \"he had met the Bookman\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that But the horror of losing was as much to do with money as with pride. Biddy had never let them down, come without fail all through the bad weather, and now was giving Nails an intensive course on her own horse which - in terms of money - was worth another couple of hundred pounds. Yet surely she knew they had no way of paying should she demand it? \n\nTherefore, \"they had no way of paying\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: It's divided, yeah. B: Wow! A: It really is, so we've got our Cowboys here and, uh, I don't think anybody roots differently \n\nTherefore, \"somebody roots differently\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: if you get it on sale, A: Yeah, yeah, so we bought that or we bought the filets, and then the chicken, or turkey nuggets, and I don't think anybody in my house knows the difference, unless you tell them. \n\nTherefore, \"someone in his house knows the difference\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Well, got any problems on Mockingbird with crime or is that a crime free zone there? B: No, I don't think there is any such thing, as a crime free zone any longer. \n\nTherefore, \"there is some such thing as a crime free zone\" is guaranteed, possible, or impossible?", "doc_id": 36, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [183, 62, 3, 139], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. \n\nTherefore, \"the Soviet Union itself is a threat still\" is guaranteed, possible, or impossible? 
Impossible\n###\nAssume it is true that You really don't know anything about me, do you, despite all that wallowing in my mind? As it happens I don't think I'm the right person to lead humanity into the future no. \n\nTherefore, \"she is the right person to lead humanity into the future\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. You could also say he was driven there by the railway. \n\nTherefore, \"Gustave was driven to creative retreat in Croisset by the railway\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: And, uh, I got to stay home with my kids, which I really wanted to do, but now I could not go back and do it. B: Yeah. A: I really couldn't, I don't think I could stay home all the time and do nothing. \n\nTherefore, \"he could stay home all the time and do nothing\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: I do too, so she couldn't possibly turn them out like some of these popular writers, B: Huh-uh. A: but oh, her books are just incredible. I don't think they've ever made a movie, do you? \n\nTherefore, \"they've ever made a movie\" is guaranteed, possible, or impossible?", "doc_id": 12, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [43, 58, 141, 107], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that ``Oh, my poor Folly... We 've been together for five years, Lexy and I - she practically holds that company together. Of course I gave her an ``A''. But that doesn't mean I'm having an affair with her. \n\nTherefore, \"he is having an affair with Lexy\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that But he ended up eating it himself. I was reluctant to kiss my mother, afraid that somehow her weakness and unhappiness would infect me. Naturally I didn't think for a minute that my life and spirit could stimulate her. \n\nTherefore, \"her life and spirit could stimulate her mother\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: That was kind of a funny movie with, uh, Richard Dreyfuss and Bill Murray. A: Uh-huh. B: That was fun. A: Golly, I don't think that I've ever heard of that movie. \n\nTherefore, \"he has heard of that movie\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that There was no answer. Moving carefully, Benny stepped around the edges of the room, and opened the window shutters. Her momentary horror at seeing the unmistakable form of General Etienne was only slightly dulled by the realization that the stiff posture he was in could only mean he was dead. \n\nTherefore, \"General Etienne was dead\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: And she got kind of fearful of being on her own. 
She never really ate very well. It was one thing. She hardly ever took care of herself A: Yeah. B: and she didn't eat. She ate very poor so I think she was, you know, bad, uh, nutrition on top of it. And, uh, she got to the point she didn't want to alone anymore. So, A: So often I think though, elderly people don't realize that their diet is that bad. \n\nTherefore, \"elderly people's diet is that bad\" is guaranteed, possible, or impossible?", "doc_id": 48, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [182, 196, 183, 226], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: I think so, I think, B: I really do. Oh, yeah, it's going to take, uh, you know, the police, I don't think can do it alone, you know. \n\nTherefore, \"the police can do it alone\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: And they go down the line ten years and then on some little technicality they get out and on the streets again doing the same they did before. A: Uh-huh. B: And, you know, that's about the only thing. Like for theft and stuff like that or manslaughter, you know, I don't think they should do that. \n\nTherefore, \"they should do that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. \n\nTherefore, \"the Soviet Union itself is a threat still\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Yeah, it's interesting because, uh, we're just having conversation on this, uh, with a couple of people yesterday. And I was expressing my frustrations that, uh, so many problems, I work in a high school, are that kids don't have a degree of self-discipline which may be reflected in society at large. Uh, and you can't expect in a classroom for a particular course an hour a day to counteract, uh, sixteen or seventeen years of influence at home. B: Right. A: Um, and, it's seen more so because when you call parents up, many parents won't even recognize that there is a problem and they'll say, oh, well, my kid, I've never heard anything about this before. This is the first time there have been problems. and, you wonder, don't these parents know that teachers talk, \n\nTherefore, \"teachers talk\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: And other than that I do not think it should be allowable. I think it should be illegal for them to want to do that. it's kind of the big brother syndrome, I mean, I just, anything like that just kind of scares me. 
A: I tend to view it, even though I don't think I'd work for a company that did that, I sort of want to defend an employer's rights uh, in addition to an individual's rights, \n\nTherefore, \"she would work for a company that did that\" is guaranteed, possible, or impossible?", "doc_id": 16, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [79, 249, 98, 193], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that She didn't know if they had given themselves sufficient time to think things over before they married - that was the kind of question her sister Louise asked. Edward stayed in the Engineers for a bit, then came out and was not very successful in finding a job to suit him. That wasn't his fault and if anyone said that it was Nenna would still feel like poking a hole in them. \n\nTherefore, \"it was Edward's fault\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Under the Racketeer Influenced and Corrupt Organizations law, or RICO, the government has the authority to seek to freeze or seize a defendant's assets before trial. According to individuals familiar with Mr. Antar's case, prosecutors issued their warning this week after one of Mr. Antar's attorneys asked whether legal fees might be subject to seizure. In a letter, prosecutors told Mr. Antar's lawyers that because of the recent Supreme Court rulings, they could expect that any fees collected from Mr. Antar may be seized. \n\nTherefore, \"any fees collected from Mr. Antar may be seized\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``That's good.'' Ruth stood looking at her. Rachaela could imagine Emma would have been all congratulations and the joys of womanhood. \n\nTherefore, \"Emma would have been all congratulations and the joys of womanhood\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: but that is one of my solutions. Uh... B: I know here in Dallas that they have just instituted in the last couple of years, uh, a real long period of time that you can absentee vote before the elections. And I do not think they have seen a really high improvement. \n\nTherefore, \"they have seen a really high improvement\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Mr. Steinberg made a $59.7 million profit on the sale to Disney of his investment in the company in 1984. But lawyers said Mr. Steinberg probably faced much more potential liability because, when he sued Disney during his takeover battle, he filed on behalf of all shareholders. When Disney offered to pay Mr. Steinberg a premium for his shares, the New York investor didn't demand the company also pay a premium to other shareholders. 
\n\nTherefore, \"the company would also pay a premium to other shareholders\" is guaranteed, possible, or impossible?", "doc_id": 26, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Possible", "target": "Guaranteed", "pred_idx": 2, "target_idx": 0, "fewshot_idx": [238, 222, 104, 215], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: Well, I was never there for any sentencing. Uh, I finally got empaneled on one case, uh, on my next to the last day, and, uh, we got into the, uh, jury room to, uh, decide the case, and there was one guy on the jury who announced to everybody that he didn't need to deliberate, because he'd already decided that the guy was, uh, not guilty, and he would never vote for guilty. A: Huh. B: So, uh, they appointed me jury foreman and I, uh, didn't think that, uh, going in without deliberating allowed us to reach a verdict, \n\nTherefore, \"going in without deliberating allowed them to reach a verdict\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: when you've lost something or uh, uh, don't have what other people have that's when you tend to realize, you know, what's out there and you know, what you have and what you don't have. A: Yeah I agree. B: So the original question, do we think they're you know, a security threat? \n\nTherefore, \"they're a security threat\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that ``For such a person, finding a protector might not be so difficult, even in Edinburgh.'' Jean smiled. He might have known that even someone as sensible as Miss van Wiliamsburgh would try to make a play of this sort. \n\nTherefore, \"even someone as sensible as Miss van Williamsburgh would try to make a play of this sort\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: I thought Carter was good too, and that was, yeah, B: Did you? I always liked him, I thought he was great at the time and I just couldn't get over the fact that Reagan beat him. you know, that I just couldn't believe that he got voted out. \n\nTherefore, \"Carter got voted out\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that But there was little chance of discovering who had killed him without help. Kelly decided that she had to talk to Annie, even if there was a risk that she would tell her husband. Bill would have a fit if he knew his apprentice was turning supersleuth. \n\nTherefore, \"Bill's apprentice was turning supersleuth\" is guaranteed, possible, or impossible?", "doc_id": 43, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Possible", "pred_idx": 1, "target_idx": 2, "fewshot_idx": [191, 114, 20, 133], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: You know, back around, you know, in the twenties and thirties when they were growing up, uh, you know, they were all located together, in one small community. A: Right, right. Right. B: And I mean when time went on the family grew and moved away and so forth. And now when they come together it's generally, you know, like say the kids of those people who are not, you know, anywhere near one another and I do not think they feel the closeness that they used to be there. Which is a shame \n\nTherefore, \"they feel the closeness that they used to be there\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that The Deputy Under Secretary could talk of his host's prospects and disappointments, he could learn of the problems of digging out foreign exchange and hard currency in the Third World, the tribulations over the renewal of Residence Permits, the difficulties of keeping reliable servants, but of his own world he must remain silent. The Deputy Under Secretary headed the Secret Intelligence Service of the United Kingdom, and that was not a subject matter for gossip and conversation on a bougainvillaea-fringed veranda as the lights of the fishermen's dug-outs floated inside the coral reef... No bloody way. He was a man who could be honest with himself and in honesty he could say that he was both pleased and relieved to be back at his desk on a grey Monday morning in London. \n\nTherefore, \"the Deputy Under Secretary was both pleased and relieved to be back at his desk on a grey Monday morning in London\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Chopra stood unsteadily on his feet. The shapechanger bounded around with excitement. Chopra could tell something had happened. \n\nTherefore, \"something had happened\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: But, uh, if the wind comes basically from the south it can be really bad. A: Uh-huh. B: Uh, the State of Wisconsin, as a matter of fact, uh, started some litigation against Illinois because of the air pollution we were getting. A: Uh-huh. B: Uh, I don't think it's going to go very far, \n\nTherefore, \"it's going to go very far\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: how'd you like to own a piece of property where your lake is going sour because of acid rain. A: Right. Right. B: It's, uh, really a serious issue for those of us up in this, uh, sector up here. A: um, or do you hypothesize that most of the, uh, smog or air pollution comes from vehicles \n\nTherefore, \"most of the smog or air pollution comes from vehicles\" is guaranteed, possible, or impossible?", "doc_id": 25, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [140, 74, 193, 230], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: nanny, sort of? Uh-huh. Uh-huh. B: and you know, I could envision a society where that would happen and make an interesting, uh, uh, story or whatever. A: Yeah. B: I don't think I have a philosophical problem with that. \n\nTherefore, \"she has a philosophical problem with that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Anna looked at Peter again and said to herself in a guilty whisper, ``Will he become even more difficult?'' She wondered if a stranger could tell that he was difficult, just by looking at him. Would such a person watching Peter now reading the prayers of Rite B in his level pleasant voice notice that resentment lay like his blood just under his skin because the life he had chosen had not turned out as he had expected it to? \n\nTherefore, \"resentment lay just under Peter's skin\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: but that is one of my solutions. Uh... B: I know here in Dallas that they have just instituted in the last couple of years, uh, a real long period of time that you can absentee vote before the elections. And I do not think they have seen a really high improvement. \n\nTherefore, \"they have seen a really high improvement\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: so I watch the fish, you know. Whatever I can do to keep myself occupied. I like to have the TV on, because that usually keeps me, um, more occupied. It kind of takes the time away and I don't realize, that's really the only time I ever watch TV, is when I'm on the bike. and then usually after I'm done riding the bike, just to cool myself down, I usually take a walk, you know, and that just kind of uh, gets me, you know, to where I'm not quite as tired I guess. But it's definitely a task. B: You think so? A: I can't say that I really enjoy it. \n\nTherefore, \"she really enjoys it\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``There's still room for boutique properties,'' says James Barrett, president of MarCor Resorts Inc. Off the Strip, MarCor is building the Rio, a hotel-casino with a Brazilian theme and only 430 rooms -- all of them suites. Despite the proliferation of tourist distractions, Las Vegans haven't forgot that gambling is still what the town is all about. \n\nTherefore, \"gambling is still what the town is all about\" is guaranteed, possible, or impossible?", "doc_id": 22, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [142, 89, 114, 131], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: so it's nice to get away. It's just amazing, how much you miss. B: Yeah, it, Yeah, it, yeah, it really is. I mean, I don't think I ever see the Little Dipper, \n\nTherefore, \"she has seen the Little Dipper\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``Molly likes having an audience for her tales and it passes the hours for them.'' When Miss Louisa had a second more severe stroke at the end of August, and Miss Ellen another heart attack, both old ladies died within a few days of each other. Their friends could only feel that death was merciful in the circumstances especially with war imminent and that Molly had made the closing months of their lives very happy. \n\nTherefore, \"death was merciful in the circumstances\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that The Deputy Under Secretary could talk of his host's prospects and disappointments, he could learn of the problems of digging out foreign exchange and hard currency in the Third World, the tribulations over the renewal of Residence Permits, the difficulties of keeping reliable servants, but of his own world he must remain silent. The Deputy Under Secretary headed the Secret Intelligence Service of the United Kingdom, and that was not a subject matter for gossip and conversation on a bougainvillaea-fringed veranda as the lights of the fishermen's dug-outs floated inside the coral reef... No bloody way. He was a man who could be honest with himself and in honesty he could say that he was both pleased and relieved to be back at his desk on a grey Monday morning in London. \n\nTherefore, \"the Deputy Under Secretary was both pleased and relieved to be back at his desk on a grey Monday morning in London\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: I did, too. A: I mean, it was just more for my money. B: Yeah. I didn't think it was too long at all. \n\nTherefore, \"it was too long\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: So, I don't know I'm looking for a good year. I guess we're always looking for a good year. B: So, obviously though, do you think they're going to do anything in the playoffs to make it to the Super Bowl this year or who do you like to do that this year? A: Uh, no I don't think the Cowboys have got a chance. \n\nTherefore, \"the Cowboys have got a chance\" is guaranteed, possible, or impossible?", "doc_id": 37, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Possible", "pred_idx": 1, "target_idx": 2, "fewshot_idx": [139, 155, 66, 129], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: And, uh, I got to stay home with my kids, which I really wanted to do, but now I could not go back and do it. B: Yeah. A: I really couldn't, I don't think I could stay home all the time and do nothing. \n\nTherefore, \"he could stay home all the time and do nothing\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: What am I afraid of? A: Yes. B: Um, I don't know if I'm really afraid of spending too much. I just, uh, don't think that I need them, you know. \n\nTherefore, \"she needs them\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Why should this topic matter? You talked about everything else as you usually do. Why should I feel Maelmuire is important? \n\nTherefore, \"Maelmuire is important\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: but I found that, uh, it was made of some material which actually ended up rusting uh, after, A: Oh. B: even, despite, you know, diligent washing, it got rusty after about, uh, three weeks of use. And I don't think it was my fault because you know, I had made a point of like drying it off and cleaning it \n\nTherefore, \"it was his fault\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: And yet, uh, I we-, I hope to see employer based, you know, helping out. You know, child, uh, care centers at the place of employment and things like that, that will help out. A: Uh-huh. B: What do you think, do you think we are, setting a trend? \n\nTherefore, \"they are setting a trend\" is guaranteed, possible, or impossible?", "doc_id": 15, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [51, 168, 147, 123], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Ockleton, Morpurgo, Cornelius, Dysart and half a dozen others too drunk to mention. But there was so much coming and going that any one of us could have slipped out, pushed Everett through the window and slipped back again without being noticed. Damn it all we didn't even notice Everett was missing until a porter tripped over him in the quad so anything's theoretically possible. \n\nTherefore, \"Everett was missing\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: They have to for international trade. B: Yeah. A: But, I guess it's easier to switch back and forth than it used to be, uh, because uh, of computers coming into everything. B: Uh-huh. Yeah, I don't think switching back and forth is that big a deal. \n\nTherefore, \"switching back and forth is that big a deal\" is guaranteed, possible, or impossible? 
Impossible\n###\nAssume it is true that B: I wouldn't be surprised. A: You know, because they don't want to send them to daycare. B: I doubt if they would say it was too long. \n\nTherefore, \"it was too long\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: And I don't think that rehabilitation is effective. A: Right. Have to agree with you, and I'm kind of in favor of capital punishment also. I just don't think that it acts much as a deterrent to these people because, uh, you still see them committing the same crimes, \n\nTherefore, \"it acts much as a deterrent to these people\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Whether the relationship had gone beyond friendship Dalgliesh would now never know. She had, apparently, spent little of the money on herself, had been a dependable benefactress of the few eccentric charities of which she approved, had remembered them in her will, but without egregious generosity, and had left the residue of her estate to him without explanation, admonition or peculiar protestations of affection, although he had no doubt that the words ``my dearly beloved nephew'' meant exactly what they said. He had liked her respected her had always been at ease in her company but he had never thought that he really knew her and now he never would. \n\nTherefore, \"Dalgliesh really knew his aunt\" is guaranteed, possible, or impossible?", "doc_id": 5, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [70, 102, 138, 82], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that At the heart of the universe there is cruelty. We are predators and are preyed upon, every living thing. Did you know that wasps lay their eggs in ladybirds piercing the weak spot in their armour? \n\nTherefore, \"wasps lay their eggs in ladybirds\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``Oh, very well,'' he said wearily. He might have known that it was useless to argue with McAllister - her tongue was as long as her will was strong. \n\nTherefore, \"it was useless to argue with McAllister\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: but if they get the little tiny kids saving it now, in five years, when they get bigger, it'll work a little bit more, too. A: Yeah. True. B: Because it's, we've all got to do it right now. I just, I really amazed to find out that, eighty per cent are filled now, in garbage fills. In five years we're supposed to be at max. A: Uh-huh. B: I don't think I can keep my own garbage. \n\nTherefore, \"she can keep her own garbage\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that However, I will let the gynandrous renegade stay here under one condition. If Penumbra questions either of us, we will not lie. If she suspects Crevecoeur is here and asks we let him go back. 
\n\nTherefore, \"Crevecoeur is here\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that And I resent what happened to my flat. A couple of guys think they can stake it out and wait for me, rub their filthy fingers on my clothes, piss in my bathroom, and I'm supposed to ignore it. I know what I said about possessions being like leeches but that don't mean I 'll surrender them to a pair of punks. \n\nTherefore, \"he will surrender his possessions to a pair of punks\" is guaranteed, possible, or impossible?", "doc_id": 44, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [93, 183, 121, 85], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that O listened. As he listened he could smell the man. And he could hear that the song was almost turning into a sob. \n\nTherefore, \"the song was almost turning into a sob\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. \n\nTherefore, \"the Soviet Union itself is a threat still\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: and then once they do get elected, they don't have the power or the authority or the willingness to do those things that they promised, you know, beforehand. B: Right. A: You know, maybe it just wasn't possible at all in the first place, you know, like the no new taxes thing. You know, that's, uh, with the economy going the way it is and everything, that was nearly ridiculous thing to, even try to do. B: Yeah. Yeah. Well, I don't think he's going to have to worry about that next year. \n\nTherefore, \"he's going to have to worry about that next year\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Your honesty shines out of your face, my darling. It isn't your fault that cynical men like myself won't let themselves believe what they see! I just wish you could believe that Eddie's death was an accident and nothing to do with me. \n\nTherefore, \"Eddie's death was an accident and nothing to do with him\" is guaranteed, possible, or impossible? 
Guaranteed\n###\nAssume it is true that A: And you also get a lot of, uh, juries are extremely, uh, and from what I hear, I have some friends who do expert witness testimony and they say that, uh, juries are extremely vulnerable to, uh, sort of emotional pitches, you know, the prosecutor will want to, oh, I don't know show the mugging victim, you know, show the nice person he was and what a family life, and basically get the jury to be very sympathetic with the victim, or, uh, if it's a corporation, that was, uh, you know, harming some individual or something like that, they get very much, well, you know, it's just a big faceless corporation. let's make them pay as much as possible. Things like that. B: Uh-huh. A: So, not, I mean, I'm, the problem is I can't guarantee that a judge would necessarily be much better than a jury, but I'd be real nervous having a jury not at least fully agree on what the settlements would be, things like that. B: Ri-, I don't think the judge should just make the decision alone. \n\nTherefore, \"the judge should just make the decision alone\" is guaranteed, possible, or impossible?", "doc_id": 50, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [145, 229, 91, 4], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: Now see I. A: I'm intrigued by it, but I'm not sure I want to go see it yet. B: Yeah, I don't think I want to see that either. \n\nTherefore, \"she wants to see that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: Well, that's kind of the way I feel about rock and roll sometimes, too, I guess. They don't really, has kind of the same sound over and over, and the other thing I don't like about it is they have a tendency to play the instrumental so loud that you can't understand what the lyrics are A: Um. Right. B: you can't understand what they're saying on some of those songs which probably is just as well on some of them, too. A: Yeah. And I can't say that I like a lot of the very modern, uh, rock and roll, \n\nTherefore, \"she likes a lot of the very modern rock and roll\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that He also liked swimming and cycling. He said that he wrote stories, though he had to admit that he had never got further than the first two pages. Willie meanwhile not only remained silent during these conversations but picked his berries slowly so that they might forget that he was there but he reckoned without Zach. \n\nTherefore, \"Willie was there\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Some of them, like for instance the farm in Connecticut, are quite small. If I like a place I buy it. I guess you could say it's a hobby. \n\nTherefore, \"buying places is a hobby\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: I've never heard that one, that's very nice. 
Oh, so I'm all for the metric system and converting over and I think, I guess, my feeling is the way to do it is to just start giving weights, you know, have a very brief transition period and then just start giving weights and kilometers, er, just as in kilometers and weights and kilograms and everything like that and, uh, just have people start using it rather than having people constantly trying to convert. Remember me getting a package of something that said one pound, this is a package of dates mind you, was, presumably something you weigh fairly precisely, it said one pound and then in parenthesis it said four hundred fifty-four point six grams. B: Right, right. A: And, as near as I could tell, seeing that was basically anti-metric propaganda cause anyone who would say, well look I can either buy a pound of something at four hundred sixty-four point six grams which, of course, they couldn't weigh it out accurately anyway, um, every time I see something like that I think, well, that's an anti-metric argument. B: Yeah. Well, uh, I don't think it could ever happen with a quick transition. \n\nTherefore, \"it could happen with a quick transition\" is guaranteed, possible, or impossible?", "doc_id": 23, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Possible", "target": "Impossible", "pred_idx": 2, "target_idx": 1, "fewshot_idx": [148, 222, 216, 78], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: But, uh, uh, I don't understand, I guess, why the schools seem to have such a high dropout rate in the big cities. B: Uh, well, I don't pretend to understand that either. Uh, but I'm not quite sure that it's the kind of thing that ought to be blamed on the schools. But then, again, I'm not quite sure where the blame ought to be put. Uh, because the dropout rate is, in those areas, w-, it's high in those areas where also there's poverty and crime. And they all seem to go together. And it seems like if you could eliminate one of the parts of that circle, where you have the dropout rate and crime and, you know, general poverty kind of conditions, that things ought to get better. So, uh, the other two a-, they're all three social issues and could be addressed by the government in any ways. And clearly, to me, is a kind of government thing to fix but it's just like, I don't expect them to know which part is best to fix just like I don't know. it's a complicated issue. I still don't think I would blame it directly on the school. \n\nTherefore, \"he would blame it directly on the school\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: when you've lost something or uh, uh, don't have what other people have that's when you tend to realize, you know, what's out there and you know, what you have and what you don't have. A: Yeah I agree. B: So the original question, do we think they're you know, a security threat? \n\nTherefore, \"they're a security threat\" is guaranteed, possible, or impossible? 
Possible\n###\nAssume it is true that A: Well I, uh, when is your next one, uh, scheduled now. B: Well it's like, the last one was my high school graduation the next one was when I graduated from college, so I guess about two more years. A: Yes, well, and do you think you'll have a baby to take back with you. \n\nTherefore, \"speaker B will have a baby to take back with her\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that How do you know? she was going to ask, but his smile was answer enough. If DeVore said there was going to be a vacancy there would be a vacancy. \n\nTherefore, \"there was going to be a vacancy\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that 'Very well, I'll go. But I pick my own men, and if we ever have to fight, you obey my word.'' Jehan did not think that Sidacai was in any position to impose conditions but he sat back in his chair considering. \n\nTherefore, \"Sidacai was in a position to impose conditions\" is guaranteed, possible, or impossible?", "doc_id": 54, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [233, 198, 0, 189], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: That is the reason, I don't play over there. B: Yeah. A: I like the course, but I don't play over there because, they don't, uh, you know don't allow you to pull a cart. B: Right. A: And, I don't think a cart damages the turf. \n\nTherefore, \"a cart damages the turf\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: Uh, I have, uh, I guess a lot of thoughts about the Vietnam War, um, I guess I feel like I was pretty young while it was going on and so there's probably a lot of things I remember and a lot of things that I really didn't have a clue as to what was happening. B: Yeah. A: Um, looking back, like maybe some of the things that I know now, I'm not sure I do believe it was worth the cost in dollars and lives. That was one of the questions that she asked us to think about, because we never went to war. I don't think we were committed to winning it and getting out \n\nTherefore, \"they were committed to winning the Vietnam War and getting out\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that It was a complex language. Not written down but handed down. One might say it was peeled down. \n\nTherefore, \"the language was peeled down\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: The one thing I sometimes wonder about, um, in civil cases is, uh, whether, especially sort of in, uh, maybe like product liability, or medical malpractice, where there's, um, sort of a very technical decision to be made sometimes B: Yes. 
A: you know, it's not just a matter um, of, you know, did this guy rip off this guy, and it's just a matter of interpreting a contract, it's sort of a matter of, um, you know, sometimes getting into very technical issues, and I wonder um, if the system works adequately in educating the jurors about, uh, whatever, um, you know, issue is under discussion. B: I don't think that they educate them enough to really know what's going on. \n\nTherefore, \"they educate the jurors enough to really know what's going on\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: Because too often, there can be extremism that hurts from any direction, regardless of whatever you're arguing or concerned about. A: Yeah. Right. Yeah, I know, you're right, they would lobby that and I see that, and that's why, you know, I'm like, okay, what's my role in this thing,, you know, what's my part, B: Yeah. A: because I don't think the system is going to get fixed. \n\nTherefore, \"the system is going to get fixed\" is guaranteed, possible, or impossible?", "doc_id": 33, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [115, 113, 166, 90], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that I WAS really only a bystander in the tragedy of young Mr and Mrs McLeod. It was not really my business although it could be said that I had known them both - had seen them about - for most of their lives. \n\nTherefore, \"she had known Mr. and Mrs. McLeod for most of their lives\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that I spent just over an hour with Patterson which, I found out later, turned out to be another first. Time is money in the City and few people are worth an hour unless it's over lunch and only then if you're involved in a takeover bid. It was also I learned one of the few occasions anyone at PKB could remember that Patterson had a meeting with his door shut and nobody got fired. \n\nTherefore, \"Patterson had a meeting with his door shut and nobody got fired\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: but at the same time I think it would do them a world of good. B: Yeah. A: But there's a, B: I don't know that you could require everyone yeah, to do it for a whole year, or two years or something like that, \n\nTherefore, \"speaker A could require everyone to do it for a whole year\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Joseph spat and spluttered blood. He had lost the two centre top teeth and with the tip of his tongue he could feel that the two on either side were also loose. \n\nTherefore, \"the two teeth on either side were also loose\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: I suppose so. Let me push the button. B: All right. 
A: Okay, uh, I guess I'm supposed to be all for switching to the metric system, but, uh, I sense that it's not going to happen anytime soon. B: Yeah, I don't think it's going to happen either, \n\nTherefore, \"it's going to happen\" is guaranteed, possible, or impossible?", "doc_id": 32, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [207, 96, 109, 174], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: you know, sometimes I would go over, but you know, it wouldn't hit me in a big way because I knew that, uh, I would have it covered in that respect. A: Right. Right. That's good. I don't think we've gone that far, to pay it you know, in advance before we spend it, \n\nTherefore, \"they've gone that far\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Then it cried. It was another girl. I was a little disappointed but I could only hope that Celia was still a bit hazy from the drugs. \n\nTherefore, \"Celia was still a bit hazy from the drugs\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that And I was excited by my new importance. Proud and pleased to be playing opposite Frank Donovan who had once stood in for Hayden Coffin. Occasionally perhaps I should notice that he was not the jovial easy-going character I remembered from my humble place in the chorus. \n\nTherefore, \"Frank Donovan was not the jovial easy-going character she remembered\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: I think in s-, and it, just would depend upon the circumstances and the extent of the abuse and if another alternative was available. A: Uh-huh. Uh-huh. Um. Uh-huh. You know, now, I wonder what you think about this and, uh, unfortunately, we don't get to do it, but, uh, it used to be a long time ago, I guess in Biblical times when they had punishment, if somebody did something, for example, to your family, then you had the right to administer the punishment. So if somebody killed somebody in your family, then uh, if that person was caught and found guilty, you had the right to, uh, execute that person. And I know that, uh, if somebody had done something to my family, I would feel that I had the right to get revenge on them uh, but, I don't think that's done much anywhere. \n\nTherefore, \"that's done much anywhere\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Then they would awake, terrified and sweating, to find themselves in white starched linen, in a comfortable bed, in peaceful England. And all would be well. It may be said that although he survived it the siege nevertheless had a bad effect on the Collector. 
\n\nTherefore, \"the siege nevertheless had a bad effect on the Collector\" is guaranteed, possible, or impossible?", "doc_id": 30, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Impossible", "pred_idx": 1, "target_idx": 1, "fewshot_idx": [148, 1, 47, 41], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: But, uh, uh, I don't understand, I guess, why the schools seem to have such a high dropout rate in the big cities. B: Uh, well, I don't pretend to understand that either. Uh, but I'm not quite sure that it's the kind of thing that ought to be blamed on the schools. But then, again, I'm not quite sure where the blame ought to be put. Uh, because the dropout rate is, in those areas, w-, it's high in those areas where also there's poverty and crime. And they all seem to go together. And it seems like if you could eliminate one of the parts of that circle, where you have the dropout rate and crime and, you know, general poverty kind of conditions, that things ought to get better. So, uh, the other two a-, they're all three social issues and could be addressed by the government in any ways. And clearly, to me, is a kind of government thing to fix but it's just like, I don't expect them to know which part is best to fix just like I don't know. it's a complicated issue. I still don't think I would blame it directly on the school. \n\nTherefore, \"he would blame it directly on the school\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that It is part of their religion, a religion I do not scoff at as it holds many elements which match our own even though it lacks the truth of ours. At one of their great festivals they have the ritual of driving out the devils from their bodies. First the drummers come on - I may say that no women are allowed to take part in this ritual and the ladies here will perhaps agree with me that they are fortunate in that omission. \n\nTherefore, \"no women are allowed to take part in this ritual\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that It - the tractor, the boys and the bulbs in the earth - knew she had chosen for them and was coming back to them. Of course there was still love, there was healthy, growing love and its name was called Work. She had fallen in love with it so slowly and gently and sweetly that she had never noticed it had happened. \n\nTherefore, \"falling in love had happened\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Most of them young, about his age, stood and talked and drank and laughed. The two girls he had noticed earlier were standing talking to some other girls. Graham hoped they all realised that just because he was standing talking to Slater that didn't mean he was gay too. \n\nTherefore, \"Graham was gay too\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Valence the void-brain, Valence the virtuous valet. 
Why couldn't the figger choose his own portion of titanic anatomy to shaft? Did he think he was helping? \n\nTherefore, \"Valence was helping\" is guaranteed, possible, or impossible?", "doc_id": 0, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [230, 91, 215, 39], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: so I watch the fish, you know. Whatever I can do to keep myself occupied. I like to have the TV on, because that usually keeps me, um, more occupied. It kind of takes the time away and I don't realize, that's really the only time I ever watch TV, is when I'm on the bike. and then usually after I'm done riding the bike, just to cool myself down, I usually take a walk, you know, and that just kind of uh, gets me, you know, to where I'm not quite as tired I guess. But it's definitely a task. B: You think so? A: I can't say that I really enjoy it. \n\nTherefore, \"she really enjoys it\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that He also liked swimming and cycling. He said that he wrote stories, though he had to admit that he had never got further than the first two pages. Willie meanwhile not only remained silent during these conversations but picked his berries slowly so that they might forget that he was there but he reckoned without Zach. \n\nTherefore, \"Willie was there\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: I thought Carter was good too, and that was, yeah, B: Did you? I always liked him, I thought he was great at the time and I just couldn't get over the fact that Reagan beat him. you know, that I just couldn't believe that he got voted out. \n\nTherefore, \"Carter got voted out\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that They 'd seen Miss Lavant on the promenade and about the town, always walking slowly, sometimes with a neat wicker basket. Kate had often thought she was beautiful. She hadn't known she was in love with Dr Greenslade who had a wife already and three children. \n\nTherefore, \"Miss Lavant was in love with Dr Greenslade\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: I'm like, I'll get a job some day and my boss will pay for it, I'll be needed. B: Yeah. A: Because, um, I didn't want to go do it myself because I didn't think I was really going to use it. \n\nTherefore, \"he was really going to use it\" is guaranteed, possible, or impossible?", "doc_id": 11, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [190, 56, 233, 39], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: The kind you have sounds very interesting, though. A: Well, um, they're really sweet dogs. uh, we thought probably our neighborhood in Houston had more of this breed than any other place just because of, um, the family that had them and bred them. \n\nTherefore, \"their neighborhood in Houston had more of this breed than any other place because of the family that had them and bred them\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``It's not your day, is it, dear?'' ``No, but they 've got to be done, and Shirley's making the tea.'' Ianthe had not told her mother that she sometimes had to dust the books in the library. \n\nTherefore, \"Ianthe sometimes had to dust the books in the library\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: That is the reason, I don't play over there. B: Yeah. A: I like the course, but I don't play over there because, they don't, uh, you know don't allow you to pull a cart. B: Right. A: And, I don't think a cart damages the turf. \n\nTherefore, \"a cart damages the turf\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that They 'd seen Miss Lavant on the promenade and about the town, always walking slowly, sometimes with a neat wicker basket. Kate had often thought she was beautiful. She hadn't known she was in love with Dr Greenslade who had a wife already and three children. \n\nTherefore, \"Miss Lavant was in love with Dr Greenslade\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Well, actually, uh, A: I don't think I'm in the, uh, majority in Texas \n\nTherefore, \"she is in the majority in Texas\" is guaranteed, possible, or impossible?", "doc_id": 39, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Possible", "target": "Guaranteed", "pred_idx": 2, "target_idx": 0, "fewshot_idx": [86, 214, 239, 222], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that I can't afford to get bogged down in the weeds. But at least you know she did leave. Maybe a coincidence maybe the two girls talked on the phone decided they 'd both had enough. \n\nTherefore, \"the two girls had both had enough\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that B: So am I. A: Are you, B: You know, I think it's kind of coming back around to that, don't you, \n\nTherefore, \"it's kind of coming back around to that\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: And the tanks came in and, you know, pretty much took care of that. A: Exactly. 
B: And, A: Yeah, uh, that, personally I don't see as Gorbachev as being maybe a threat, and I think he's actually, honestly trying to do some change. B: Uh-huh. A: But I don't believe that he, in this first pass around, you know, being the first one to really turn things around or attempt to is going to be allowed to get away with it either. \n\nTherefore, \"Gorbachev is going to be allowed to get away with doing some change\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: when you've lost something or uh, uh, don't have what other people have that's when you tend to realize, you know, what's out there and you know, what you have and what you don't have. A: Yeah I agree. B: So the original question, do we think they're you know, a security threat? \n\nTherefore, \"they're a security threat\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that The other set being in the pocket of Giovanna's overall from which she now drew them and held them up making it clear that they would be relinquished only upon her death and then only into the hands of Signor Kettering. What had occurred was quite contrary to the wishes of the padrone who would be outraged if he ever got to hear of it. Despite this disastrous beginning however Giovanna would be there in the morning her own family circumstances permitting and she would be much obliged if the Signora would make sure that her children were up and dressed and the breakfast eaten so that she could see that the house was returned to something like the order which Signor Kettering expected of it. \n\nTherefore, \"the house was returned to something like the order which Signor Kettering expected of it\" is guaranteed, possible, or impossible?", "doc_id": 20, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [17, 123, 85, 11], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that She might have sat all afternoon, nibbling and stuporous, exhausted but not sleepy. But the glazier finally came down from the upper floor, cheerfully announcing that all was now right and tight and he would be on his way. Maggie could tell that he would have liked to stop for a chat that he felt sorry for her left on her own but she lacked either her grandmother's grace or her mother's energy so she did not offer him tea. \n\nTherefore, \"the glazier would have liked to stop for a chat\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: And I don't think that rehabilitation is effective. A: Right. Have to agree with you, and I'm kind of in favor of capital punishment also. I just don't think that it acts much as a deterrent to these people because, uh, you still see them committing the same crimes, \n\nTherefore, \"it acts much as a deterrent to these people\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Your honesty shines out of your face, my darling. 
It isn't your fault that cynical men like myself won't let themselves believe what they see! I just wish you could believe that Eddie's death was an accident and nothing to do with me. \n\nTherefore, \"Eddie's death was an accident and nothing to do with him\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``Look, lady, give me a break. I just deliver the stuff, I don't interview it for the Sunday papers.'' He waved the paper at her and even at a distance she could see that it said very little. \n\nTherefore, \"the paper said very little\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Really. If they were to take half of what they spend on that and put it on some economic, you know, intergovernmental or inter United States like programs that one really might. B: Yeah. A: I believe in paying my share, and I don't mind, uh, paying for some of these fringe benefits that people are entitled to. But I just sometimes feel like I'm being used. But, uh, again I don't think we'll be able to do anything about it, \n\nTherefore, \"they'll be able to do anything about it\" is guaranteed, possible, or impossible?", "doc_id": 17, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [126, 230, 59, 236], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: Yeah, they did. They put a lot of pressure on him from the outside and from the inside. Uh, it's funny watching them play, he's probably like a lot of quarterbacks, uh, when the pressure is really on when it's down to the last few minutes of the game for the season is when the guys seem to really do their best. B: Uh-huh. A: And I haven't quite figured that out, if they figure they have got it won or if there's no real hurry because the first three quarters or, uh, uh, if something happens that that adrenalin starts flowing. \n\nTherefore, \"they have got it won\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that A: so I watch the fish, you know. Whatever I can do to keep myself occupied. I like to have the TV on, because that usually keeps me, um, more occupied. It kind of takes the time away and I don't realize, that's really the only time I ever watch TV, is when I'm on the bike. and then usually after I'm done riding the bike, just to cool myself down, I usually take a walk, you know, and that just kind of uh, gets me, you know, to where I'm not quite as tired I guess. But it's definitely a task. B: You think so? A: I can't say that I really enjoy it. \n\nTherefore, \"she really enjoys it\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``Ely,'' I said (that was her name and the first time I 'd ever used it), ``I want to be free.'' She looked stunned. I don't think she 'd considered this. \n\nTherefore, \"Ely had considered him wanting to be free\" is guaranteed, possible, or impossible? 
Impossible\n###\nAssume it is true that A: Yeah, I think that's what aggravates a lot of people, is somebody does get a life sentence in place of the death penalty, and they wind up back on the streets after five years or six years or like the kid on the news tonight out in Mesquite who was out in six months.. B: Uh-huh. Yeah, it's just our criminal system is just so, I guess, overloaded, but the problem is not so much with the prison system, you know, I mean, because the cops are out there doing their job enforcing the laws, and the prison system are just, you know, they're trying to cope with them, but, you know, the thing about capital punishment I, you know, a lot of people don't think it would be a deterrent, uh, to future crime, \n\nTherefore, \"capital punishment would be a deterrent to future crimes\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Louise was equally anxious to see this man who had had the power to persuade her niece to go against her upbringing and character and behave so recklessly after such a brief acquaintance. Waiting in the airport she was suddenly aware of Nora striding towards her. She was impressed by how elegant she looked and could tell by her cousin's walk that Nora also felt that she was looking good. \n\nTherefore, \"that Nora also felt that she was looking good\" is guaranteed, possible, or impossible?", "doc_id": 41, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Possible", "target": "Impossible", "pred_idx": 2, "target_idx": 1, "fewshot_idx": [18, 112, 243, 33], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Jim waited. He waited a long time, and when the young doctor finally came out, it was almost dark. Jim could nonetheless tell by his anxious face that something was wrong. \n\nTherefore, \"something was wrong\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that For a while the notion gripped him, and he prowled the many floors of Hamleys looking for toys. He bought a magic set for Sebastian (although his ideal present for the kid would have been a brand-new name) and a marionette for Louise. He could remember that there was an age for puppets and magic just as he could remember the time that he 'd spent trying to fan a deck of cards or sitting in front of a mirror trying to get the hard consonants down like a real ventriloquist. \n\nTherefore, \"there was an age for puppets and magic\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that The South Korean government is signing a protocol today establishing formal diplomatic relations with Poland. The two are also signing a trade agreement. South Korean government officials said they don't expect that Seoul can loan money to Warsaw, but it can ``offer experience.'' \n\nTherefore, \"Seoul can loan money to Warsaw\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``I hope you are settling down and the cat is well.'' This was a lie. 
She did not hope the cat was well. \n\nTherefore, \"the cat was well\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that A: so I don't know if I wasn't drug tested based on that or because the man who hired me didn't request the drug test, because I know that my company does drug testing on occasion. B: Right. Well, for instance, does the company you worked for before have the right or do they have the ability to say, hey, we've already drug tested her and she came up negative. A: Well, no, I don't think they can force another company to not drug test me just by saying that I didn't, I mean, \n\nTherefore, \"they can force another company to not drug test her\" is guaranteed, possible, or impossible?", "doc_id": 3, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Possible", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [231, 49, 26, 152], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: No, it was, I didn't like the way it ended. A: I know, well the only reason I know why it ended is on Arsenio Hall one night, Christopher Reeves told, that, you know, B: Uh-huh. A: I can't believe they killed them. \n\nTherefore, \"they killed them\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``His name is Matthew Blake,'' Mandy informed Charity as they descended the steps from their cabin on to the paved pathway that led to the lodge. Thankfully she hadn't even noticed that Charity had changed from the blue wrap-around skirt and was now wearing red shorts with her white silk blouse. \n\nTherefore, \"Charity had changed from the blue wrap-around skirt\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that I should dearly have liked to know whether they were Europeans or Americans, but I couldn't hear the accents. They appeared to be arguing. I hoped the white men weren't telling him to eliminate all witnesses because I don't believe it would have needed much persuasion. \n\nTherefore, \"eliminating all witnesses would have needed much persuasion\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: That might be kind of tough, huh. A: It really would, yes, yes, and like I said, my sister's still in it, and I really don't think my mother'd want to be there, either. \n\nTherefore, \"his mother would want to be there\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: And I haven't quite figured that out, if they figure they have got it won or if there's no real hurry because the first three quarters or, uh, uh, if something happens that that adrenalin starts flowing. They say, hey, we got to do something now. And then start playing the game the way the game should be played toward the last few minutes. B: Yeah. A: So, I don't know I'm looking for a good year. I guess we're always looking for a good year. 
B: So, obviously though, do you think they're going to do anything in the playoffs to make it to the Super Bowl this year \n\nTherefore, \"they're going to do anything in the playoffs to make it to the Super Bowl this year\" is guaranteed, possible, or impossible?", "doc_id": 8, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [14, 75, 15, 8], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that And why bother to write anyway? What was there to say? Mary had some vague idea that Adam's parents might suspect he was down here and come to see him. \n\nTherefore, \"Adam was down here\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Jean was tough and liked to drink. She would endure for a long while yet. But what would she do when she realized that with things as they were she was on a life sentence not just a temporary suspension of essential pleasure? \n\nTherefore, \"Jean was on a life sentence\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``And you're not having this dress,'' Nora said, bending down to look at the price tag. ``It's two and a half guineas!'' she hissed at Louise who could tell that she was genuinely appalled. \n\nTherefore, \"Nora was genuinely appalled\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Like now. The Community in Knockglen would defend Eve vociferously. Even some of the Sisters here in Dublin might see that the girl had a point. \n\nTherefore, \"the girl had a point\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that He left his own number, calling himself Alain - the name of her Malaysian-French ``business manager''. The next day Gina rang. She had obviously not noticed that it was her own number. \n\nTherefore, \"it was Gina's own number\" is guaranteed, possible, or impossible?", "doc_id": 29, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [31, 99, 45, 21], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Nora calculated that there must be lots of single men up there so she decided it was ideal for the ``manhunt'', as we called it, even though the train fare was a serious consideration. I remember we went up to Euston together one Saturday morning, very excited, to buy the ticket in advance. 
It was a secret - if my parents had known she was going away alone they would have soon put the kybosh on it. \n\nTherefore, \"Nora was going away alone\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Meh ' Lindi did not develop the lower set of arms nor the bony, sinuous tail. Too much to expect a new pair of arms to grow out of her ribs, or her coccyx to elongate so enormously. Nor could Jaq imagine that she could attain the full strength of a purestrain Stealer - though her own strength was formidable even when unenhanced. \n\nTherefore, \"Meh ' Lindi could attain the full strength of a purestrain Stealer\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that What must it be like to be imprisoned here, day after day, month after month? I wonder, does he keep them chained and manacled, thought Fenella, or does he use sorcery? And so utterly immersed was she in this strange blue and green land that was not feeling strange any more that she did not even notice that she was weighing sorcery against steel chains and seriously considering the likely outcome. \n\nTherefore, \"Fenella was weighing sorcery against steel chains\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that It seemed impossible that anyone could endure such pain for so long, but at last the doors of the Renault slammed and there was comparative silence. The engine was started up, revving violently as the car was turned round on the narrow road. John could tell that it was being driven back up the hill towards Putna. \n\nTherefore, \"the car was being driven back up the hill towards Putna\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that And I don't want to have to lie to them. The kidnappers have given us until October the eleventh to deliver the document and I haven't despaired of finding it before then. But if the police learn I 've been to America they 'll ask why. \n\nTherefore, \"he's been to America\" is guaranteed, possible, or impossible?", "doc_id": 13, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [49, 88, 236, 174], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that ``His name is Matthew Blake,'' Mandy informed Charity as they descended the steps from their cabin on to the paved pathway that led to the lodge. Thankfully she hadn't even noticed that Charity had changed from the blue wrap-around skirt and was now wearing red shorts with her white silk blouse. \n\nTherefore, \"Charity had changed from the blue wrap-around skirt\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that She longed for a weapon, for even a hairpin, and knowing that she did not have one, she knew too that she was totally defenceless, unarmed and alone. She could feel the great flight of the dragon and sensed that she was high in air and travelling fast towards the sunset. 
She could feel the great muscles of the dragon's wings send ripplings down the stomach walls and she gave herself over to death. \n\nTherefore, \"the great muscles of the dragon's wings sent ripplings down the stomach walls\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Yeah, I think that's what aggravates a lot of people, is somebody does get a life sentence in place of the death penalty, and they wind up back on the streets after five years or six years or like the kid on the news tonight out in Mesquite who was out in six months.. B: Uh-huh. Yeah, it's just our criminal system is just so, I guess, overloaded, but the problem is not so much with the prison system, you know, I mean, because the cops are out there doing their job enforcing the laws, and the prison system are just, you know, they're trying to cope with them, but, you know, the thing about capital punishment I, you know, a lot of people don't think it would be a deterrent, uh, to future crime, \n\nTherefore, \"capital punishment would be a deterrent to future crimes\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: I think in s-, and it, just would depend upon the circumstances and the extent of the abuse and if another alternative was available. A: Uh-huh. Uh-huh. Um. Uh-huh. You know, now, I wonder what you think about this and, uh, unfortunately, we don't get to do it, but, uh, it used to be a long time ago, I guess in Biblical times when they had punishment, if somebody did something, for example, to your family, then you had the right to administer the punishment. So if somebody killed somebody in your family, then uh, if that person was caught and found guilty, you had the right to, uh, execute that person. And I know that, uh, if somebody had done something to my family, I would feel that I had the right to get revenge on them uh, but, I don't think that's done much anywhere. \n\nTherefore, \"that's done much anywhere\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``But my father always taught me never to be afraid of pointing out the obvious. I'm sure you have noticed the implication of the letter, that the writer has in fact observed Jenny undressing for bed?'' I just wondered if you also knew as I'm sure you do that her bedroom's at the rear of the house? \n\nTherefore, \"Jenny's bedroom's at the rear of the house\" is guaranteed, possible, or impossible?", "doc_id": 2, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [31, 54, 39, 125], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Nora calculated that there must be lots of single men up there so she decided it was ideal for the ``manhunt'', as we called it, even though the train fare was a serious consideration. I remember we went up to Euston together one Saturday morning, very excited, to buy the ticket in advance. 
It was a secret - if my parents had known she was going away alone they would have soon put the kybosh on it. \n\nTherefore, \"Nora was going away alone\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Her priggishness. I admire it. I know she does wrong things she tries to organize other people's lives she can't see Mr Knightley is a man in a million. \n\nTherefore, \"Mr Knightley is a man in a million\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that They 'd seen Miss Lavant on the promenade and about the town, always walking slowly, sometimes with a neat wicker basket. Kate had often thought she was beautiful. She hadn't known she was in love with Dr Greenslade who had a wife already and three children. \n\nTherefore, \"Miss Lavant was in love with Dr Greenslade\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: And I've worked in the hospital for fifteen years and I've taken care of a few AIDS patients. A: Uh-huh. B: Uh, when they asked us did we want to, uh, keep it the same or, uh, spend more, spend less, uh, I think right now what they're spending is adequate. Uh, for my personal opinion. Uh, because I think it's something that's going to take them a while to come up with a, uh, vaccine for. A: Yeah. Uh-huh. Uh-huh. B: I don't think it's going to be that easy to come up with \n\nTherefore, \"it is going to be that easy to come up with\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: You never see them on nationally basketball. A: You know, that's true. I don't think I've ever seen them nationally on basketball. \n\nTherefore, \"he has seen them nationally on basketball\" is guaranteed, possible, or impossible?", "doc_id": 28, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [86, 29, 170, 110], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that I can't afford to get bogged down in the weeds. But at least you know she did leave. Maybe a coincidence maybe the two girls talked on the phone decided they 'd both had enough. \n\nTherefore, \"the two girls had both had enough\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that Jed wondered. He 'd scarcely set eyes on him since the night they 'd had dinner together at the house in Westwood. Nobody had mentioned him either and Jed didn't feel he should ask. \n\nTherefore, \"Jed should ask\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: they did things for people, you know, for their communities, for their, uh, families, for their friends, where now, I'm not sure they really do. A: Yes. Yeah. Well, I think sometimes through groups and organizations, um, uh, when they asked the question I thought, well that sounds wonderful. 
And then, I wondered if people were unwilling but I think even if you went in with a negative attitude I don't think it would stay negative very long. \n\nTherefore, \"that attitude would stay negative very long\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``And you're wrong about us. We're not children and I 'd say we're learning the rules pretty quickly.'' You may have noticed I'm not shaking any more! \n\nTherefore, \"he's not shaking any more\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Yes, um, I think that Plano has really done a fantastic job. I mean, at least their plans are good. Um, however, I was, maybe you saw in the paper this morning that, um, they've had some problems with, the recycling on plastic, \n\nTherefore, \"they've had some problem with the recycling on plastic\" is guaranteed, possible, or impossible?", "doc_id": 49, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [55, 87, 69, 183], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that Only she herself knew the evil thoughts she had and how effortlessly they could be translated into action. ``I 'll make a cup of tea.'' No she would not tell Peter that the person he loved most in the world was dead. \n\nTherefore, \"the person Peter loved most in the world was dead\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Country churches were never locked. You could wander in at any time. Perhaps Cromwell when he passed also found the door of Coldingham Priory locked and decided that he would get in anyway even if it meant removing a whole wall in order to do so. \n\nTherefore, \"Cromwell would get in anyway\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that I ducked so fast I wasn't sure whether he 'd seen me or not, but it gave me a prickly feeling just to imagine it, so I scuttled for the door and legged it up the spiral stairway three steps at a time, just in case. As I ran, I remember thinking stupid thoughts like. How did he know I was up here looking down? \n\nTherefore, \"he was up there looking down\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. \n\nTherefore, \"the Soviet Union itself is a threat still\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: No, I don't either. 
B: Uh, I mean it's, you know it, A: I don't think it's going to change very much \n\nTherefore, \"it's going to change very much\" is guaranteed, possible, or impossible?", "doc_id": 27, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [137, 158, 20, 11], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: Yeah, that's crazy. B: and then you come here in the Dallas area, um, I don't believe that people should be allowed to carry guns in their vehicles. \n\nTherefore, \"people should be allowed to carry guns in their vehicles\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: But, uh, B: Okay. Uh, uh, I've had one or two American cars I think, and they were okay. I had a Pontiac once and I never had a problem with it, but, uh, my mother had a Dodge at one point and I had driven it a few times and I really did not feel that I would buy a Dodge just from, \n\nTherefore, \"she would buy a Dodge\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Chopra stood unsteadily on his feet. The shapechanger bounded around with excitement. Chopra could tell something had happened. \n\nTherefore, \"something had happened\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``Look, lady, give me a break. I just deliver the stuff, I don't interview it for the Sunday papers.'' He waved the paper at her and even at a distance she could see that it said very little. \n\nTherefore, \"the paper said very little\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that The lunch trade had mostly disappeared so he wasn't hard to spot. He was at a window table but he was ignoring the river, being deep in conversation with a middle-aged man wearing a suit and a short sheepskin car coat with matching brown suede shoes. Even from this distance you could guess the guy's tailor was based in Dublin. \n\nTherefore, \"the guy's tailor was based in Dublin\" is guaranteed, possible, or impossible?", "doc_id": 55, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [199, 140, 96, 169], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: um, they try to encourage you to follow a specific curriculum, although you don't have to. A: Uh-huh. B: And then if you have particular religious beliefs they're kind of monitored. 
You know, they will allow you to, I can't think of any examples but certain religious groups don't want their children in public schools because the influence. And maybe they were a group of Mennonites or something like that. A: Uh-huh. B: I don't think they're were in this area \n\nTherefore, \"they were in this area\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: nanny, sort of? Uh-huh. Uh-huh. B: and you know, I could envision a society where that would happen and make an interesting, uh, uh, story or whatever. A: Yeah. B: I don't think I have a philosophical problem with that. \n\nTherefore, \"she has a philosophical problem with that\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Then it cried. It was another girl. I was a little disappointed but I could only hope that Celia was still a bit hazy from the drugs. \n\nTherefore, \"Celia was still a bit hazy from the drugs\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that A: And now it's election time again so they're trying to lower them. B: Oh. A: So they're just talk about lowering them but they never do, they just keep raising them. B: I've never seen taxes really go down. \n\nTherefore, \"taxes would really go down\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that It grew bigger with incredible speed, she was whizzing towards it. She must slow down or she 'd miss it. She took her foot off the accelerator and put it on the brake and as the car slowed she could see now that it was a child a toddler with a red woolly hat on. \n\nTherefore, \"it was a child\" is guaranteed, possible, or impossible?", "doc_id": 7, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Guaranteed", "pred_idx": 0, "target_idx": 0, "fewshot_idx": [104, 188, 43, 178], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that ``For such a person, finding a protector might not be so difficult, even in Edinburgh.'' Jean smiled. He might have known that even someone as sensible as Miss van Wiliamsburgh would try to make a play of this sort. \n\nTherefore, \"even someone as sensible as Miss van Williamsburgh would try to make a play of this sort\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: Right, you know, like In packaging A: Yeah. B: and, uh, you know, just goodness. A: Yeah, I don't think they do the packaging at this plant, \n\nTherefore, \"they do the packaging at this plant\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that ``Oh, my poor Folly... We 've been together for five years, Lexy and I - she practically holds that company together. Of course I gave her an ``A''. But that doesn't mean I'm having an affair with her. \n\nTherefore, \"he is having an affair with Lexy\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: Boy that's scary, isn't it. 
B: Oh, can you imagine, because it happens in the middle of the night, so you know, these parents didn't know the kid was gone until the kid is knocking on the door screaming, let me in. \n\nTherefore, \"the kid was gone\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Biddy was right. In London, I did some serious thinking. I could see that my character had not improved since I had heard about my expectations. \n\nTherefore, \"his character had not improved since he had heard about his expectations\" is guaranteed, possible, or impossible?", "doc_id": 31, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [189, 70, 210, 155], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: The one thing I sometimes wonder about, um, in civil cases is, uh, whether, especially sort of in, uh, maybe like product liability, or medical malpractice, where there's, um, sort of a very technical decision to be made sometimes B: Yes. A: you know, it's not just a matter um, of, you know, did this guy rip off this guy, and it's just a matter of interpreting a contract, it's sort of a matter of, um, you know, sometimes getting into very technical issues, and I wonder um, if the system works adequately in educating the jurors about, uh, whatever, um, you know, issue is under discussion. B: I don't think that they educate them enough to really know what's going on. \n\nTherefore, \"they educate the jurors enough to really know what's going on\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that At the heart of the universe there is cruelty. We are predators and are preyed upon, every living thing. Did you know that wasps lay their eggs in ladybirds piercing the weak spot in their armour? \n\nTherefore, \"wasps lay their eggs in ladybirds\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: All right, well. A: Um, short term, I don't think anything's going to be done about it or probably should be done about it. B: Right. Uh, are you saying you don't think anything should be done in the short term? \n\nTherefore, \"anything should be done in the short term\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that B: What am I afraid of? A: Yes. B: Um, I don't know if I'm really afraid of spending too much. I just, uh, don't think that I need them, you know. \n\nTherefore, \"she needs them\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: and that rolling kind of, uh, B: Terrain. A: Yeah. is fairly famili-,. The thing that I thought was interesting was that the critics, apparently it's going to win everything. B: Really? 
A: Uh, and I had been told, you know, you wouldn't notice that it was three hours long, and all this, kind of, \n\nTherefore, \"it was three hours long\" is guaranteed, possible, or impossible?", "doc_id": 6, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [184, 118, 6, 188], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: That's what I've heard too. Uh, A: It seems bizarre to me. I don't quite understand it, although I think probably the worst thing that's happening at least the modern world today, is television. \n\nTherefore, \"the worst thing that's happening in the modern world is television\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: I don't know how my parents did it. A: Yeah. B: I mean, there were five of us and I don't recall, you know, wanting anything in particular. Uh, but I don't know how my father did it. He worked at a truck line and he just didn't make that kind of money with five children. But we did okay. We had a house and a home and, but now, my wife and I both work and I don't believe we have as much as my parents did. \n\nTherefore, \"he and his wife have as much as his parents did\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that Then the silence in the Zoo became complete. Woil stared around him and then suddenly with a push of his wings raised himself into the air, turned, and landed ten feet away on the back of a green bench. Creggan could see that he was afraid and that his fear was making him terribly uncertain. \n\nTherefore, \"Woil was afraid\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: Right, you know, like In packaging A: Yeah. B: and, uh, you know, just goodness. A: Yeah, I don't think they do the packaging at this plant, \n\nTherefore, \"they do the packaging at this plant\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: for the moment, and that's what really is getting me about what George Bush's stand on the budget is right now is that he is saying, I am going to give you this ludicrous little tax cut so that you'll be happy come November, and you'll elect me again B: Uh-huh. Uh-huh. A: and then I'm going to go on and just forget everything that I said B: Uh-huh. A: or you know, it doesn't seem that it's going to make much of a difference. \n\nTherefore, \"it's going to make much of a difference\" is guaranteed, possible, or impossible?", "doc_id": 4, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Possible", "target": "Guaranteed", "pred_idx": 2, "target_idx": 0, "fewshot_idx": [38, 116, 183, 67], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that And what she had said, and went on saying quietly, calmly, efficiently, was that she loved Maggie. She paid attention. At eight Maggie had not known that her grandmother was famous but she had seen that people had something in their manner when they looked at Rachel. \n\nTherefore, \"Maggie's grandmother was famous\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that Paula could not help herself. It was just the way she was. Others might say they hated her and mean it. \n\nTherefore, \"others hated Paula\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: How do you feel about that. B: I don't really, I more, I don't know about the government as much as, uh, the people, uh, I wouldn't consider to be a threat at all and I really don't feel much like the Soviet Union itself is a threat anymore. \n\nTherefore, \"the Soviet Union itself is a threat still\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that It is all very well, in these changing times, to adapt one's work to take in duties not traditionally within one's realm. But bantering is of another dimension altogether. For one thing how would one know for sure that at any given moment a response of the bantering sort is truly what is expected? \n\nTherefore, \"at any given moment a response of the bantering sort is truly what is expected\" is guaranteed, possible, or impossible? Possible\n###\nAssume it is true that Who knows how many quarrels, false accusations, unnecessary dismissals, how many promising careers cut short can be attributed to a butler's slovenliness at the stage of drawing up the staff plan? Indeed, I can say I am in agreement with those who say that the ability to draw up a good staff plan is the cornerstone of any decent butler's skills. I have myself devised many staff plans over the years and I do not believe I am being unduly boastful if I say that very few ever needed amendment. \n\nTherefore, \"very few plans ever needed amendment\" is guaranteed, possible, or impossible?", "doc_id": 18, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Impossible", "pred_idx": 0, "target_idx": 1, "fewshot_idx": [213, 119, 28, 2], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that A: Um, yeah, I guess that's not an easy solution. there's no easy solution for that. B: Uh-huh. I don't know that there is an easy solution, but if you could find a way to prevent some of it, and I'm not sure what it would be. It would be money better spent than, A: Uh-huh. B: do you know it costs more to keep an inmate on death row than it does to send a kid to Harvard? 
\n\nTherefore, \"it costs more to keep an inmate on death row that it does to send a kid of Harvard\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Well, how do you feel about the immigration laws? B: At, currently, I think they are a little restrictive. Uh, particularly for, uh, certain ethnic groups or from certain countries. Um, I think we should permit, uh, more immigration from eastern Europe, for example, uh, particularly uh, the Jewish, uh, uh, people from Russia. I think we could permit more of them in than we have permitted in the last, uh, several years. And, I think we have, uh, uh, too much restriction uh, on the Orientals also, but, of course, that's just my opinion. A: Yeah, well, I'm not real sure why I got this topic, because I don't think I checked it off on the list because I know very little about the current immigration laws. \n\nTherefore, \"he checked the topic off on the list\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that He had seen something I should have, which was a car turning in from Soho Square and coming up behind me. My right foot hovered over the accelerator pedal and I balanced Armstrong on the clutch. I wasn't as convinced as Malpass that Nevil was out of harm's way. \n\nTherefore, \"Nevil was out of harm's way\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that The Paris to Rouen railway was being extended to Le Havre, and the line cut straight through Dr Flaubert's land. Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy. \n\nTherefore, \"Gustave was shepherded into creative retreat at Croisset by epilepsy\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that A: Your turn. B: Okay. Uh, I don't think they should abolish it. \n\nTherefore, \"they should abolish it\" is guaranteed, possible, or impossible?", "doc_id": 14, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Possible", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [164, 167, 234, 109], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: All right, well. A: Um, short term, I don't think anything's going to be done about it or probably should be done about it. \n\nTherefore, \"something's going to be done about it\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: I got a friend who goes there, by the way. I want to talk to you about that afterward, okay. B: Okay. Uh, I've, the high school I went to uh, was a good one also. And well, I guess you could say one of the problems with the public education system is the disparity between different schools. \n\nTherefore, \"one of the problems with the public education system is the disparity between different schools\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: I do not know. I wonder where he gets it? 
You know, you must, I think TV is bad. Because they, uh, show all sorts of violence on, A: That and I do not think a lot of parents, I mean, I do not know how it is in the Air Force base. But, uh, I just do not think a lot of people, because of the economy, both need to work, you know. I just do not think a lot of parents are that involved any more. \n\nTherefore, \"a lot of parents are that involved\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that And I was excited by my new importance. Proud and pleased to be playing opposite Frank Donovan who had once stood in for Hayden Coffin. Occasionally perhaps I should notice that he was not the jovial easy-going character I remembered from my humble place in the chorus. \n\nTherefore, \"Frank Donovan was not the jovial easy-going character she remembered\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: uh, but it's worked out for my family, to have my cake and eat it too, kind of thing. A: Yeah. Yeah, that's a good deal. Where do you think this is going in the future, I mean, do you think things are going to change, \n\nTherefore, \"things are going to change\" is guaranteed, possible, or impossible?", "doc_id": 52, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Guaranteed", "target": "Possible", "pred_idx": 0, "target_idx": 2, "fewshot_idx": [110, 170, 242, 73], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that ``And you're wrong about us. We're not children and I 'd say we're learning the rules pretty quickly.'' You may have noticed I'm not shaking any more! \n\nTherefore, \"he's not shaking any more\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that B: they did things for people, you know, for their communities, for their, uh, families, for their friends, where now, I'm not sure they really do. A: Yes. Yeah. Well, I think sometimes through groups and organizations, um, uh, when they asked the question I thought, well that sounds wonderful. And then, I wondered if people were unwilling but I think even if you went in with a negative attitude I don't think it would stay negative very long. \n\nTherefore, \"that attitude would stay negative very long\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that The trend toward lower rents may seem surprising given that some communities in New York are bemoaning the loss of favorite local businesses to high rents. But, despite the recent softening, for many of these retailers there's still been too big a jump from the rental rates of the late 1970s, when their leases were signed. Certainly, the recent drop in prices doesn't mean Manhattan comes cheap. \n\nTherefore, \"Manhattan comes cheap\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that At length she decided that there was nothing to be gained by worrying her. 
Probably there was some quite innocent explanation, which Roger Kenyon would give her when she returned the wallet - if, indeed, it were his. And yet why had his manner changed so abruptly when he learned that the girl whose hat he had rescued was going to live at Sunset Cottage? \n\nTherefore, \"the girl whose hat Roger Kenyon had rescued was going to live at Sunset Cottage\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that ``Who knows? The point is, do we go with it or not?'' Do we assume there is a shipment? \n\nTherefore, \"there is a shipment\" is guaranteed, possible, or impossible?", "doc_id": 1, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} +{"pred": "Impossible", "target": "Guaranteed", "pred_idx": 1, "target_idx": 0, "fewshot_idx": [186, 82, 243, 204], "fewshot_target_idx": [0, 0, 0, 0], "fewshot_source": "train", "fewshot_num": 4, "ctx": "Assume it is true that B: Well, you've got, well, any of the big cities you've got the different rival gangs and they're having their little turf wars over their little drug kingdoms and such, A: Uh-huh. B: And they get out their little Mac tens, they get out their little uzis and they're going to fight with them. And it doesn't matter what restrictions you put on that type of weapon or a class three firearm. If they want it they'll get it. I don't care if they've got to go down into New Mexico to get it they'll get it and they'll get across the border. Now my position, although, I have absolutely no use for a fully automatic weapon, anyway. A: Uh-huh. B: Since I am a law-abiding citizen and I have never had a felony, if I wanted to buy one, I don't think there should be that big of a restriction on it. \n\nTherefore, \"there should be that big of a restriction on it\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that However, I will let the gynandrous renegade stay here under one condition. If Penumbra questions either of us, we will not lie. If she suspects Crevecoeur is here and asks we let him go back. \n\nTherefore, \"Crevecoeur is here\" is guaranteed, possible, or impossible? Guaranteed\n###\nAssume it is true that The South Korean government is signing a protocol today establishing formal diplomatic relations with Poland. The two are also signing a trade agreement. South Korean government officials said they don't expect that Seoul can loan money to Warsaw, but it can ``offer experience.'' \n\nTherefore, \"Seoul can loan money to Warsaw\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that A: How did Radio Shack work? B: If you go in and buy anything they want your phone number. And I don't think they're going to call me and ask me how it's functioning, \n\nTherefore, \"they're going to call him\" is guaranteed, possible, or impossible? Impossible\n###\nAssume it is true that B: and, you know, they just love kittens. A: Yeah. B: They just are fascinated. A: Oh, yeah. B: So she doesn't know that this is a cat yet. 
\n\nTherefore, \"this is a cat\" is guaranteed, possible, or impossible?", "doc_id": 51, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_name": "guaranteed/possible/impossible", "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""} diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1548208d30c9309868d4cd3bcdb1455a3b158898 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8bc127d250d85c46e228132f3be9909ec7d5901dee75fc674af904f9e0d59b2 +size 57301 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f58282eaf286db80b6bdc923a5417f3f99675f72 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b212213a80db0750f637a8c7c8e62801835375474671a470caa86fbd0125469c +size 81125 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d0b7b23e293f027cbc13158c7145d24277a7d3a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d45ef28e857ed22c1cc3e233a513468dfb8a0d88fc82dfcd19a46befc0e9d70 +size 103750 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cc62fafcd7bee062f20359cc29e3455b675100f5 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed4d6d998e6f11e4b7543e55a7b6f4c2203b3af7e164bf234e9538386ccfbb6 +size 125907 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7187941695949437deebeaf4cbf32315cd87f48 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e3b37bbfd9f8cc6ca1bddd0af862ff738095730fbc91f97688379d0fcdd0ba +size 92229 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f5882a209569303fb8dcbaebfb9430d674869b4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5985ca6ccb548c3174be60e6786af9f60e33fe6e2628d01b76c63fc49b327690 +size 
111593 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..758103d9ddd9ed5d1bf281217b4c804fc5414912 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:070c8568e8ecc29769a0d330102bc5b9adc7652732e22d3f47739e214f2ea474 +size 132095 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7201dc59351f1f16b656763e1ca37a055034bf66 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7faf6de4517c24eab88cdf950a819fb658b121e898d527997d5d3e07e6a62c7 +size 152227 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8504eeabc6e754890f71248ed5d8a0b5c581e88a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013425c11f75d7cd50ca3c05636ad296143812771a6c7f5ed7723a2272827274 +size 172060 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..033c8ba49696884d16c03f8c5c12711e12de0187 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8114b897074946139789cd2daa53442c712ea11803bbaf28f7c7ef5213115b0 +size 87862 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..acace0f83d99d09de4eebe183cf6e400266f19d4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a1cb9fa10303fe551daf929401c7d37958281de82750ccd165878cc9d42589 +size 105055 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..153bac95a7762eda72b49ba34e446002a7d9ddf7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4948422f564edff94f7262a5ef212005ba7db008208ba89094fc958a50a187b2 +size 123405 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcbd38386116490b6d97a105f3c981f015f9db4b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03be26b1331750ac422fab82973790f4c4b338027aee6f0a58345ed2e5cf6bb4 +size 141421 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_4.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..75471fa0e8f4f241e2be67e35f669ff7b9e99bd2 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bcaf22fada0e8afbfd67fd430b751e5cf524b2b6d05c75a3a269b235aafdc83 +size 159075 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47d09667cf066c968e3944e497c67ee473c0a3c9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed85d3602f6e788c4af798a96b32eff407d90ede7078eccd6e69113d2182709 +size 85180 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb877530d7336e350dcccceb8ae44bc9a8d63a8f --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28628fd34374548f6525a49c864fff02dd42613c996ee4e8db4d376d21b8c696 +size 101230 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72997de611044cda71d1bf18fe0cb647bbaf312a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db656677f8697b5a49cf30797691a42ec0117cb05f27ae0be91d5c32616ff045 +size 118464 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91fc3560251bb62eba745a54e8e6214593109623 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e6575af14e33b9e46e720ab48aa661330d4c6aa133e7954da92061c9b659876 +size 135377 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..787623020dae77b3cbdb9b3448cea651e405b5b8 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28f38312880f99f890aa18cbff643515df44a07521abc1dfa3aa459171aa01c4 +size 151933 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9df2b3cec15e0a8ad9a2222889a5153867012cf --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09e8b87233896402576df1ceb54c1e4d83d79cf1bd330fdd9e19e6c41e9bace +size 96762 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c51c66556679f14333302af55d466975dd97976 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:830bb0797960f9bb8b203c23dc667ed7c0af75de5ab9f027b4afab1ac6c631cb +size 118267 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b19c838e102280a83faf2e8fe790343422c8985 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fabca06300994f0a4e95be5d909f64837f8250910b36dc4cf7d8238160711bd8 +size 140914 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5f6ffa541a461349db0d5f1cece0fb04b48b26a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55715651ac7eb74c1a01b69e61424174c57854234abddf0bbec00e087f0b7c42 +size 163237 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a987e483119f7691a24e9f8c2a0963577b80f8b5 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52783cbc8901622501f3891a1a165b00031847c3c55d94a0c0234b37cd18a5c0 +size 185190 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d89621edb19e11f31ae1e635f8c6fa31a0ce8955 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fff0896d17159ac06a45fa8d0a03932d145bc64a001407b88b0183b8daf827a +size 95789 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96d416f16937108c566ebcbbc8a9cd8568cd9be9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d6f256a7776905caff8fb4ad5f19883362d7a7b21cc7b44f8c49686aef628d +size 115657 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2513526e7fc0037e60f0e4146ffd1fd706b1794a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3378e0248585de7c601c4a4a5acfffe5938b7de415e314251324ab5e604dd228 +size 136677 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c6210adc4919324c450d8a32f68140ffa678ebe --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:bd44ced80788f4a3212fb963e154b4945184104ca5e80944e79ff9325b695852 +size 157391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4734f0118e80797494d1ff822e95df7b1368ea6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a58a5eca2650b08d2d6fe60dba14036c4f78ab879bb9cd6e36cd5e23a35743 +size 177743 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec7e0691231b83e6483a794bde952ee6c4ce5bac --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5073c3ae149baf289a0a0789cea2184b1d9903de7572471121d02db5d3e8c1d +size 3119584 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a06a0a736464a594cdda19195bcb014845135e8 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a8a581f217a2ec1554c8ab8fe8bb5f42c2961dd3e22bf366f70bce4ba2ed3e +size 4133246 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b85464917d51d8ebc364bbb1046917adfde87f6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d0c7eb17774b9bec89d06d2908f385e8311f7f45311836203815b38a9ceb80 +size 5303110 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90dd2d5eab282340380390ddd339e779877b2086 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1297329bf3ea17284d852d13d48ecddb61bef97e235916e45fb54b3268cacd +size 3460368 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b60622d7c15959ee667e77f2661af3c8bfd6d73a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2170aa8d41d782c2ac81f06196b863d18493bb88cb3d0111c78834075eb7f8e0 
+size 4354709 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b20bb950f557a47f3b75788499ed5300da7d51df --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d7dbc16c44c3f985ad4486a8351ca6b396d08d188f72bece1a1c12702a2370d +size 5272024 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..edc22921e0ccb9c3ebb0a6bc216eafa85e2a0a01 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd6f6f86a80c557cd4745baa89a3fb104109b0fd8734a1f3e3a1da1b996d6f3 +size 3596379 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12ba5607a9a90056f4d2075e97c02cdecbb3adb6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5af9a9e53e1e005fc5265404c5fe61f4efbba048f6fcd3923a4828442187309b +size 4790230 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54948040f77c892909f6ce483c84c463085524a2 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3dd8c5dcd6de672d91e6cd3744c8c05c377b20d14b0cca0058c04010ca3e46c +size 5950639 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4d0d3e6e1a0874e4d5903425bd7250ea5d0d7e8 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d264090c4c2eb1ab1206044e1c9256967589786a5533c323eac087cf9709ab5 +size 3755564 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69611fbec21025e894aa80eb91d3f6a7c24a8dbe --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd12991162519a858c395458b06faf9f71923c2a3b4ee49375eb955c1132c8e +size 5189372 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50f6fa0d37b628acedc88dd89af815e7777720f6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e33fc4fb2a1288b43c306cc6a84eb8edccbd71b583a1b5bb1b8b09b3aea8f0f +size 6256820 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be5aeabfb94311dce610bbc2875e871ed44e8fff --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c62fd8bcc0cf98c0cc54330fbf7eae2f4142cf985c44b972d5ceb480204e430 +size 2925936 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..174da6b675682cdd2ec4a879a5196bd33f51bcc4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9bc11dde085c791ffca7f6325f7198caa8257b31d38a9bb8f25e7db556edcf +size 3894596 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb929fec2d06adcdc0357859643eeff4d7832534 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c824b9d3bdc9a244270f4114d1173a96a7b3c15d032e9da6cf0a2423d7067bf +size 4694749 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf23f2ef81f3a023c16bb17d37fdf01cc3a8256e --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76b6305af042930c77e16ce29eb539b4b634d0efd7e0ae77f0f7ec6c1b51e98 +size 2889695 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6520b055c3452025cd53d8091199e4721416b4da --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a052a1b91bf372fd0c758fcd4763572cbc6ef5c2d843a5c6ffdfa12e0d39ce64 +size 5204384 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00bd63f5fe763c0e058aac89d91c185312b740e9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac12bb9436ed912529606df4c1306a584a8f684ba879ba3739b2d70972dd8dc4 +size 7493588 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..110251552e8a9002627bb21f7ab22a5aef966ef9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362defafc6d843c7465a8edca0d133a5f9275d9161635646ddea1f4a4b0aeacc +size 2763850 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..607b8fff24b60db3a3a8b8e9b1eed7d9b8455359 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5960889472c9c53bbc6e7cf3e98c4a96ab1e35cf0f00e464813c8e302d01bf74 +size 5054564 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de773a257f994fa1205aca7e8260c104949e0e0c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa65af028fc7badbdc40eff91286e159cb9cbaebdc7057760726d0e75e6500f6 +size 7317148 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_0.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9502f416f7f1a7fedd0f7d3fc5337179d345cfd2 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b24993085da9bc112d74c0b32b484f7c0e92c5c6486072c76e5739e8f85ae40b +size 2829113 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b08b72f3d2249f78b29e197f790255a3c9247b3d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7c80c4bde9250d30e8bc529cba019403abbdc105b7da37debd171a3d3d1de3 +size 5106610 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77d921f0d2ea36ad1539df124a83bef58da4d5ca --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5144aa16227b6e30adfaff43cd4920d38be569802f60ac4116d5f8b3f3e89c58 +size 7380234 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87eb1483e4bea5fd605a42255474566057aa9a79 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d350c9145164b6ab6a8cedb91b6e0d55808fc46411c3b5a6aaf608104416f39 +size 2813161 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..870347101b8bb240bb0b3b3062ad632975e437ce --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1276de53bee41c40ee1751d6ac0da6baeacee29675ff4a3e58641f86373c8c45 +size 5097887 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f08ed9704091a57f37dfa568d57d4ede1bc93ebf --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e75f531de2ca56e56e2d63801566240049a4cfcd5a098e4866304448002c5a2 +size 7366339 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c711a603e679c110c0b36347c61355e1ab802b1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d88ad1f7394dee3282195b37b7d2fb6580f33bfb7b3ed93b1f81db7488723956 +size 2867701 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4365b5b40880595a2308aa93c530140bab3f5177 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44528ccca8eda2a82273b58f9ed7c293fc6cd64c95c7a99cc62571a1292ff7e +size 5163411 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0194033186315215c0340dcde5944790e25da81e --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4d21ede95f2f70c66ba39ee87a98c1ac45b78ad36e6587d3254e1834ede5b99 +size 7451071 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..001fcb81f650fc9e4979a27b731593a7f6b7ebfb --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:776c1ec5068838c611bd533bafb6b7f742a67ac411bdc0c42aebd4607d5e425a +size 2082335 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a2df0137895d1be3978fb8e896bcfeca1b894a5 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab27653c8fff2006fdc964880de34c16b5d54ac11af51dfe942d3e7eeaa3e0df +size 3176404 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49804897c5a926a693d346287c00c1b18f06e859 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01244369ecdcd1f2ecf7d9d6b5469a787d23061f5576bd49ef99bed7f7663aa7 +size 3765961 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_3.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c482c3df9decd346f5fc00abccee2b98a02e3ee --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97bf18b2f511bd1d461a4f115da409b0aa8d1d71c9f6a47fc844359cf871a4c1 +size 2900909 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7df28fdda259f27061c08a8c12301a16f9aeba34 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d8128dbb8810f2a3efa247865cdf13d5fdcd072d55b1d91b4d9a0e12046a7d2 +size 3683911 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c54cedac951965e996404f36a86a3b64b36e93d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0835bef83c7b98c533ffb1032454ebadbb03bb91aa4b8cece1dceb8732685b5 +size 4458372 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f988e2c0059e86f3a37b70de22134d7e31d34c2a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a08235840684728bed3265be364bf597f1fa1b8579b8d8f0422190b84402bc +size 3058839 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2524b2aa2985389e4cae741034d375f5c24464ef --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b093691cc8c3d96facc489fe39c344056c8a6d03624be0be027abe46b3a0c51 +size 3267036 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_2.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06be893e3301e172fcb0b47a78d870f1a17f2a5f --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee1da054b6fd163a033185c1934942ee5f374cb32b0186f3922d6002f212558d +size 3531996 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0be79427cdbb21920be474e84af2447cada6993 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:803fdff989eedc35ca2d4d166ba3d147438458fba8da3ab78970ba075c90ae7b +size 1864129 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9404f5bccab876da277ebfa14b9d4887a658d48 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fbd5f9e40a2fb7dc1d204bcf86edd14dc5e4e83468a42bc21b9754618188ab3 +size 2557203 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d339b2b287cb055ad9dc070fe3fede990fb1ebf9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ffa581a9ffaa04db91940b63c1e2960689f12120f00380fd05a44f5ef72648f +size 3248305 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1839869131c0a0764ae6b13e1445d1b6a5b401c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5af976a4081dc55931fba501060eb5c393e97592b8e3c1ddd97fd453ba051ac8 +size 3930866 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5a74777d1a798346750cd7a8855568c41b28300 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c84c71fe4dd08513f1921c8351fb774c30f5b5688e4aebe279238b772d52718 +size 2254293 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..4c6ee3a172eb54028d9b1317ed4a4d2d36e8e376 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab476a9f82635b2dcaef2b603e11b4cf626dfea24c8e4fb21e44520aef3b4c7b +size 3016595 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e22dd832653a42228e9da5ce20e9be0d0f422ca7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1687c13951493f984fb471ea2560897714fd422fbedefbfdb4fc930297e0903 +size 3773939 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6ede0db6113eb0036e9be66deb9429713edbb29 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0098d642789a017cb143fbadb0e498b23de80c895f3930f41e279a6e6a8e2a47 +size 4519155 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3047771f4f5512b20bae2a36b38f36833ad3aada --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc37bdd64478e5d3d931834772ee934d8e424e204fa81195684a2e890cab4df4 +size 639704 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..676b13f187261c259d8061199caa9962dac48613 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0f749068d6307e5654d15901ac855d0fd368f819eb35c91f0fbf7bb3691d07 +size 754593 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd14701c46daf4eac6df62df30abd986a6a9a08b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65346ad16493b2718f5cd24b8f51660e285e993f2113cd52d1a585eff254172a +size 870722 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29cf53f2998b165355f11b7ce6b700d723313752 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f5abef1989dcb349707b98536a4287a6980a04c693df3eed1bdce2b2a6d0927 +size 985152 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3e0ae3d9949365fcf2938317bd2c2118c824085 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebe26bbc7a2586a0f2cbd4d0bbf986334851306f45b3f697cf8acce502653b06 +size 1182340 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e3d96876c0f5f21c6ea737e8f6c622f81a3d39c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dbf4ca3a9ffea78a9220a63dcee7181d245a115166c5bf6f9e856caf9f8b3b8 +size 1779315 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2680cba0b5e009ca8ee539d5a641d2d979b073e9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b3af18af7ca6421e95353badbf0429fb79840431ad5f828659ee29f900eb95 +size 2388561 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f345cf424135675f47fb363b623c8711d4ff00f3 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b0a1f55d3d5ba3f23e7c96c972d754d9c734e93bec95f106784bae4f6a7310 +size 2973516 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71fff6e8d0a9375a609430270266dd213547e5c4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d939c59e43cb507ad03d06d210f7df8263146af40316d080e512781ea1d80a4 +size 1328537 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..750d5d7a73c3fc7c5ff57592fe376b6427c6861b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd7939150cbd3e3c70297fe407a99d11cc9b15efe146c1c0a09c7e6d1b7d0cf0 +size 1523868 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e789951fdb57d678791e392c5e7531aae667cd63 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:09dbd720ec961e616243c1e0a667a7dc22313359b1bf6828b17d9f3fe8132411 +size 1719953 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61bd2a66a20f35c5fb8320d9bb480852d17b1b1b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0008ac286fa43e874252c610eb97c9aa9e0583c1418f0915e230ae1a5156bb51 +size 1915002 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..908e9065633f3c8c2e777183fec5a05253e3443a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae2cb74115c289ac0c5fccd945e8b07fee2951390fabf2e29166b1db51bdc5d +size 1934756 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d87040ec4a95bc800133020313d8a8ff8444c7d --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0723d47aacf344d867d4c2c6cbcd97a919728c4a19b82d6e8e15319d41a2421 +size 2635940 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b890bceda6256f093a02f6d8e348272d2bc80c4b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a5e1d451c70f07fb964f85b68b34108e744ec4826ab8b34f97ab288cec748c +size 3348914 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0946b29917a2c46ae8d96fc4d6298ec48b10f237 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7698b2748cf21af36b93a713baab6f6e1643833c433362fb4ab4cc68891706 +size 4038415 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3d1eb02b9403dcb421c11be0fd17b415ae7f508 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2fcde383028755f51abac967b68b844cc8da5de256350cd072ab930bd044a35 +size 1870061 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7de42185340a50f031b9362fda65f41b6a13daa8 --- 
/dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fcbb12491f1656cf106fdc80f3bcabafeeb9e4367420945088668f334399fbd +size 2545356 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b729c553c774bed5066ee92ecaf16035f99ac8a3 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b298d35d9b9409e015358f8eee87381c52bac7f400577f5b17170879c83c83a +size 3232447 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0585eadd1b5a3bf9911641da8377202e50247c2b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076d76b8cc51cabc3a6d775cd9f86c9886893ad9bca8c5f363a3e4b6b0b73318 +size 3896045 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97142126d061e981348bf9dda8c1a6cf9c7617f2 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8ed03067ffca31ca9bad60d875c4545bae08de6441c04d16132427930307269 +size 2213673 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81a43adeed988221300e6b2632c910c20214f56b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd859d273330438fcb18c152e8d01abcfaa0cbcca353e38e30ab1b477b76da0 +size 2974329 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af7c1e4ee331d9145bdb1327d97c658db40abdec --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c26a52dc1cf17de31227c4784c6827f73bec639d6468eda373907b276669ff2 +size 3726477 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a5113f611b2dfb23d74b886e65c152f0c11ffc2 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745cc33fa2d8113266365c4bb69ec9876c9c79b1359b20d73d8afa60b87d4cf3 +size 4477886 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7d66596d54e08d6057474d0ff4620f8f64d3de1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbe516d1757e7c276f47ed9c85588da725a23337e32e7606e3ce1618fd7f8df +size 2360081 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..736a73345f3b360a92f5099019ef7b4927940c6b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e7c9d1241ca6e047a2d638872acb3a9ad983d7255f784d1e3b40d2b2df7eff +size 3193081 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0dda31ecf8d3ebddd7c9e96a516597917cfc0606 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a288dbed71bfe20426d2534db930549a8395108c24f5c6d2e05dfb73b429eb4d +size 4016336 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b96a8b8a6a23ea48578034536021a36cf6dfad6 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6bfb1d4463abe938f2c17dfbb62dc45fb54b4bc7d16fd0d4cd35ff607c88795 +size 4838859 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..309138f6f12bc9738020cc6b744779a2d8606cd7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684f73ce0d6dff92d024152a7a226a375b26c3097d9af6ed25b5d73da9238bf0 +size 1879141 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a11a26f9f7d19044f3cfb0dd53cba54a41ed2fff --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08a3991cafe64a47f090c5f33a3f59b0e7c7c83a86b111fd66a3706264e6e7d +size 2435150 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0af8f0adb1a73496755ef9aed6c199a71a98be03 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:ec63005f9f82213e9730f957e66c21985e68aae75e1aac5f545c4bece2d5ed89 +size 2985685 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..278319351a0b9cdcc0c33ada826c37cf95ff8369 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d19386fd1de95303308f67f0e5c1d474eebe4e30dce109eb88237d793de7484 +size 3534220 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e5339acab471f22cd8ec7b025f6ce3a18e46e1c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6bb78d20b5578997a9cd2840d1152b6c47569a522d51935e7c67f0221e118e +size 2370858 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d162d4a080c2a215ff7f40477c08b27a7004287 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a964300d5f2f446bb0edacc3ccccffc4c54bbaa8315b58a971be90770907b8 +size 3210340 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54a6b0feb028d710722d3db74e80c87ed0ee9f98 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013ea4ccb8b64d10cf377b05d2dd4cf195974a26914c7374d5475c3808786d93 +size 4040879 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa90077b34adee1d4bcbe7bfb1202cac03c6c358 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7435951f7ced233ab753ddeb6a168b11a62d975a4a7a89709d7e4583ca8d6f +size 4870897 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fbf895f8ff2c91c5b62e9bd3b40dc2d30420aae --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2a2bb4171aec381739c2ec77933e4e5127eed78440e18b8019c4d5519c340be +size 2343483 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6d099047ace24084221405c4cc9e2052055c736 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0546dd27662d1d15c2249f8ae0ea068a41a94f0b67db0afb834299c5506c7315 +size 3158169 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c12d0f9534eea17da7ede1ec3d4079759b8a71c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01299683cc91bad7b968379f2c5c00b078bea27d443e97ac925f6a39f0e4b850 +size 3962360 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fc97032eeb339fc12328a1bd75ef2f1acb53c2b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45289dfbfb0f1357cdead7b2b3cf0a8c13d62741d46c1caeccf46b924b920b60 +size 4765910 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8f876bf56778e68f28466f50974b2b2ae715dfb --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73330d9bd9252f4b33f7fd37cdb26ca64a8f581d042485124402e3ee9193a405 +size 250548 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c976399ee0bf5c951bb9afc74cdf36abb185fdcd --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f3f3e302ec07ddd94866a123c9bb8f31a78572025210bf38a3b9c416f1914d7 +size 351367 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0bf8388e1479fb2232e9495eb8baf317932ee3d0 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c198a5663e26da8a9af76afefd904cd737d6acf8e97a1024efd708dd9b5a10b1 +size 449658 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d898ea65c01932bb042c02d9fa8f701580e00f77 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:38681b6053572b42f7d063c594b057f037cfa782d1f695485540591579ca9ee5 +size 553029 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ca9c4b9ac750e617193bd96961cff534751a53c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:008dfaafac87cb6f60c04b53972bc9d55fa82969410fa552edbde2b2ed04c907 +size 651639 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d90fa7a501e00971d5a1f6074df12e90a3e92cee --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65f0f4f897dc142b5ebeac6356c7e4a5991da44fa8168cc73631ecd220bea699 +size 292970 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6efa92429e4ea67827903791ad821f9c7d0fe7c9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd6e86bf018400c7a0aef0ede59950485c68193062290229c40a9be0dc312dc +size 415400 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..727cbe96d8d6a254fa109c14e83fb8ee1cfb8bc1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:248ca8f13034ec0f997a36a9edd134bfd02294fd7eb677cf272e895409a620c3 +size 535352 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fefb36e9e4628b858d470eab499e22fe5614cced --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0669ba4a6fa71dbb6623ad3842374b04c91e40675447f3432cc5f542f3584be +size 660311 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..059874d6d89833da76b1390ea09f2a3ce96cb75a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a153b7ca3bd2e0a0157decba716d5e63db4bed359950b10303c6ed54ea04bb +size 780531 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f48799ba8f2372463dd4ccdeeaeb76f625deb8fc --- 
/dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1156f85fa76b1a80d3aa9366ad6c195eb42ee32a56c0aa2ba50b7023f6fd5972 +size 258524 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d80d8d080d5486448378e2b6434d7ed525fd4ca4 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe60cdc60c9af2908f329c8f45c8791a99793f5ad384ea4cda56de95a9fe47de +size 363041 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e59d004062601edbe0f794a6e9067b5b4d15340 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836a2d5c8b0dbec88a9f500cda80d902e6279a870d2214489629d901a9ad3ccc +size 465227 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7b998d592c70c53da70f0a8841fe90ed030ac48 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ba5dfdc00b35b9416c460b779828a2cedc3e4e0d530542b5b090c35d87128e4 +size 572464 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed96715d9579dde0a418c1631a467d55a94452ee --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68de920d01068f9281eea917175b394cbefce093566568e455c21f437d20e25d +size 674953 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1413fafe02b1e6b7d7297876887a4c257edb19af --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7e1a8cc1423022fa963691e2bc8ce2d8721bd8389815232c826c3378631b83 +size 261299 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62fae241f4a3e3abf61bd28f9154155529f12338 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:911663bfbf542519c6827adbbcf96efb1704b55d142547fd4bbc70b4a1ac7611 +size 367752 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_2.jsonl 
b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..495b56f9a25b3b94dfbb161e50fd629f0b1c7c55 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:320f63de5e21033cb446a6bf6a413dd7cdf6d6267e2ce67c104bd7d0f36db894 +size 471877 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b493cda47f9008d9d0175cbd165a2fc0aa5f978 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67941da824bd47128cc307ebe318c2c236bd618c5beeb36e04f3307adbefe5b8 +size 581067 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b92b052208ed18c6623de52b4d95f94fb1dc525a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:556b8b1bcdbb8de3b160c5d1cff1c5247fb89d7b13f9c25661da10190f256b63 +size 685492 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95b7d93fa4905bdaa8611db131958929264f6ce5 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f602aa8da2682eae0a745b8b6db898c01c5fe99a96b1a1bd843659e48030f7c +size 262407 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23527994b6113302ffbf552de0d62dd6587948e1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51961d933f013f85ed1a7479aa0cc3e5005c573955cfa56e2e1a2e8f0af2abff +size 369691 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8e8e936ed48ebaf68b2010bf1476b4cb0be4017 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379b5239323fb5d5ed68daf74cf8ebf24a569485f993b2fd95ee5152a90528cc +size 474657 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae999e7b6f5cdd529d48d1f51c919511dd1911e7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48194f26616fff3d809b18016eacd73748eb6d3bc7f3d591adca7d502ded8df1 +size 584703 diff 
--git a/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e47934d001f2c8e0b213046da1b549f9a04d0021 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a1b045e787b845ae1d9494c3b4325aadc6cefee3d82ca56ff06124c26ef35c +size 689979 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd64df46c6d4093f9441faecde27da7058ce4016 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef2211f694403c3a703a34427928ccfe933ad66a14f7f361662b46293e3e1479 +size 1039286 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a8505f32e4595afeee8c1005d9d7c578f08f662 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6579b766c4539b747c21aefdb0da1fd63485b03162a38307dfa5f2604fdef06 +size 1300170 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36f94cf93489ff5a3f9f2aa4d7216e69e755df16 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3218b8e3e6a73dc34593a485254ced3dc66c3549860fc8666cdef0375ae76195 +size 1561273 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be6a75ca1b4a6d3c03086777b5a03332577034f9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b5c8c914d9aafda865918cfe8639b5878e2be94fe6591f173e1bda76e498cf9 +size 1822706 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e245f293005aad140bb18e66c2ed064098d75a7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38a93b006d8627048357bf8f028bcccae590a0c17ec1e69108b542f48880ed6 +size 948111 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdfa6e8e91fb57520160ec657b6779e53dc1fc21 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a714e7428e7fad25c3c717475585cbfb2a78df1fb55c3dc718c9a1cda3744827 +size 1180435 diff --git 
a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60f3a6c7bee1ca50e71670aee0ce41b6bcf28a6c --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bd127ae2d2a8aa77ed897866b6bcfd1a7190df6877814628ecf68bdb6be7ab4 +size 1413063 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..381d78018c74793bea53239adcc5ec231d3429a1 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7af394a8cdea54f2786292ea0ebb4b9e6b172af7eb1b01f4c4fa7b1018a3136b +size 1645925 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..164bbcf7596e59f9e66ac00f5e3260afe86cc828 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d310051f7c5e5a50a4279345ff674aaf0b6cc93560410efbcbca01fd0bb1980 +size 1009922 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..127dbfd0846f006f85139fbd246dd46ff16d1a36 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50d16b778ec8f177193efa8190ba737a316622ec46c7320c71262591d5eb0bd6 +size 1243188 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70ab30db5b52e41516fd328d2b097f6f434b00af --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48fff9d2a45d764d38a54ee06ea9ec7f7988104bd652a65ce8dcf5ed29d97a7 +size 1476420 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..792a7c3f308831248eb63cb06fbaa28ea8567f5a --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5602892e908ccd1d5a86b6b7c4f95b010e6a7c2e950c9a76910923801e28ba7 +size 1709983 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7204149ca8e5889ad8c277194406444bfa63d23 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c3ca2e8e25bd697c9feab00f5c7e00c6f30340659105292743041af1adcdc1ab +size 969418 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..194f822453ba4abee24a42ae3a7ff8ebef0dc41b --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41f1ef8007d9c8c77bae6850e7e6c7f43bcc0f380894946a57e8a3ee1b0fb9b +size 1205035 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e56254a893aeef6868a9b9d09ee03164e10812d9 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1db838e66204c66b152ab6e9158a80e60b09283ca5dd51bc298335699926a4 +size 1440890 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbec7d8989bce573e33f27d20de027ef9d667f65 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dbc60e4e4e5e0010dfe5e043dc17b995baa2c7defbb807c1e283a143e7b847 +size 1677000 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_0.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9cad3280c35ecf3b14f38b0804f2005cdc67891 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c783c0261f913160ebc5db0af333fa646a77140f0ea162ccc0e8a445016469b2 +size 1016482 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_1.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58659ce5dabb01df5f0f2ccaed36c0ef661d0156 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35bfcb10026dba6a4a76188624e7c13eb5a20d0e0b822ddf8e7f3e8c116ddc43 +size 1257169 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_2.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83038d46bc1719a3432ceb7818eb2c88d34ad3f7 --- /dev/null +++ b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:887c9b9eb694bd7985808d9d0e1302ed513c67a182e3f122c96b7d7a92a2cf31 +size 1497989 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_3.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0824675b63f96c234755892e4fc4f829078c479d --- /dev/null +++ 
b/4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7573ec4cf154f612e528c4bf50136b3fc2bf8a3bd011cd365b9f51a954396cde +size 1739155 diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..82a3a3c7992573872fe834ed1fb61c7acff79490 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.37466495114095594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03116802815957843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07404480158534202, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014902788141002475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31212436257013715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004568108823452381 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1135227535532633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020226161176734882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035086188446589825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009237445527173 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1533304291888016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032417614710948096 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05397463032306079, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012920845251919216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07143892188082063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013733993783630446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3047679674724977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044697317892439005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10988632514530636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018918967086146744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07084668944691519, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001397360756298291 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2997116780220477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004343713168500645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + 
"rougeLsum_fmeasure": 0.10869588574047251, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019005624996464461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..00295ac96f29c646e919e62c585beae92f63edc8 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.45724995573151395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03077826387670849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07380410032029427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013950698189371862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36979685211837077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005157808096954796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11499758191359032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018677715006027427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03456277372759004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008655744665680597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1825581650807761, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037692839645507878 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.053994419166879803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001195483166076756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06966555184597864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012665956123840754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34681007385001145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004679678346399935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10864238895715833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017176631324308034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 
0.07031937468305927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013198255014853241 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.35067051099532537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004753075684894469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10948950405713026, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017632128912827953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17fe261f33ebfc981eca0c94ae59a9b6db969010 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.08094756912598497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002409102863540913 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.14127847705177338, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0017639762306188122 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.07376523883400579, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001716973465887151 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.001962954313202467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00024283176641268995 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.0034433920977868515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003613572553244804 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.001825879960798421, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00021705897236105626 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.07101385752649907, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002044467304290158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.13371197995222325, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016174280494899666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.06611311172184264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014225513611251844 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.06148301912905695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020032593057346037 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.10281964626274133, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014198843683178699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.0544351778389216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013592853283299524 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.005224322194484081, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00021302968079536623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ef51562c7455515925596e05b6f0b11c580cd00a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.25105219387361044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005170360898027872 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3963661339879864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.006166209959478081 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.2632442282160369, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004649492833926223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.10324325384704888, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003441742210801506 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.16628550216394442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rouge2_recall_stderr": 0.004378246536170781 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.10898871272052842, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031637179796749015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.19788246648847183, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004276640363752767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.31861786603681824, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004930470540221916 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.2055562623719173, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036386336219648187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.2214424878171101, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004657212513512685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3486705785899057, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005610709630554273 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.2313939327549787, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004163825617671645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 1.9519403405237012, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09494593162165268 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e46d336e578b7e89cbb651822162a5db08499c62 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.09859772585809395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.013778817389007887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.03287343868832309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013442739257702343 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "implicit-graph-description", + "rouge1_recall": 0.21251821310023286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027817450836553795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.04861449574020396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012163696170856104 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.0059870925402472075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005674158222234774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.02426279686355535, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00138986055302855 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.007309673323747182, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000560409181489832 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.030474323323396148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011252285187580645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.20517906724020168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002637720410420459 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.04587452202968607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010358216787208214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.02488075874372872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011705975217800647 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.1551694009228435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023022331833182233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.03578934542908562, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001048259767960614 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..91776be5a8e4dbdab5e3b7b772a8268598b2b0f3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.8855730701366655, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05402887660292465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.08570202503900552, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002251191682556817 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4561121562862774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005476485108348642 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.13091802273596945, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025720056432699647 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.03483952529081149, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012306669089095204 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.19122022968628333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004195715063382155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.052778304024511426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001476775655349492 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.07097502062202066, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001771784998246896 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.40452647581766443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004841486668253075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.10935795632748962, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018990796502685228 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.07558585496214218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020894634297122035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.40177716016300163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005117291808345332 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.1149114814090014, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023559804274623885 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..335261e7d5bb4070e538698c1264382cc0f86437 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.025938648577982906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010640148385704566 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.1489094513106922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003474067096470854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.04162588476887082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015012115710527434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.006486298578918915, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005370453171441658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.028873117560228444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.002234953017274618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.009979458438683987, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007921199531174797 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.024766861616093486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009501479562254536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.14572560070301618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0033022912313721738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.03999367372035714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013595434744328805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.020507241432586306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009098722057173229 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + 
"rougeLsum_recall": 0.11707134925920056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030877854433825056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.03281665632758161, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013046364902866959 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.1350956828500859, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.009865513305687733 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2ce37cf8fc7137fe5ca431b120bd9a8a57353a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.24831089730550815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004780357546739232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6901002304627992, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004128295202936247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.3160936388762234, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0039777572345839046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.12446618183412662, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003184969383336601 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3533119961492384, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004200483017562709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.15630585395712443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0028715875088255773 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.1923432613835867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0037676385944604143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5626163289646795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rougeL_recall_stderr": 0.004213110071992057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.24689804124363615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0031290004102835158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.2128993219354708, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00421846105175959 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5995327632502109, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004004484127096839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.270802478850764, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035034467745711337 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.910751493059819, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08452357261047755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..728824195b5885f34b256f98c05580173fe0f570 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.01911538436920681, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0003503314592826651 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.16632609123655862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0022509999713146564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.032654498601309825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004894300326827564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.0006671575521923488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 4.2242073462787435e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.00956992595160404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006564343184013548 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0012146680449496444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 7.587623242998391e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.019101357116934347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00034860286261423904 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.1662080638295448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022483526136167662 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.032632732447854994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004884533202874028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.014415704194013666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00021940484720304887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.13514571391109637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019103084597602812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.02496623803547597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0003268151474812184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.005621231172467306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00014199233504074313 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..da5cd088293bbba91ea0af502f437cfd6d3dde81 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.1553138653033213, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0035449641247749003 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6751958384437035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004465270161109714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + 
"rouge1_fmeasure": 0.21808043695289794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003102334578158025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.07201402035915092, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002186406300104381 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.3367496211993743, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004320718638419122 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.09971882616743014, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00198054319313908 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.11983577128787204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002848039977052699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.551063468158563, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004502206105834914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.16879982896459372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002412131520433171 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.13727804285693102, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00310050036995496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.610640770573686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004416190675238104 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.19380889018037997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002740435097718051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 1.959859057235728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06468986760192885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4d3975eedcbca7000796918a037b62f92681834d --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17369745237153775, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019416720591579535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.31135751074053647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028278252703613604 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.20744890306323854, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001889783739010265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03684832178440491, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008000457616039376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06963013596903031, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016025397023385504 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04442334363828912, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000913157779834151 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12131088040113613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012101669364236395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2274075459665916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00217190346880656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14688583786669807, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012058252236626717 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.1604070323191151, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017878463609218534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2888713576941007, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026566622649898735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19182985169202296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017460069670729864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.9776709548064586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07545402996122554 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbf4c3a0356f61208cbec473d69a33b49220b17 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.17640434289005402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002005007718430865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.3066393157994191, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028800028706439237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.20809232424724627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019353389343055706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03974292535229771, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000857829234608627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07238796762006708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016527893153633355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04721416030214545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009610927999083794 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.12624587067736417, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013254243768189675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.2277171673299629, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022655349530668477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1505788683968366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013106237942102476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.16305254160491786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001850053368836578 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.28490909590208735, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027167401690682915 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19268916973951744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017991908347606052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.1658866052052077, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07121279156132994 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..663acab2c46db36952c3b457b310ecdf2ed51507 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.07162326549438301, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013415768556990903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.11684671121153994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020890726158097313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.08233778081684881, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014136328956612485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.007179946675972951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003647094563527292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.013370485490546424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007936434590550039 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.008674743075720219, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004461694446460864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06378002446656099, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011167907708427777 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.10585450994467742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018492026296821713 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07382145485768168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011967165146959564 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.06755619163537285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012503644836163084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.11062864063321133, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001963576254467577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.0778081106950157, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013233841707317477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.43524332681177375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019834397414917805 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..55ae56766ed01fc4399c9683f614d698ba99c1c0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.1307619769061793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020066017869496055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.21457261780345513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002890368978894931 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.15052430073600576, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020148125325675117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.024730912344785617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008511600964021851 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.04229212817088445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001416549742011235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.02849747941134415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rouge2_fmeasure_stderr": 0.0008756872410718326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.09765613730182125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014148261990747527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.16574054117385323, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022567330723406126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.1134827324047906, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014198341617713674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.12175399642762218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018573395823604964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.20083948009008584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027101716504798054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14034629936326376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018623589986031964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.7498608764506907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07906985947026686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1c4ab0a154273cb894d840aa1b052db33e8136cb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.11935860401561957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001966951798581278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.18047962059644787, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002288460298862089 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1298217936839039, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016486636351090917 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"summarize_above_en", + "rouge2_precision": 0.017209465779815906, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008676540226673849 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.02547483526259387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000987206757819632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.017695339797954485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006158660622155535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.09989513555905755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015716488614741981 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.1549374771537138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018887319927939226 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.10955506758387423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012658212991754696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.11068732307051626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018173163434014095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.1683872149174189, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021320843992146134 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.12058299633791197, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001512713070449015 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.6633659334049573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049734729687044475 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..84e00134f8af1ac881638ec06bdc6af491a9b3fb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.14896561449083853, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020221764972833405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.2478826336868915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002751441323122002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1718968216119803, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019220394498241642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.029109226932211917, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008866459881706034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.04932178167592407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001497944199301407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03310418623679188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008807776755696351 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.11131788375137605, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013990530230697672 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.192588680149845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00217772926083577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1299888647577953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013387418656703245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.1384485375043415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018637262914521346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.2314774588045184, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025719766768405997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.15994475650377557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001770065686045605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.856535146505626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06411488880916898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..772ac4147a5c05b0e5055ae612e8e3f8411d07a2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14903430496900869, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019044306456448882 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25460834562037304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027808818609942765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17464450784994165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018866549663747307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02932852184049837, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007444651216824196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05378048278152194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015115343557708684 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03487366784140555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008538791169940822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11560989242228453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013437666268655546 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20506193764704436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002299748596638709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13717038261116743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001369756093492625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.13582138493978418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017239221521953576 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23341669122784228, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025774317116309797 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1593997663047623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017134556998076298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 
1.5192817590705794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06494006665273461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..879247551bb552afbbd0edd76a5877ac02cc0f85 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19528801849094218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002201427724682199 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3257340657226586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002895580827853013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22509002102944192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019838100403287333 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04999822434377242, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010227131559085638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08773922376052366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0018594738182377334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05802448056432808, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001078013458633595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13922763000575006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014964090541992776 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.24015503219395376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023374560954638485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.16185295875984437, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013532637497389792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18280815280692314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020740368495163132 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rougeLsum_recall": 0.3054903541601766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027607171534094984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21073250157763135, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018643526489330914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.059381627710027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06573083880760217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9f03e0f7d45ee263e81ed1cc9bcfca2751b0401a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11894228477581185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001843199326087485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.19332711537732455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026663550761881214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1357386682107375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001850556213295791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.01856149961078782, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006185592916887031 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.03332455653915248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012178147112721096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.021740420743345538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006905494612438458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09623545787280265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013365120430167542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.1604168534029034, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.0020999550260342253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.11058953803875521, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013384320596615663 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11045263672416784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016912872299638876 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1807713263742707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002494631697666655 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.12634196821598928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017082391083172198 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.0396077897340674, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0532741309814862 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..673bbe6036024d8f2282e1731c969d796bb462ff --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.1139308599359662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017654070337474956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.18500397029775825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002627335573742763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.13062891767110701, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018203352596047476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.01537236441625929, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006021940015586292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.026591893021603984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011115575787005103 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.01792026914674207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006810574947177749 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08321252562851182, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011456569094090048 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.1391534150194594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018825196882808638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09616512002242138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001182198985439485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.10669664546680603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001644249540235529 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.17374598639039826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024459852240474736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.12243045922583431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016925844417807886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.9715621646319063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.057395077491468285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aff98c023c1cf41e6191dbfbb026f82b220a111b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e41e992c2770ef3ae6d44a2e48aff8dc215c80 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f5002767014dea7514d166ceee872a09cf8d800 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014965960710224475 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014976758771620344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..94f9dca011d060a0c85478a8fe0861243dd731d1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203934 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928362 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eeed4c024de05732a0fd7cea399fe403262f637c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01479492784334864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b734efd28c64e00c5cf787e48c5fa21b6b3690 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0149550879186536 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..00c1453c20a42dbfa815911be072144b43977ee1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015039986742055237 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..095a19cffcd442c96afccdb0b3c49de2618659ad --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015129868238451773 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01506047203170662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d933af5dfe6241d37c3db0426958a0c07807f48d --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732956 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01492201952373296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d708f469c3c8eec3703975083e2f14ce59b17ff0 --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732961 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732961 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d52b696911f615166fb28d042c053f885d7650d8 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015050266127564438 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..44da8b5d78f008593d1a9a42142fea2ce3aebfd9 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0149550879186536 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a58230a996b3233bcc3b8c1e6399459d50e602e9 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.365, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01523177622626491 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d41400d6c5cab7fc6d7549502679336f2afd1662 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cf22e128c4a85f6c00dec415962ed55f762749c2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356951 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055237 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..23daa53638f62114f4ac3f9fca6a845743150f35 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bf9ea2dc47bb6bc993b09338979f223cfe264456 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932575 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014806864733738859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..da010e484e73da4791c710f92e61a578b8617849 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": 
"justified in saying", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795023 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01496596071022448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..59443b785791cba086748b952fad64a6f5827446 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014830507204541033 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928369 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..83816bef256d7e1cfe2eee3a3c178cd556593f70 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014965960710224473 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_0.json 
b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bf76a51b2d0db7c467273fd7e170d3ab8cc2dadd --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932579 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015080663991563097 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..be02959712cb18b961849a9993617f9a126facb9 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087971 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5f26431be94016d5dbda05dfdd4731408c10ab53 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01460648312734276 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01479492784334863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..08fec1d4922bde4e9fb6f2b92b2ce8620db75496 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014876872027456729 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01483050720454104 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9772c432e29ebd3c87161696fed2180247aec95c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014976758771620339 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ff535bd9026e34fcb6ee753fc8cdf5b1c0a9056 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880217 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087973 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51d8e372313162aab9e66edec5dbde9a76b03e36 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014619600977206493 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473474 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..00c44884a247fd44be2ec6e0919a740e8cb62e0c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014498627873361427 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..822ed44184334e3ba85597d12884790342ea21d2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014976758771620339 + }, + 
{ + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015060472031706625 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..40b9060b7d065a153db806c9e4b957e73ec29691 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01470919305605713 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f5dec1c9b9e6165a86b03e2c06610d1b48ef9127 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014853842487270333 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2bd37a87c2c23ab6b9541c7af39398439a861cbd --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_can-we-infer_3.json @@ -0,0 
+1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732961 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014853842487270334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..61385503c9c6d97ae28dd9251ea264178177c07d --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015039986742055237 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014965960710224496 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8df1d16f3d05da109990253ce830d0c7b399edc3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fbe810e0c9dce9c18a9feb4a3a7b2fd6223d28c2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095524 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014806864733738864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6185131df2e803948a3ea6fa5b8fcb59ea934142 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014965960710224482 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203936 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..48e9963804834e7ad2cabad2a9a300bfdeed0771 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015060472031706615 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01485384248727033 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7849b21701f8492cc567744761777f25d0592f42 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014671272822977885 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..50c02a405fd168df33ad2b3950b9673a87feb4df --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.302, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014526080235459541 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014734079309311903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d2e4eff785c07dca11b7de42717f2714b9b90469 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 
0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01469663196079251 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81c4d76c46cd2068bb06b6bd6e476677d4696e7f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013672343491681815 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013588208070708986 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..df636e0c076043fc8ed8a36c1d1de24ebea82160 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529019 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..df71ffe11b30f15c261742e3e54b9f4b2dd7a0ef --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013526454480351021 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013415009084004866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d15005bc8c405e79a4d784f28fc3781634f094f1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225605 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013570806258433616 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..354b63672dba3ddafa100655ca73e0e7d99de67a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.0136139500102256 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013774667009018556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..65481e5e0c2472718156be5d6c35d92c36d5a0ae --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529019 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01357953127780092 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd080bdbe4feb73fb8f37a2c1f7cdcda429106c1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31416666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013405399314984103 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103245 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e181f3b7e70c81dbc47870469be9b5b6347b227f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013395739415639082 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005136 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b41e95cc7e46a14e3a7763a16bc5c25b051e3f0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932873 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01368049572576779 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..962121b4479162f8d4f54952c0c1ca3fa2b8af22 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013655897185463653 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932889 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..822738ccf0c2ab92a3d02b46bcfe344f58e961c4 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01344353868134805 + }, + { + 
"task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.31416666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013405399314984107 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..697927082c530ebab3303502404a628b93d6282c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.0136139500102256 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3458333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013736245342311012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..84321128f850f8baf1c1cf1ef27a11da590aa315 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013376268790982098 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013605417345710526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 
index 0000000000000000000000000000000000000000..3263ab7a6099ea8ded631d56266ee7e6a28a7935 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4390b1b19fd3ddd4fa84e4ff1bd4c7ac48480bbb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013471620929769144 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013424568830356448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..db71cf4103ce07908393c3ebbfa531366e39b393 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251946 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3425, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013704669762934723 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b22a21cf138ef9f75b54318b21f4a1eaa8acc7a4 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01364760294240639 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463657 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c42ad846c6b45bbc6c9320b58b8432b54332385 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070709002 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013605417345710528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..703892d877d52076f1e92080f4f4b899e7ae7eb7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.31833333333333336, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.0134529489969963 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013490095282989521 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..22a4bde9d469d2f806567593f76f6869913c12e7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710526 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d81383e9b2699147150c36d49a19bcd150569158 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739434 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012506564839739434 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7009dec1946827b171c06a04ae31967f4a9fbd00 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012399451855004741 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2354948805460751, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012399451855004741 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f3a918d93db1ee3a11ae3abd6e10449e621c344 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012653835621466646 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8352c6e33746173b04e7147f400f4b1458ebddf0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012521593295800118 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012521593295800118 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a3f56438c16c4fb0fbc2936e8b37d341ae6c2193 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01290255476231397 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29436860068259385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013318528460539433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..285fd3f1523d3fba99a79a31e83ea7a1ae43528b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2721843003412969, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013006600406423709 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3003412969283277, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013395909309956993 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..40bf64748f58f282069662e0bc455018a43e214c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012821930225112547 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29266211604095566, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013295916103619406 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5772dec6c602e755db9e3c6947e4245f5423f8bb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587096 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2154ff3ce224a75f4e52e00e487b6d06fe33750a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012383873560768678 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01286252317535133 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..60a760c95e9666059655a3d51055fbaa9a98fab0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2593856655290102, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012808273573927095 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012980954547659556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5cba00ed6a885f2c57428871b8af4b49acce063f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27303754266211605, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013019332762635743 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2773037542662116, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013082095839059374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9fbfced8796a90cfe08e88576a3a31b66d58d810 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313964 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.28668941979522183, 
+ "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013214986329274769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aac2383ead9425e3199c1c70cb7eb83539ed6d8a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012430399829260842 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012430399829260842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..617d59662c272f7cc0d373f2d701bf58906f8729 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012352507042617407 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012352507042617407 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..60a66156e5003b598858bcedebdaa35fbc6aad24 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012610352663292673 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012610352663292673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..02e8e2c8388769f68beb20c18eb841dc1f04237f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739429 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012506564839739429 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93ece03be2cc79b1b1e7a8d674b320b7a71121f9 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926428 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2858361774744027, + "dataset_path": "ai2_arc", + 
"dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013203196088537364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..56d9242732fc3596d79e25db809c0a220da47fd7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2773037542662116, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013082095839059374 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3054607508532423, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01346008047800249 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92b4f0824db05be1f8cbdd636dcd15accbce6712 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2593856655290102, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012808273573927087 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28924914675767915, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013250012579393443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d63c3dae1eaae76d81e110e9d46eda187691b87a --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042968 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28668941979522183, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01321498632927475 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd7dc2b71c45fa202eed2d5254f3c28108c52dce --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008850055161459239 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008850055161459239 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..24e7ff7c7ac6e07e4abce9313b592144be172385 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2398989898989899, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008762298774190588 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2398989898989899, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008762298774190588 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, 
+ "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3ccafd1a0bb27e6734e7717ea9ced1c80fa32f62 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008809171744720559 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008809171744720559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..731b99c94f58f357964c824bcb2a9a60c1505ff6 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2382154882154882, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008741163824469184 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2382154882154882, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008741163824469184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..80227c3eddb9b4cb8bb5e70321bdbd3a0b021e88 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3501683501683502, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009788295410093158 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3228114478114478, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009593950220366743 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0d64278ea4f555ef3ae8a9a9a5c275649949c8 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3463804713804714, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009763542075695724 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3202861952861953, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00957415266873942 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a50799160c3d664c2ef5788621367d628d9cb265 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.33291245791245794, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00966995897839532 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3114478114478115, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009502311567905534 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f03d21783b6105892dc7f3cc2d7974d3fefb25d4 --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2887205387205387, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009298805565435511 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2828282828282828, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009241472775328231 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6698dcb0e2ddd10747bf0eb1f461bc985e079e94 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.32996632996632996, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00964831157424104 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3148148148148148, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009530150430975607 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8e3f2ddd1333c5ecb1fd066fe16227004e381710 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3282828282828283, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009635749509262161 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.32365319865319864, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009600478182273768 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..08f2cddaeca65f574b4588534b035330fc341467 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.32365319865319864, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009600478182273775 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.32196969696969696, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009587386696300396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ef5fcbf2aa498455ad186df5eb4b5e3dbb7eb83 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2398989898989899, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008762298774190588 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2398989898989899, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008762298774190588 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b374740189fd4081da5b4e81a47d97aa878cf63 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24074074074074073, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008772796145221902 + }, + { + "task_name": "arc_easy", + 
"prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24074074074074073, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008772796145221902 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ad0ef1764c6f6472844ea8fbbd0e78dfd885e28b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008798836444222033 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008798836444222033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..628fc4de16dcd84a918678a5eb56078002ebef8c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23653198653198654, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008719840797175745 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23653198653198654, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008719840797175745 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..12448c8d15982d82e5e27a54bd572e22f3a6db27 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3186026936026936, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00956077550767337 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29335016835016836, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00934250833170855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..59a1455545e46fe711e1932bfdcc938b36e9f7c7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30387205387205385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009437524848293738 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29545454545454547, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009361987126556448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5db478f7d0bec5bab0b68ffd7e9a83130317f875 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3021885521885522, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009422719042483192 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2908249158249158, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009318815921176657 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e629e1cbacde774e6667b16baa2cb956ac139434 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2967171717171717, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00937355949298684 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.281986531986532, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009233124071053636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e271e3e9ddf76c1c549de542afe072ff0e0fb5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5653333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009051951785603835 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6336666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008797928274394058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0a81bbfddeb61a64ec3cd5e0ac1340be020dbd7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5633333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + 
"subset": null, + "acc_stderr": 0.009056690207178125 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.628, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008825982437590614 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e65f2c65951d1b35b6206e82a7af7f046c013c95 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5756666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009025076316539062 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6276666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008827592133099669 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ad668c7dd06eb33920b2b45848e29ebf7d43fa52 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5776666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009019409415904175 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.631, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008811292732995702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..1c9417568c83c83313c327a98e9d9cf4837d2a6c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6056666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008924016166504413 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.37833333333333335, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008855801251873014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd3c7b4c3e748f82bdf147191ec48d1863bd67d7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e881132df5a9cfc43b6657f486e34a69ee34ee --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5953333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008962735560535853 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5943333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008966262991425925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9c41c59b95ced7067f84c8797bee6de8ff420bee --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.612, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.0088982241372984 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6063333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008921375326707089 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64fcba0394ebb999ab9bbdc3627047de7aedb3c7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6223333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00885273830576469 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.566, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009050339010891723 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1602d8883f65e621aa2b592613cf59ffcdabd136 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099483512819305 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a631f126559b8c2b384454f6a8860506f7ae129 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5343333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009108680663441197 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5213333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00912191679884186 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..411cb67b8f8eaf6cbf4c0a4a2427ca7706f06034 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.613, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008894007408882734 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.507, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129336317272385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a72ddc3f0d083697929851d0490dbd3ebbec55a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5486666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 
0.009086879312708495 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.542, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009097962646004983 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bb8da8adb5d96954c7bedc98605d036b0c039986 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.584, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009000463105420334 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.561, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009062029213030572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f8203ccb01a3b41c3ef67505b53618d2a1dbb244 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.589, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00898442578218232 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.574, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009029683183412069 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fa55f3dd455b3a0990a58a8ef9853fb59b30e87 --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.62, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008863380835773167 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0597f7de43398864fc456054038dcc1209135728 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5403333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00910047692710895 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0582160252610ea9e6a0954f069b034c76ab0d90 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5883333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008986619341172333 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5933333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008969751860881003 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..65c97d1d6e3188b5e1c0638228ef585223383b2f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1818181818181818, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..81317c3816ad176507ba4de9625bc5712c0af46e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.3055555555555555, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d81c8f832d28005c3f26e1a103806e941023ff85 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0663363415035954 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.29170113041080786, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7b88bbe45b9e4d578175b095fa5febbd336920f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.32142857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0629736228905634 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.21804611488737838, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..08bf49b683b53baa3b55ed62b9426cefd304ba2e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.19555555555555557, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cabcbb456666a642088f25768c3e4e5f260b824b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + 
"f1": 0.2964646464646465, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7e9f03a1d97600a21ba237df7cbb421c2c70e22b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3471345029239766, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c5cd2b592f163e16fd1f473d4ce8d9874f06b4c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813059 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3018867924528302, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec1a42ae0d67d787b3d5d4d5592f8d29a6b6511a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5357142857142857, + "dataset_path": 
"super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.34629629629629627, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5dacc24cf5150b8cb2d4bfc8384f95854ce31f1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..09db9367fcad00879fe0bfacb5ae75c1278df166 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3227672955974843, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bb1691112d2609992d6f93afc294040b28ce690e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_can-we-infer_3.json @@ -0,0 +1,33 
@@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809218 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2505963590709354, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..914c5cd344b20b7961672ad7a719ac70bae99107 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.25, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.058387420812114225 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.20170940170940174, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5012c098a89a81bcbadb64c8ad4e291ec4c31133 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e4f194316093aab3c65bbbd90101b34b0d5fe07e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.28758169934640526, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..663bbd99f5ce97c8a6db01d32cf25d2547f0d5a3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.25836477987421386, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed7e3c663cf16f475d21901c5bb09ed281b5e3ad --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.37777777777777777, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b296f0361c0eae005043e6536493406014e1a4a5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2964646464646465, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cb02ff8fc132d8acc90f65fecf0ffb3fd804789a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3086702262903636, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a175c2075021f2fab3046f0df831ead517cb7f67 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + 
}, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2988943957300801, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b9d1f5fde6a472190825b9715a16aa98d6d6b32c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c3c7ce533ddf668b1976ac2c1254c978704271dd --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a3ef9f7848cbb76e8cae2d42db1e9356a98c6ff6 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "copa", + "prompt_name": "best_option", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24dea6f266a57d161cdd50112e4228294f99d49e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e4de5a1b9779c8fcc243981f97ab61a5f319de94 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..38a58413fa025f113ceb2bbb5f3e0ef43d5e0ffc --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ecd349d2a99bd970da545a958b41a8171a011e98 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d10bc6bba936974cc392f8af4f97b8e904698b9a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..27bb9af991814af2d89136c926c95236bef30790 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57a15d0e041695bc12689a2a4ceb167eb344140e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c872db61d6ca813cba2a1aba09aaefa40887bf91 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..587d4943211b9fe5ac310248a1d08a4c064168bc --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e104ebc37a2a2b85e516d6be7e9b4cdecef84390 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c28686ee75e6d4962a4904752501f796eb5ef50c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.46, + "dataset_path": "super_glue", + 
"dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..af297bc5dfff87dd17b0c61f75bb6c3b57547f30 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..255ac99a86ca2e9623dd6078110af1cf7750169f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4f964af572659b48f5ab4fac4242c3ebec8a34ab --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + 
"dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c11d2539d286782f50a773c1e37d6c48a5c29aaf --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d739303a12716fcec1366ef7af092cddb402871d --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..3877812aa93e9831bc5426c719028d0a2c0fec3d --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..557c58401e44eca98f5781389662d8d3ba9ced7e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..19be96517f68e8536927a130d05ec208e7cea793 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": 
"", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..da7f274fccb232ced3d3a4cbc794a349bdcdd96f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2575c736d9ea078f0d44407378cf0a214841ca0b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1050080e9796bbe78e28983575ed31d20a7a6f56 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": 
null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..441ffe8f84615deccf7cb35062e1982ea92be5a5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 0.024902886709539385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.001859827390010786 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.030688350761161772, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0010111628349295336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.03522159436879318, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0007432341163294273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.03002411449589292, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0005566731529837383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.00213834226369479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0007515435096961347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.0011739194410139697, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00018856260255220028 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.000820851337990567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00014510778108128557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.02989418896176067, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0009996326577278941 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.03396980123152662, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0006930467792590991 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.02908307755018932, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0005238680979850407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.030194649653587414, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0010048849542946053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.03435223902229173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0007062613786269745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.02941389349627349, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005364047320293597 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6399f5ddcc452b11300ef0307fc53ed5a3580c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 0.10526165569419166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.014436035997175528 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.07911450069828341, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029097835615642907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.06738993210750575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0012035687077170335 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.05790598949403749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0010250743934079594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.01429960185912336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0015929231677413393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.008672646269225282, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00042018139591029456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.007254357472661951, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00035056222442385245 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.07079248030581328, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0026369733712376764 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"coherent_text", + "rougeL_recall": 0.06044596402655965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0010577536507608413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.05171508838489101, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0008789909961010179 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.07162010937536421, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0026623851377461905 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.06136184462960388, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0010727989981957834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.052508873461168226, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008937737880985334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..54a7e8419cf8b92190a0bc118869fbe149a852d5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 0.30539641277266166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.011391767070946923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.07633505189299611, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0009839415501958642 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.1364351378477736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0013697596740544205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.09437252246992375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0009227636769121052 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.012720631413562505, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00034491119917086203 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.023922584715591402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 
0.0006622677812103944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.016087743931264227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00042298521406965243 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.06444618171708413, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0008733244510681126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.11522495932109246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0011523473615329262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.0795119996992231, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007622335976376695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.06758563678521023, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0009001740406512067 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.12020034868975105, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.001178439476775103 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.08329416606334707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007986225124758468 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..35f6ddf46973d8f5838be2c3a12c64cb525f7872 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 0.8289729029926227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.03819124682808821 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.0920126739306129, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001180610214294338 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.18394597416501404, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0019089777737464745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.12027784856141198, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001398336070917741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.010230404488421895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0005619899673032154 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.019302317278520317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0010317275213485766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.013068517538036242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0007056166127861284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.074950558069943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0009851928299547022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.15108041046374748, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0016839786264005012 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.09820775827857402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001187279171052657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.08168257075261778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.000990831090160885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.16469012605064637, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0016151196905014934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.10706801145048024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011738963266738863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8a44b9aade63b38406b5cee0beb8c4451c1e04bb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 5.871602035182637, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 
0.0689265094519103 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.315381910546664, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016991270157383524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5677658862678204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002537463753002365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.39687295629000013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001731150333313827 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13054327509920194, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011487605105947785 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.24249304875101457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021455306919602565 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.16573259273166063, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013731741464944188 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.21023197351207698, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011813742144540786 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3852625280422845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023003532851786253 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26601847609457513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013291072676722162 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.26030259129820893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015643235606867766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4687183064058039, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00245885193806218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.32751173027045133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016658340639491723 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline 
at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f45dee793e3fb824ae5be974e40f557a59dc574f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.536088149934491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06368060447210919 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.3132683994986297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016779315277452248 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5703289435095071, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025564238054670923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.39605202887518287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017242982181801837 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.13683093815324895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011747506318268393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.25773200513287264, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022557263550937465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.1746381612318158, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001413301666897888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.2150826473861559, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012099040939194908 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.39825037303644484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023757642907636734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.2733679239070397, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001374220180286804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2618159333058282, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015495574948136697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.47735870931030694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025148308302801285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.331089401046371, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016665130097241836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..376760ebd69920b4da5cb08329e835e8923fc322 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.012666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002042089808097441 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.0005112039433192838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 8.466574166758451e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.000980746025187589, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00016211133855868694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.012666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002042089808097441 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.0005112039433192838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 8.466574166758451e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.000980746025187589, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00016211133855868694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.012666666666666666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002042089808097441 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.0005112039433192838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 8.466574166758451e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.000980746025187589, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00016211133855868694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e827c3f0dcf3d5bbc1676d020931e135779d07ed --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.70530637243207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15229080097327555 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.1372020348331753, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00462234189751826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.1273232990519524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00418720214271013 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.10350002350150173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0032360935018220568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.03517252247207065, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013354896708446466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.05185401724384904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001963532037641408 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rouge2_fmeasure": 0.04015090640873629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014719873423558562 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.10800866370039118, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004210703866730323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.08625864193958034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00288629903926175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.07039375693186166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021877703117276575 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.12236274969411427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004403741172008285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.10565592747607992, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003502725730071462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.08642605279408319, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027110760348653747 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..11bdd0131576ccb94bb2688a116e660583f7ea0f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 5.431024960602319, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2000472576829238 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.1861296338649477, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0043744882526644034 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.22029441022547872, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004928253663307522 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.17737847294451, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003820008304012659 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.06437721898797501, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016094196221665435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0955303581448959, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024590646449618413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.07440738035625322, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018424656344762078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.13782618332035232, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0037520766631794914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.1523944911740286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0034953930296791947 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.12250849478802743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002662162350752231 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.1600822254903216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0040372964538571325 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.18302982963140504, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0041611517396543864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.14761442818242548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032202549154167377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7c531389f205c1f6641bc71b614a558115ef68aa --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.14846670480161736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.013127750204704683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.26299103271373675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006382723076774683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.05264625320052111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0014565379578075452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.07826511485489751, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019164435017392188 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1576462193644736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005855563265723578 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.01645135749156905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0007166454954847944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.026122584651124606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00101915256853543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2524687494414541, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006192701638587216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.04953446876968493, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0012966364411702697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.0738166454356822, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017248337500189589 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.25330917167188566, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006207826569406349 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.04925315584474901, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0012892531673512504 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.07392255592729936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00174327269105063 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ca906267e4fa2d607b9e0015263cc5fa7b645208 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.167405083071978, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09131785485896106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4027750555180633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0022654704081963494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46782170590516187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029091530201079247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4160674099572731, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020114833441626043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.16815397557349618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016540332686745738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19988678530914838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021644647347172754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17514483800816646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016790153644324725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.27883038558148265, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001734453659792866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3267350269933985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023849450074570734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28884448468062934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016305979819716226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 
0.33427994949554696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00212000114598495 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3882701548838279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026955155305927177 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3452818942622058, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019526544280227683 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fc8de743ca42830f3bfbf3879d931cfdca3d193 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.796301153911008, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.146708341661776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.41448086950190893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002308739240960833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47403110687392724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002885238583477618 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4253927318807347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020174640710145207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.18273899527170673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001736753975334171 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21447823413610112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022428818903169902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.18933491861931817, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001739001650440241 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2947058978443582, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018691331443875447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3390095486229893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024297003361930447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3029238715575528, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001712217734742568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.34519699878769394, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0021888771482807224 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3948779703096386, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002705400716089457 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.35424136247162186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019838710065518987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a25b0fe03cc2d49656eaf27ed0b49d6df2468114 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 1.5394066408010387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0484052485543162 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.08175423335349441, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018392810471837139 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.20081999461253017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00439749257662734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.1145750851342104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002534080030689994 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.03295322832313943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0007898704120356384 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 
0.08455406384327209, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020235178180524603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.046679692708971626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011061130467268918 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.05815283722861802, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001320004858940561 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.1445691822798896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0032622309703185166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.08171290144166414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001829868972362332 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.06823005737355141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001559301582529244 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.16771851909582491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003735794333743973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.09564293647544783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002149947772773811 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..398674eb76a8a1069550b86f5a8b57362059fe49 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 7.053373401330846, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07996544807817436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.40161297429569254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020446945435097607 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.624047917208595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0023085024322038396 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4772245380978214, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017798625812432496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.18443805368794422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001382599111890123 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.294010691187684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021124144674316923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.22068920317525503, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001470678209160857 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.28104449885694216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014226979752300062 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.4460789661395535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002361021804488117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.33641422719087777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014270703519470888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.33024724391235083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018849037688533563 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.5138393997335251, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00235509008538393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3925320856681083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017735001162616428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..21b45325ba62ce2a98b8108d2d4ff8cf5536709b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 7.164537116124885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.054507166163951015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.38094779000061807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 
0.002099999503143787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.602346894502485, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025758615927279917 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.45619650620795643, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019676479058125347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.17573474931980426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014244066880121941 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.28548391121662253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022557879734491403 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.21206289529597055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015695849144458885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2696292732553264, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014925011735967284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.43415653380870267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00242922591565751 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3249494154166397, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015536724178672875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.3165678865577877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019256981310923358 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.5013663660112442, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002552156661977458 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.37931197366758207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019111586108465242 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64f501a5b42c7bb4304b7fc9c2ef8fdebabc4f11 --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.09364826859257304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016148133314340363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.23630822110313882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038835580096539166 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.13252809822797437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0022104399017904954 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01344629279224066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006749467817958984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.035406016690730915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001796407213787606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.01926089676236191, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009630205657333692 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07694102339755571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011850177068464373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.19522320409536523, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00290859788654431 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.10902400441153387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016213703660804158 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.07494465354319416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013254739678396428 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.1907022378721153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003277425301621285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1063051390341436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018302500166387287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7459765558871847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06660040271380883 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa422e8720e638f0226fb105aa5dd37f70457dd1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.1100394961313742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016618822859456213 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2706920375533275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003706425918534696 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15421893513473225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002184640762782 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.012049819428811073, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006058660485736242 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.029943656724121994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001488692683861016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.016984977462351773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008440833023618337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07738480691068281, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001120493205359145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.19125268196473866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0024475701699631387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.10845297039939822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0014181692997681172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08829821326051698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001339504713458616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.21836071248948102, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003018448833279907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12383067773003087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017407939439714815 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.567617586363226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03557588297526116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0976922c634a8a2a2cdb1fb7cb2fa61e111529e1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.12266620130499661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001845349320673143 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.30342243290136806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004372631345252368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.17259522097991983, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025062111927192676 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.023355918902647873, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009196790050941139 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.06023586351705483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00245666795200196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.03324496960884671, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001307280464130723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.09357054736674877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013572221631896278 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.23269687986034876, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.003341303513413307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.13181341351110437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001855127448763809 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.09728766869371021, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014913277984300816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.24235701247120034, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003705415152960698 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.13711574191352463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002049753398864549 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.249916166884964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07484579982708357 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..53ddad5ce6163de878f1f8299c825a958d8d56af --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.16111094810574414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0031216354350091023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.31873549566951037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004234939673038656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20228388865404873, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029050667038010834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.04233924505198599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019046448700156524 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0812403134968317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00273838697975092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05167374363553887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018551837511418816 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_tldr", + "rougeL_precision": 0.1289972501536601, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00256911680281591 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.25811719223888474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034826602134800538 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.16252418590968234, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023298980440963524 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.12625677747003347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00264701935811548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2516551815287747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037032430110956454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15869131179009913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024607270521408895 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.9335794206127328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05641229737061542 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5a942e3cd96dbc5c610c10a4e4b2d241586e2186 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.1391935772840409, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00185790613309646 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.34411662043430347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004457125721936953 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19574368508124906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025064123942186048 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.034069340066578226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010923839167265616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0883466846688517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029190316207930176 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04850165996745252, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015420780039749062 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11239213670673616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001467031544921221 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.27953512474588155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003691970095518396 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.158271842003029, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001997766812063051 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.10893210361976352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015446349102827366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2719037060557822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003926408821594589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15356091204478078, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002118965655419107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.8666040036646128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06772362697405075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e0c631182eecad54cc5f2564941e317316518952 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14553793099391538, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018785878017142804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3543654560324437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004391158800565359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.203563620733602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002510639667265985 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.038371943502079627, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011745762777744093 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.09841236909288589, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.003073775666446705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0545069734074903, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016532341451475344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11617466261457347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014951292876035514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.28481399699460613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036998611942237183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1627195580247563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002021374913395388 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11452597251830655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016212346273290488 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.28106149008733095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0039736918211797 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16046012151104688, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002197436651210038 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.12135549546371, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06496091187932158 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6ea8df207b0e9061d1f1cf2a8ca1d0097d934036 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.15936520803829934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002502914065388867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.36698072640704477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004683622882613499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2162684407733824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_fmeasure_stderr": 0.0028487305378065864 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.044026641927011036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016092489576812487 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.10240085620355213, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0032524404949168598 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.059415288606804666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019237063507349787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1250497631451513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002063727102434744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2883013244617606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003758676199438092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16944484885441244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022672242410205668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.12672592447230924, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021949367251175547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2924636147885988, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004172229113513385 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17181025266219813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024925143856936887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.4667492251754903, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10381088859974377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0297916fc656899dbbd8a1603ca66c00ca041f32 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13872518272148576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019196010106841707 + 
}, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.34230252867072486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004465857111154106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19511361094013366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025883144750141985 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03163153944854582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010782973297168107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08173836317054556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028348136277117873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04506092989376365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015273324009655352 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10692232891845452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001417359741210213 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26668023859416595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035247154238071715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15075736335997478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019327841457787809 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1091361614782161, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015647258593366256 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.27137465028647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003816794352052926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15379184945424418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021360349555465917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.758172461446717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07910003790624576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..67cedb3e95938de64510b22d1af64ab6ac49cfb9 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13980890149209954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018977428372584247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3402939621206596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00439029308436291 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19587821805854708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025521664288773132 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03376773335297692, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011277676246399322 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0855882084980882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002933007569998225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04784619134007039, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015903598069214155 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1126666102037684, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014774766678267424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2759357644277514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035546429323666893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15806290558333977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001998316404733819 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10915831064990977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015695216746497264 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2680641977382496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003807444122985714 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15327775753143216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002141837328174842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.929152474522292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06152527877922885 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4eb65511e0468110b042bd8c7487d6e8a60b19da --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1546986054170722, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0027155969874516615 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.34385746004492035, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0045343512683236305 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20512716732365058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002811760618953105 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.04048674160408378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018097291671313742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08889402148710406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030181836014503615 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05233186334644149, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018371483058294063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1195641111418401, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022640684013532537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.26542116077776273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036521538304345952 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15803771397406507, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002246566183671595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.12369530288790914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023542427491872885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.27570867031120994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003999061615231806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.1638620451208335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", 
+ "rougeLsum_fmeasure_stderr": 0.0024263948798584254 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.0700685384125195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06506482367590657 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e3d874ab76094afe2ff4524e0b259769f74ad97 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1497360041541259, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019583143857350815 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.36642339054764944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004583540722534914 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.2101464481816641, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026373715286000547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03473453491991313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011313039238557406 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.08859197419170174, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002966411473841889 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.049316100613614085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016010378254702274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11131301420898317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014579758214458545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2744297041783303, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035891934973145126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15649367452361052, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001985780262960137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11850262725207555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001642034902769166 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 
0.2925517605104228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004056463253514793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.1667089554989451, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022501688088607166 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9587846581999537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09170521457372413 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f773d33a126da7e17dd249c46f72807a42122049 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.154790186630409, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018694567281612956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.374668382001663, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004418688017830266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.21636269162646565, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025004031143732804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03883770533881618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001182146819894263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.09827865909009756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030936550153937525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05494913652343897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001667424386002709 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11817351796015063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001428773921986939 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2879474437688573, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035604161567386007 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1654386218767027, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001935585571099022 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.12201516306191562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015836560252758835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.29808537515150224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004003333302645505 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17098539953460037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002170984636765844 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.2345054070007717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05615802080529323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f63f7f78d36aed0f90b4712246267b261adc8c0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.16292807064215686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024194702503072605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.37125899666294615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004494682114366064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.22006038313971912, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027315907690086526 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.04460384667729313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001568216125121224 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.10384107175312106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0031301013911463464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0602200602063056, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018748349329553756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.12676536773949365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0020350190574563182 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2897271196200456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037239805309158074 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.17107849234270603, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002234222280629345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.12930934118664014, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002140548472143606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2966631408145694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004112653472647035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.17475995659766042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002424194865649916 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.4860410253783094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13032698389659206 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..701273422b3ab9d5fb8d485816a78db6c1336a05 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1377737697205421, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001918577597610342 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.33444844365458937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0044702777649980804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.19281203992284088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025879159231049723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.030357367199789848, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010605636257399766 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07732093515185194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_recall_stderr": 0.002809130137054114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04304657136147545, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015026722425806724 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10751173681134886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014476076962821913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.26339792038733567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035594110181837476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15078520597524717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019701846993564385 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10716847669626124, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015343163798387418 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.26280225666402524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003792042588585284 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.15041271573959936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002105741194516134 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.709372091512763, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07003089206006699 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7bdf5e06896d55922ea565bb94991c76d7dc975c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.13594404907364932, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018873868032456109 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3217165994125383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004330682877525108 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.188822793597779, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025303316596448017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.032957187238861776, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011262880908098065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08138888326812603, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002864272559833109 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.046347749632318976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015815570739294248 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.11204662329061932, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0015323186320801464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2665767372342556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003639126727666291 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1558262462818579, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020680137397783953 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.10385634177850699, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015435899459486468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.24825254653449977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003728036269680926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14465796916720383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00210712558351804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.821098322958732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06452240599200931 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f53f648e2f0b19c075e9755fb398011dbc7db89c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 35.61734472522983, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.9537810245026835 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.464684845944155, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005858144057381323 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7271378863199152, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006504954123944544 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.543575916103388, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006104898483973868 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.3702105052685584, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.006265422305938615 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5873452845007291, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007693229199172635 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.4356172150035944, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00674569689069812 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.45328759213308434, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005906582270824208 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7107375872624206, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006690670049916566 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.5306285880503983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0061926362174269355 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.4577703365666644, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005904827825133563 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7163396172728412, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0066419818950900775 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.5354876397976078, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006176130553320618 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..ee9885a496bec234e6b68d69faa9b4296b2a8c81 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 14.876946234243762, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.6789391344335735 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.32711934744770815, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006006032300444084 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6971853152299208, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006708558994798512 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.39613005227745696, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00619071042023707 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.25040069744375015, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005867687424247696 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5445186159492055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008140522499998817 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.30633527666094623, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00629102968697341 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.3179138525456248, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006008468626549582 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6816707364901432, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006948228129049167 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.3858027770673977, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006240254684879286 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.32067669493513273, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006020699147284835 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.683075266312059, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006908323840034804 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.388545252477902, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0062447020220877105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3ca0896d0c27062ffe9e2ccd7529eda73b09b34 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6e0bf6e409447ae88f849f80ba52f07cfb066e04 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..789c16cde3432758b5af02e9567996d65625ae4c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5103373231773667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"acc_stderr": 0.011663330673075898 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5103373231773667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663330673075898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c982f2ddf73d2163f90a517915b1e980cea71e8a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5195865070729053, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011656869979288458 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5195865070729053, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011656869979288458 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b14c2a6f48213dbebfbaa3cab291bf09198cc4e0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.14382150833937613, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.010529063400117741 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018356416472001407, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005521354880273494 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19546239190947554, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038247859111534004 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03157680503247346, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008291906514774739 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0025044688702336895, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00013773434188164336 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03017313707041212, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016830101887106172 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004379612267179601, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00023123994933486116 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01690631599627158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.000459560056479906 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.18505711146460074, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003610768847019555 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029248358492157706, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007131377793194621 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.014654036427626792, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0004353374039170972 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16485045526174213, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0033835169417924032 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025299871642617325, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006568016014138125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3d55c0f237ae114a9022cc1c426bf951a1d9ed --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1648609749888846, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.01622044569980307 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01866628841872322, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007473574991847542 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19449333311979644, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003899061501410445 + }, + { + "task_name": "piqa", + "prompt_name": "no 
prompt needed", + "rouge1_fmeasure": 0.030873993968869175, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008327468572271476 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0029967448137577595, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00026220893484564474 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03260572456531526, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017463997230649506 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.004837227558655057, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00028908858497832816 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01780151075704589, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0007098586143396394 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.18745438746572335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037410036769154925 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.029446893043948482, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007622138034132659 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015331028080827962, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.000691747836518428 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16437664916642197, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003422791511095359 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025145545929009763, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007026811345719036 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2b1468a098db6ceb08ea6dc23b16dd25c226c583 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49455930359085964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665133500637059 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49455930359085964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, 
+ "acc_norm_stderr": 0.011665133500637059 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7725bf28d8f0dbe56618a67354e5b467eb7d0e6a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4929270946681175, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664656918145945 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4929270946681175, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664656918145945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5a5d2cc5afd1ad5728d101661531336b71909c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49836779107725787, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665762007194876 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49836779107725787, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665762007194876 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8aded0a50bd9d46b8a058e66d0a3a20d81f465a1 --- /dev/null +++ 
b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4923830250272035, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664470424044986 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4923830250272035, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664470424044986 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1da68755a3f40e84e1e92581aa4381e48509b14f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5685527747551686, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011555657298864612 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5527747551686616, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011600659443292933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b63d78dadaa5c0d34cb366688c7f3c77ae94a504 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5767138193688792, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01152769947361448 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5696409140369967, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011552114834700509 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0db85e4faa8fad4c0f4afd2097a9072e9b08d444 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5642002176278563, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011569259195486613 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5538628944504896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011597936590301233 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13069dd497f971a177cb7e174b629c661cd23b20 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5505984766050055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011605936624156083 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5484221980413493, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011610989358814284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3094cc586aaf8b8caf6d8999e35fe229675230ed --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.471, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015792669451628896 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question 
(Closed Book)", + "acc_norm": 0.452, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015746235865880677 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8f16c402d30dc7355b54f864a87d84b9379a961a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.4, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015499685165842592 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.38, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015356947477797575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c692e5f2475bcbb86b66fc8d611269348530f381 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.364, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015222868840522019 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.355, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015139491543780532 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..f925084c802d3b171a1ada7c85c55524417ed2b2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.334, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01492201952373296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5407bf8aed9dba96f35897c70a66a0b8cd565e98 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.808, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.012461592646659969 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.743, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013825416526895055 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..96f90ca82fd286f5e06438df231f0b0ad7210e45 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.877, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01039129342184988 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.841, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0115694793682713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fc92aaf3ef8e2225042cd06025c4bd3c3c8e694b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.918, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008680515615523722 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.903, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009363689373248094 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a78e14b9fe9d96294d472abf274cf8db62748be --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.924, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00838416926679638 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008916866630745887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..39db62b8784cbdcd38ce8eb104868be79f95727a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.399, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015493193313162906 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.378, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015341165254026647 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6dbe6dd4ee21cc2a18db6a1b6a0e6975b716aa7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.34, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014987482264363933 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.353, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015120172605483694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c0f9413eb15eef89310c311a5d8cefe87c668156 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.341, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01499813134840271 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.332, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014899597242811487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a226ba5109e0b2991192496098037dae27e6a03c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.33, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01487687202745673 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.327, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014842213153411247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8755263374e3ef61dfbd4423daada8669442d1f1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.49, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01581613575277321 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.47, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015790799515836763 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..07d8be51bf328fcd2c53caed4a1376efbd808ba7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.412, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015572363292015093 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.401, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015506109745498329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at 
end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d34e72d99832d0fc75546bf8ad54e39774ed8e98 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.425, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015640320317040105 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.416, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015594460144140605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..621c412086899f553ba0b54d77efc7807ca3018f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.436, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015689173023144067 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.443, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015716169953204105 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2a954be0d73c00c0e34bae3678a9585d39dc0482 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.597, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01551875741906653 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.515, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015812179641814902 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd683f94e5c839c8dfc4c001411c05988ffda74c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.461, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015771104201283186 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.411, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015566673418599276 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8870a854d44e93e056b8a4901fbe4996b49364bb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.427, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015649789644462217 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.42, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015615500115072957 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..033a12eb22a5c84d06d93f17e16f44e30a260542 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.485, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0158121796418149 + }, + { + "task_name": 
"sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.464, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015778243024904586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6c5e3325d8dae7a99c3dd4fac246ab7418fad80c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.49812934259754144, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562351329083268 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5077498663816141, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561043278863545 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4495dbf7ce15399f59af7b5307e5720183e4e47f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011534056494505862 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.49545697487974344, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561954965856516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..207183b256fb60aed76b02e7af848bb466c86957 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.45430251202565475, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011514040245583501 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539022035111228 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0c1368781e778cad6d57aec53d1d0c41616f6866 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01152471548624066 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4655264564404062, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534917341355139 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed6dce252027e39ebd54ea197f8f5b568c0b1079 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.5280598610368786, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011544210396951669 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5312667022982362, + "dataset_path": 
"story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539803085637727 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..89a8762d476429b98294b64a525bf6af6e1fa798 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.5066809192944949, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561400034509398 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5141635489043292, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557792331301676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..31b6d2f8a27828151dd47e0b82b27e8d18f33253 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47033671833244256, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011542066509767008 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4853019775521112, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557435464292914 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.json 
b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7a71d7334f8d5343948f5afe2bf85e744b9673ba --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4569748797434527, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011519544865928062 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.467129877071085, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011537420054210297 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0fa5e4f2b532a93df31671599964fe25e6b94660 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..965a7e27b7d112cb49bdc88cf80340c23304b5cc --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f428b3c9eca78ec17b7e9bfb13d663cadc2c4fd7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", 
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4645be795e7f5e03cf3aee8ad2787c1dbf4ed6e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7a3088cffbbe74bd74ea099566d522b02ae23d0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4906467129877071, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011560409019420367 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5056119722073757, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156170392878433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a0e488a13f53a0779f623de77d311c9761bdbd --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011551049647290312 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4863709246392304, + "dataset_path": 
"story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011558135970599896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5cdbf21b95841e1ca83fd25cee8c87bca3fe41ea --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.44414751469802244, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011490067784518679 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011528611805439893 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70001ee9885097bb0c54dd5c45d0417cc654a16e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4489577765900588, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011502027057558888 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.json 
b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ff2dc784c1ecc3975c3dd5eb31a85a71e5531055 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5008017103153394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562417388300206 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5195082843399251, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011553628196999318 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e1b3ab6311a22e415c529f3d254989b87d83eb2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4756814537680385, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011548748301487317 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5002672367717798, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562430600098487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..17c22bdc30f023524ddb84c3f5b9b0e1e2cce249 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153229486915312 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and 
Options", + "acc_norm": 0.4633885622661678, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011531394084549621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dee30e041cf0ca59863990b53487ab799fee5880 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.45323356493853556, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011511744771088354 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.45163014430785675, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011508201145928354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8b2c31cb819ff294f34be4f34413bfbcaaf2055e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4368231046931408, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02985524739031494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_1.json 
b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bef6f4734d2e9fe1b74540cf0e942e6ec66367e3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.4657039711191336, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d50b7b91c0f6f6a61e87edaefd8e54739bec94 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366422 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3db82ef9755c72987ad7bd448de6fe0735b39876 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eb44673115727089f3d07b149d26db5fa0924804 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a7f53c48f4d52ba3d4f1621e55d04de652189368 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.44765342960288806, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c4052a21e9cacf5d08c15a3a43209360e6403cbb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e21ad6a521cb9b88d7cb9672dc6ab0e2572c836c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4515d4d8921a8d668be7e4c243ca0a9c7e291401 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..00de915f2fe054f9659e9829fc0f98f7366ad5ac --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5ceda0901e02cf486a1d9e04c77ea31aa890e1d4 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9b93dd76889010cb55144cd1d8ddb922cfe9b9f3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.49097472924187724, + 
"dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c6e72d5746e9361854cf4ac93571abe5a25b73ce --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.0300523034631437 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7ddcb94a15907a9a3fd6224ae23aee28d2a556ce --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_4.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..e96756255074694a50abaf1170f5294d4edbdc68 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.44765342960288806, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.0300727231673172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9a27e920f12a49ab2b4913a2366be4bdfd721e24 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1b61352499255b9ee41b8f507a511bf7c882d783 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..33f4ee8182e9a90f179b7188e371b3673a0a20d0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3a51e2edf859af0e23d41dc03a37ab9729e99d67 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029953149241808943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d2d1448fafcd40eca9634120c1ba9b18c1589a85 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"superglue_rte", + "prompt_name": "should assume", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b6bc85aeacbe401882a252b51eaccc0428f9227c --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..59fbf37165a6bbdfd9234eb86a3310f4bcb83536 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 
10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9b40164b4ac24fd230cd4d8f7fe3eb613d12b9 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02993107036293953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93d40a8401c04c91f33f7805a8a0d69742a0e014 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915853 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076906 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ffdb4bd5bc64b8ed33c720f64477b9ea58d6a7f6 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616453 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e473cfc4bdd5c0cfdfa151a9a7ad419e49d5d276 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616448 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4758bb7eb127d11bc6eed9bd4e6fcc98e5ebc9f1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529015 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5356b76b1ed58b5a9c06fdb77ed9134a62348030 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 
0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0fc66bae4ce10923747ef5347fed1b0f79709f87 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225632 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a74258fc9342dd5ef677f42bbb53d6242aa1e227 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225636 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367592 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9eb0e3c2748492470f16e3946bc1daf6f8505d5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076892 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..beed5985183e00322e7a1141ea19f4e9027247b3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.48382004735595896, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140451261309786 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.47908445146014206, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404018549421295 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a4ed2e38a884e054a362fcd332b09c3ec165126 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.48382004735595896, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140451261309786 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48224151539068666, + "dataset_path": "winogrande", + 
"dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014043619596174959 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bce24274f5acb7601f766e1e67fd9f56accecd96 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4877663772691397, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405616 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f635fa6c4717d26050264c4da65332989359149e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440415 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405174596179052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..b3aa77623a64c8973270bb15f07fe4a154ec77b2 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0364c68dff222c7341a03154c36584daa22476d5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228577 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fa50d4d970eb04cb56fff97bccb6f683002e4f4f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405621 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076892 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..551a3ae730cf21ea5914fae7554f68fa70945cec --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048804199859325 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_0.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e34522aa817568273c78698b2eac0b19493a1e90 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405150083848581 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_1.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..187c8f0a195080e92e7aed1935cec881855eb7b8 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4696132596685083, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014026510839428743 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.47277032359905286, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014031631629827701 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_2.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d0000da7e5bd739f6f22fcf696023946bdf0aa1d --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405195606407689 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140519560640769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_3.json b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2ab7016aba634fc9341d6eb48e3933f189be9f0e --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049516 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047122916440422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed2257ab2c3c8315b527a18c9519930084a5dcef --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b2474bb5405a2efa4ebf1daedfa43812b820ab023d61cd04dbc11c7af4cc573 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6812d7b008a9863336c06880a26038f1013c0f47 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403c636f22bededa86442f6f374dbdeb2fdfa3b5efb5e2aa58d0f2991301d4e6 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..806c734e996c8249fd4428390a8959bba3d1ad3a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62714cc9f95a3f50d71daea35b2b9987e67466942a21cd90bb044d09d9000e5c +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95dcfd774c8b87d4834a53c3197b307b31758415 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7739f33f164e1359731b4b9cefcfea29d739ea44938100b6e2b299e388d8abf5 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55bf9e4aaf26840a054c104b2f4a7b9176f06cef --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50eab7cfc795d1c4b154823c3d380d0a91b73da2bb9e1d39d5c1e01bcbe2ce97 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..050144505fe3adc2b59d8abf6266403e40fff3c7 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90eff73dde304b113d65a470ea8774544f05a8836e988145dec055f3c03f3c1 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..67d098422edfa4798de2f4c5d97cbe12578a8484 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ee301887d6c25563a6a8e0db22675882400dc4b1b4ca161765cc163ddb1453 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..169279bf2d62a94c0f2ffd349fd457ec87ced656 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:463e0cedfb2b1a4432ffe42103eb1b79a2259881c069d5b3579dd49f336b9935 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1012e8ab2f59569127619b8dc4bef966b253816 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93b11b08f921391ad90d5e57468f4520005c03a7d0c08fbac9b73208fc2d0c0 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e71263ff917ea707629e1d24ef100bb2b26e1475 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821c58cd04bfc4eaf0f358cc42d31104a8cff84c21d5b97716d8d44f1040bbf4 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fd39b4bc417ea2dba85e9397ef7d76cb25d7306 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0527c95f90268e44ac20c4067eec5010eec5a4305e89dc854e41269bf5b6a474 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b814bd5c188491c468e10a8a6cf6d7a9325018e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90946cc3a6b95824fb55fdb0184766097b2d7ac9b53f1cec4c7a70a82305cf05 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..767d20efa28ecd8ba1a4b7bca1c9a1e03bc50c16 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6317ec875add42ddc50a835ebe0d5c28a56435d41b02fe841fc45a9a8cddcf2e +size 199058733 diff --git 
a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0003ec3ca648b7fd8916914c1305c5963ff4a283 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb33044478e43810e90d7974f96ebf44174131e38dfad2a57d8aea37ead9a66e +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e794be4142fedb344fe71ef449471a8911b57191 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ff8bf323c5cdc08f5dcc6dda0be3f2f69d25580278bb14cd58463d18675274 +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd9d202f23eb4bae78bcd77e7530d9c492cc746b --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f9ee20148a50b8b3b6cacc9dd5732468276941d27ba274e2109501fa6c93263 +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a10db7daacef27730ffd68ee174fa4f82436b3c0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2724e1d1c61a7cfae247350a4d11a33583b40066f9bdea59c11d188d827c4534 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..088409e21ed65a7ce22f1725a6739a68316f1e8b --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed4be2d4bcaf75a5d275587af29a4ad18ee0915c57f69bd4843b75056da98c7 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f99c8952bd078e6883edf458affbb923e20053e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c88c72972ae7f708b4d3c3885b92d5574ad87a3478ee83afe663eb18e9d0559 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f059c629768f5c5399c9028d185758deef26a310 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16fb8d40daa2f20971880f441dcbf151536cae784bf7276cf6a9dbf55bf220df +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a02c1742902aa8a103814c1ac9899d48bde6090e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d208bf5d2c986b0d60ef78cdc1baf4a6f769cfe92084fc4028c89d2567cb90 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be7bfdb71cf54b022654581fb95a58ddb04c8b7a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf0c37a7483e84abc34b8175bcfdcccdfe5abce9dba496793a22f0ad8cad405 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ccf7a32408b463000225400e8cee8cbd01bc249 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fab1eec41aadda810da7955052583f5dd2b5ea81fd50894f8429cf576724bb5a +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8df14bf18fce59d538405765cb9ec1a46416724c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:453401b39723c0f9793802962242178c981b273c0ad384cc4857b56663899746 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fd5f0017028cff981131ed76ddd6b950adc7232 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11e163585379cce7d97f83f788948d55a875e4d74a06f9fe95b81b9bf6b86492 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca28d1622508efd41e73d555e472f2a6fec0f117 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b7ce664eb5b80583926b5962b8e857061df9523ba51b2df9494e9f06fed2de +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e67f5a343f597b828b08d0e80ce87c08dc9d1132 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30bdf462e17e45d0f259a639b50da593cc1c6876d6ab8bd064d870a529877220 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a91df392d7a06c3617e74e2c739f9b4174b25aa0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddc346c966cd9fcf70064a3d22bb747f71ee82a46c31cb3ff72bf5dd02d457a4 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ec61311324b828497812f74745febcd6eaab965 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543a9b68f80184ec9ea6c55f847eb5333ab7c827da81909e3d5c29398f4e349d +size 199058605 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0c17b9a1c0e42b7091d1e77c8726ca57a4020d1 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bcfbf2fe0f277bdbef93cb25a8ea27a4c77c1e6c6d49138c673d4a5faa45355 +size 199058605 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2f0050c320048315ace9f25438659bab0621dab --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a595eb90c85f41a7b929858eb6c2ad942e8a28c2dcbee7c4c509f46e9fe9650 +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d384e9ab03de317b623f919775e525fdcae27d89 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4745553fcdfb2cedf243c67e9a68c46dcec03c46dcc26fdfd5a8b32535907bd8 +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748964bfea9865d78e79481f7066b60bf39c8439 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:464a73b15fdbd7312f0c9b3bf6848279db53351a1f2a62940a927c297668c40e +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ede73017df87d8452025e583c3b4578b1c23008a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c5c19eb0ffcf11f8dcbc2851763ccd70f9d01c22aabfec1ce70cfa15a7a09bd +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25bdff5f4439fe636d99496c10349e7c8bbc3445 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53492dd21c20914b74ed6b2f73bcaed0544c30aae3af029258206cbe89f0b98b +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44227914e883c87e27c0cb825ee8423296cd64b0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7cf4ae8dffff9f822b4695408472bb50b4367b80286b8d4b93f9faaad77eb92 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a16c4a2d090c38927a20213451e8b3f7ddbb08f2 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e88e1b59d748f59b56b23936746aa72c97e2dfc26c64385a846ab483887a6f +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fead18c148432a0dd1ff897c92e812a011926288 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b4a3cf869c7eeb0f73a9412bf60b1b5057a8ac3fd396206fb2494df938453b0 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a40de8542ea880a166f146355f44fab78247702 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c49691c8a310a1de9e536544aee6b5d7856b9720bb06ce4ceebc320296d6212 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e481ab7dfdfa43d723bc27ce90c8429b4f4fa492 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c0d689b99c04053a314f2584dbcdfb8e4e64c04c78f282169f340249e8eeae6 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f9230fa8a17bfd14b56ed627cb6fe6666b4c7b0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba15d915d88bf9f16f26def316cdaddacbcfea7cfaaaf5648b0c18e3c35e1c1 +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eacaf6cb644f889051a1e2ff33a5471aae11fd76 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:644796f62dd1db55202ddec0e7887600ab87e7c37fe8e207c0aa4853348ce05e +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34319058feeebcb6409463759ae6e3f22c0642f1 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff334bc9dd886cf851013d15d2ba9743dca8b996c2b57c18181cc32723f398f +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3eee4b82724c808c134e0cbb12e16abad993c1c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ae3cc7925ace47ddc0317dfd804843e67246fbf890b6dadeef7c70f42ac51fe +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d7f628872859a0bf82462501fbb234c0b471f0f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8edcffbc514aa4eab6816e84adee9b6cb6b872461a5c07fd378fff24c9a3ec +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2994e5ba4298d238e3f2948911e163b43b292a2 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c763647a2adb08e32e4bba267d63f86c967571a252887dfbfffb80fe7e9d3c +size 199058850 diff --git 
a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..560260d55af5130d9a66c1bdbaf5afec56e3b84f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acda707df466508aedb539bf6ca2e477d165ff06ca7ab059bc7d8046c9fdb51c +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a46c85bfcfb1ce53d37ba0d3bf707233d265042 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de6bd959cd8c654155153da7cb039e364e90a99a4d65ab97bb79cec9ea68d0e +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..340cec19e0287f77174d78e6d8f00e85a7766ea9 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22e34647ffdcce91d7cfa44512d954a2fa891368c309413fb53273c207582fe +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..899634dee95e2ee226479f0de5b61474b54d058d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c8cf3c91d2ff8f03fe174f05f7917c5ea78c9eb71ac4b2cb770776d53cfcc9e +size 199058797 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748b320be198834ee23814781fb3f4a6ccb562a9 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b64f60085e2fd4dae5ab723ee2c4b378c99d58b2ac7c850a158c593f4ad8913 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77fc0466cf33b7cee412677330bd8be4a0ab0247 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67007e0558110dc3d1b8ee5ebd5553ca1da32739eb8d867f9f7fdf1a854f6bda +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d3e411094f008c92870df8c5f46fe7e38e0dd7d --- /dev/null +++ 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d1f9ea9ff8b633eda1a76509df0c699be99835c354bcbb87445f3c45f597b1 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..130832ee71ae7b1363ab4eac2a0a4728c8bd35a7 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013b0578c6ec17d684c7d14f6016e440ae63b00e80aaf614dbfa041337d45112 +size 199058733 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34036a1a62b797eff73b1061a091d6c74216d8c6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe9a35760b97dd453b51c983eb22d2fb64577adfac09ee2f7366d34353b4626 +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56d90e8291c5e884c2ea64f5f299a52d71d6f113 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a332379027d357787b97f166d9148a489d59b2e99e1cd7f312e8f519102b4d0 +size 199058669 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27161d01bab4ecdf21e7e79435ff68752203be83 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a380c423317ea16227ee1c4773fad93dcec34fff0f86d5e874d7016e58b6b4 +size 199058925 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5704bc6f68f7ce1d4d178952a6123be6098fcae3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba9edffcf9e9af94649631103421d93dad2f786b5f61f819fd443953a6aa4d69 +size 199058925 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7b92777e5f820bb21a88e0bf2fa7bb32b6c51bb --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed6fe9b0b52c99037443c84cc18b0b8df0e980e372936a6bb8011049c5a6c05b +size 199058605 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13558291d4734cd7ab48a2a1ce549006340d066d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849736522984b6f37b075fdac4511c40a2c02d8482a4054b591253e24a874f0a +size 199058605 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04f466e5f0bf184eb0bbcd62f3f2d2da57e604f3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3838e04489786da86cdd2acfc14a94f2fff222e92603b0dd9e81e8f9f6c8d23c +size 199058605 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f95a8edfd1a7b25ec5f59cd4c93e6e6f0af7b7b5 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4668af863aabf4b75292d2492d786c69ed75ede0abba4b2b6eace332b3cf5542 +size 199058605 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e1c830263162f6cfbc980688e3d19c0c14ef990 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a336fa437b7e2fe290bbc09a4ddbc25bb4561b21981834d7c492a7cbfb3c09e +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..900cbfd1a7d80c4ddd014cfeeb39773910890c51 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad2211b5447ffa20918b6ea8556d0d2937b344ddbee4d3ee75dbba871417a169 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a31f7c2b67d76cf31e5d80c53f506bc45764aad5 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05180ef0e845d2162aa7b7cddacac6d52c0967b5e494576bb1ee95329959fa21 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9afaf5e4313faac8935779e3b2fe69aa07ff6f4a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b2f9812b6e92c56df1588e73dc526cd6cb1b87eb689fb0554107ec03e4f7ed95 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4de95d7d3124806932ee9edbe3f4335b7659edc --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17897cc2f33adab27f8a6f65d19cae53c015c3e8bdd3c5c71febbdddb604ab2f +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26a428dbba8ea543ee7c578c752c103289d94d7d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bae9e50f002263343a61f2977ffb242f6c6211a42c1ce35fdfc9f53daefe2c7 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cf5affb96e6a174cf1e9ab59e654a55a7e551b6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b3f7bbcbc2b3c769a69725094943ca13ef21d2988530aaca71300c239c9a866 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9773c05312c2a116d730e08f952d9c7a41d6fddf --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705d85aa3c9d360e7724585d0ab3f0a60f6cae0f7605327848379a343ff87c93 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0c392b544c824f25af40c1aed65a7cbce09362e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1fb4b34e1f615bd80785ba83c613b7f5f02734fccd2ecced8e557da910cb2f2 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..131f4a2d4bcc0c6d2307cc12992f448cabf51aaf --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816bd4b451ab0dcd255b75e2d17125d4fa67ad69737523827ac86fd62da4b9df +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2fc48445bbd50e61e93881753c2a2e9d9514241 --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30dd646e91b93dc5aec1caabb269956b310dac1331c641fbd1e93be7e6ad69c0 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b3cda4e155249672e97c98ac0009d9057c33cd8 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b820afafa2653d58be6cd3504cdd2284373c2b541282c63d2cc1d495c3239895 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7357002e4e5d84d35f1f76161cb76239e85f4667 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32fa1cb4aa78b3bb344e178a0cc02b926f6edf43b6b10c66e9374e2f06474baf +size 199058978 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2feb464d033434481a07ff4f36cba455e5db94e4 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91dc100ba011a5713b37504168f1da832480a116334415eafd1f193961bf6515 +size 199058978 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b0b638926079eb27bebb1108d83adb8f5454830 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8dc8bc3d4846823e362de6c71dba593a63515e7827fde0c8fe17bb7573685f6 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc7f1cf2942bb379f89cd138c2dfa1f4290e7637 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35020a9e40adfaa17a56817d5baa3efa00e5e52379c68b0ef8d9e10b0a79081f +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d95e3361fca7c85cbbdad3f06f7eb5fc8bf1c056 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c383381745a836b1da63007da9672991cf6292d4378ed3ca82f045d8a8e122 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0ca22b00a14bfb6712f46b551a4f635df1ff2e0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748db6f39e62b4778507c1ce2ba5bd2d872df405069b3904e25e8456a50b72a4 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcc68a68986520131ee43d8fa8a555ccdd4131ff --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c1745523dabb496e0061cee74edd88a8e940a73d658213487e2327eaad6f1c4 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfb541c571a11d7e7557c0ecf8291643206e81f7 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c987a6878c82be33408913eb7a68d96f2206478445b9e39c21568fec949eb3ad +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1870854a55dfa66a89f7904785240278ad97454c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc48766efc5c959f401f3995ffca29acbd0d7159d75dc582d46523c2558a9712 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..744c1872de1d946aaf334ae842d0c86a42a5587b --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8a7194010310e268453c63e2338ef7f0027e9cc75de6a895d03cae86203a1d +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c89dab0452397b9bb17a0264ba753fa3410ecbcd --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe247e8cee8958c41d73aecc36bc6c687577c32ba7423437ffb422441b3f904b +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a03fba4719e6cddff34364d87e8994761efefa10 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:da30919e7730b1fc6eb6eb50fd81062a0df4ab9f95c22f68f8262034228d1b45 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ffdc8d6d3ae6e318e8226b29327571680dec765 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d178d7c4a2977f57a8a66193d85b8ba0bb1bf4773f6255c905747a5ead9cb21 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d80e5f22aa2132472e00a1f8add507f83401fd82 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c0dbe7fbe9844caf97ba46e61ad873cc932b074aa926c5c4ecdd6b61dfa91da +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09148a83bb4aa3845e5fd4422890ecdf52ad6632 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb9d045009f280abedbd16c380c3380572106e85ea21f114be12afa33e26f01 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34746bf6a4d36887a2b4c0b4e308f92d4743fcd2 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8e2502f2c33509bc88b1c413f3db2c1fa3c30a9e64e888c6b6627dd00cb7ad +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d4528ca8a77989c5b73c1d20fbe04f65df77b6f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51e3391e600886f3f2361e931383b14860a60672b365e04cb592fd5d4b2ac152 +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d17935b39d17f96e992ef28dfa0676e86e7c5be6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:545978a433c155382a79571f3bd6bd36e4bbaf4a28b50d13b98b767954907fba +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7d666ec2c12538c2c31abd0bc0dfc595fe87b45 --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce329244e70b28f936e6adbf23415a74af1da7a25e180abe0217bcdc0aa72210 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4e89ab75f542614d03bfd0f5bd89c9d4da7352c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ccf2a5cd4467d55ab34288d0cadc36c74e0ebecbd82da8bcf95d69743f12dad +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc0ab7d98ed7d62c7c080c7ab305cbe9eebf5211 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b278a49f5a8fc15ed3f40c63f5f1b7033810ee450054cc2f03dd1d3891e4ca04 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..643ab6a75bca79b253081cbb373b8e14b974f3c7 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a101c67fbeeb242d74748b5f2f5979b9e8f7f054962f9d454360e517d89c6cd2 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4eff0935787f470c3db2b392b09b960a6de55aa4 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bdc8a104fc3040da8bd280b75124a1670aca311d3a9aa621810a7ac1a6c0694 +size 199058594 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..107f3719949793a17abab03faba480a7be2b60c0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf424a7d1d5dff5e629183f06886e13d9bba5eea3ee1049862b595f226bcd3a +size 199058594 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31b616b845e715b4c3cb8eaa95413b5b06c4b951 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1692a191c83e16ae4d25bbbfb0974fd3601cc2659829f286c5b4d287713232ac +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6fcda55d143dc03f971d34e7fe7968db15d0a12 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433178d14780aea0bdfda33bd05f7fcd7488ba6b57da0b2c007498e13d14d43f +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d0e238e3b8bac4f072a63700b735fe041276795 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3a08911e230efdab2cd75ab3947c32a3510b38d4a6f54ad4803887e998498a1 +size 199058711 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2b8c3c95dc880e7616b0cde9d3655c70203c16a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7460b566644fe4f189ba52c02dd4158bc314f8243fb9c40469cddcfa81bbc9 +size 199058711 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d20facbbf85af98fcf76d986218e51d545709da --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c977c1dcb8dfcfdd626b6ccbe21cffbde6a654452c4476df1621ce72bf526d88 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c34f820ac5551e401482aa13c8fb786d097e0b45 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34894c3e1669baec38dcbdc5a2627e38ac43287371c0e91fef504fce552aeb36 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01e0e2acc21b04166ad2efc45a032d86043797ae --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abab0b8f317b56b1e3f947233526901639ac9a681c83592bd9423cec432b9385 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3d2b35e4bec68151f7dc6df04aa8d1ff9ebb97f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ef745c2b6c76519f8c6950e14142521f5b35313d4243d3e52ca84e3a0944b66f +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78855cf86ea1bccb761e69546056b1b2ab24b9f0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7db5e58ecd622004b206b524c5b7fdb88f401d65f8a804e69ef96a8abef713 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..648ee9be4b4143f044c18cb2ee1c77aa6fce9ae1 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af471c3ecaa2a0c18f7c85ab2cee16ad68f7170da44c9c945f71381d76b6c6d7 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b12c9be6796e6092cb38eca94eeb08e3842fb6e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c9c8d92d1f0fa5367742034e22b53ba90ce14279663eb25068d9ec09cff277 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de67efe1b84712b36efafa754a08caf37d8ad17a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a4b8a56a5bc9d61256bda2e679ef537c1fa843298731a79161744ed5e45af8 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4beb511bf040d4c415706a91221f3cf9b9989163 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729884f3fbdd322d7a9138be4ba8b401b0a0d13337726b6d474438c2464cb690 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0de5178c275298d03fc082d70da0aa0f4a4809a5 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:094207e886b1cb53f34e68ca26224d41242b463a80f966f0f46b5d19e5b3e5ab +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c4d9215184eac80ddf4830ae4bfc13c9ec6ee83 --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d8424ca7e2faa333ff4d0f2bcf952927cb50563426921d73041b8baf5cc598 +size 199058594 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..343277a5a218ae863052fcabb2a305e547fbef2d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a5df6617ee123b4ea84abb1995a57f59c77d0f83a7c32a212b70c07cf9fd5f +size 199058594 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c70a48176320e29326542181091e3f671efbbdb --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff92138005740b57216a962152d56b0f9ca07ea5dabdc75591e674cf8348fbb +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dc40b9e012169f0947d9ff3497c85238c848123 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9fc05194daf580f4f0eb0249f83f8ef0bee4d25a32b0d6c86e0f0e0d40e0a2 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f278dc2c3916de61e8bb4ce5f59a4d28103eb277 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4102c4f3a1cd41d6e968d658720cbf3bbf6f54a08da8cee691daee7107942c9 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e46202f0ea9e24a4596ea33d078d2f4b82f7d40 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99701b1eae48538641d1c2cefd91b1e6f298d8161276b43df42275070c5cac30 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..945719902769a1443bd81bac0f655d69c92b2a2b --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa553e29620b65cb82e0cbc90459f6d5f0385a73b6c4312c5367e7ba1918c7b1 +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b89346d8495dc4c1ae4b44724c8de7690da65afe --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:051c0f6c0325fcaf326cd8d69b805a30be4424babbbe052606a0c29ee5bda44e +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..931d2aa3af1d185fed6d09790af6b753c2f3c6c3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62254b4825db7c605a48fac0ccb4cc7f60a9bed5968afd3f6d529d08099d850f +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a3be30113ee928d64fa137103cd9436c9b42c6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8006a6c6691c5e08f0468e74a53545a41a67bc8aeff75a03eb798fce5b47f3 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c0fe5076fdf19273ef49454f297655480ab602e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b12903f8e9ff0ccb04e8bcb02a48f0949435b3bf81e36e1ae95d8b1f2c5d41 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3511a2d933112c9d54e5188e6b09adcb95f43ab --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c49d57b0a3cc3634cdae3b8f1a900f6fd8960ab08058837ffd92cb326640df +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b06e4864819530315a5e195c7720ae3df227713 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116d9259b4a817cf07ff922ddd7c1141777693ca79c99aaf54f384b3c35d7360 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be52e24eab59798f4631b1e5fb76669e8422701e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2b2ed55a99fbfb2502ee72c298cfb2dc93ae1b366205a585db1948a25767535b +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a3aed885d045126cd978c447dffe8d3d0e3bec8 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38f4b53e263ed7119ab759b311c3f12b0b4b1e2558836241e9cb6919818e3e95 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89733aa4d05ba72c166c5b7f5e04f778ea7c538f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1510cb4dbbe41c67f97a6f071199113626653a47c283ca362054ec6aff97afb1 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba05c114d5816dc6558526931b7eedcd0f52f255 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f303c05fcd9b9541c97ce6c54dd3a6f72374923eec220ef14f79f56dc70f606 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0af0722be2998ce9bc9f6aa335496bd309ea10a4 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b257d9a2756a7106516e68bd2d91f648f85445432c388fe68bad6288dcab8f75 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7449c1da359c85987f12ba8de928a51b5379bdea --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26659a3e0cd5ff843013d6dae8f446a1b756d62168fdd7991b3e5558e3f55c52 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75864c2410ec2af65529a4925eb18cf535c3bdc6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a893d411a95458fe5b7554882f529b3ec96eb6bfcf8e0736bbfd16f1e348e3fb +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2faaa74655ebd015e6a9a615ab9a349aa4dfa75c --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4632b72acd30b85b0ec8567ae1a18d77b804d56a49081d1915fe9a7c246665c +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79260a921cc756215bce0ad411c63888e318650d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa196d90546c8018948c670fe19cd4a8388655e3b798a5f824875afc0e864176 +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..155e54b2ebac2066f26fd7bd23b0f66cbbe31a3f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a52c8ff8cb71e29d5d774d00dcbca68b9482391ed9f19caba01fb5685f288a8 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3d13766bf63a19af6567c377f647e98f1758c09 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1866223ceead3d2d10a40c1d7780627ba7357ac752f86a39991ccb5b946f6dbd +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f335f7975a00328c2210054bd82b4886ea88547 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91436e509ed0656960bf62ac45acead377c38b92f6a4c75d105c0b7d5d15ab3a +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef93f03d00edb862d50613f69073e1b241d074a3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5444b971e8d4ee1875461bcc21e55f85f7d203c6e5a13144a96c27d7e8c42313 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f7c2f5c65b32ddb9cd54ab4f5f0cffd1f6f8204 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30811ab4173bd3733628eb47d9d857e0c9113c74b704cea5dbd279a959bd8379 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddb043cc9c272f897bc0a366bfeb5055432df3ce --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10c97fdb8eb56e00094ab16cbcc71f89d2349e76d275e880ef4836f47e88ba43 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fb1a4f596f5be81791c0bf27cfbbfe0e488af48 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6917e9338ad5204a31a26742e591303eea763f48062ce83439c0d87444758950 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c04138a4e0051083bb5efd419259503712faf1d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44321e2c5bc91091dd9e6d64a83bfab2834bcae130a9dea53b77a9520ffafc2 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b91875f6d39788f1dd5820dab9c32acadd1c08d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47fe8ff813fe2223554c21ca8f07e11997a4fa569ab38a7b6fd6bb71a644d18a +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f308e580fe5ea6840dc37a60621ffa6e21b10c5 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe35920c0a5679df34d18cf506cf23f726c0f5f3449a5a73e0e8b3de975e7e53 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4026442ea1e3e0caf1846ae075129695f2d552ff --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac2ac9002f3716507fcd4a4c04512a247597fa9d02d31c0ced3cacd1480c86b6 +size 199058775 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e24b05ee902abfa72a4ca776329df4e5287a6be --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:426e37e76e2b2fbcb06ab6532099d5e4a60da54e1632b1e9bd97f78ef4078588 +size 199058775 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..756f837101bf376a6e3ac330aa13aec364bc935e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:497618a5e58de1ef0eee07f74e41602df14e4788464d7702ecf286214b3092cb +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9bc1b83dfc51f8857ffaf50d8ac55b99dc3af35 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc498c86149edef0b41cce12943eec911f5dacc31c7e958199ddc1b4d83aa4be +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cfe090d53504bcd7ea4e7151ee7e763c5dc930e --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d821608b76259e7be804338032c1dc579376ba3c2889d0838f1fde5dddaa3daf +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76c9a5d6a649b36f2aa716eae2ad71493248518f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4f71246b7ea76d4b2ecbaf09957e156fb0bccf79c238a95cf11f6176d1a37a +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbf503eee35232c5addabea2d621771ec16b3b80 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4994f37d7bb2654ac0cebfa9294f670b7d67714bffbd58bce6981df5d272ede6 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d84cd5161a7e125391467335b77030dc36a1aca3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e99dc67407396d025daba9a4c18959cc6865febebbf7c655885437e2ca151b35 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b74bfd9abd4e2b6fab3471cbbc3e97c71b0818c --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a11c8e30d4251118c35aef63b5be74f32b1bf6e465201b9bde00dc605d705b +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4847b057e4bdb58122f5b297fe9ec4b3c2816f14 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d1662a7e16b980c1a716bf5b4c7e4e34e35757c442e141f9ed60672191238d3 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f934ce8e21d99e8107dcb04f7f3de5621717edb --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c8bfe8fb0c1cda2f8fb491f38d7b4f8ed09f68ab85a22a55146badf48c042bc +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68a89ddd1034199cdbac5bd4ef1f60be1bc7afc3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd344f56fa510a4dcd232103f43252fcfbb3838b0d1d9923c9c3d08006ada2eb +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41d6ce506702a832de875f0b26cdc62387ae085c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1eae66f0f9c85caafd21c19e22c943a71e2d36596d974b0b053b6b377ab8fa +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4023f7db8d7007148a42f1b0fef5e325a52ac64f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ee2aa9f088cb91d2a84e5e0f84959229e39fe9279a9d58a2b814a3ec55aa54a +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97321ffe3e7dcaeb9515495e58ff004f1202df78 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d831df7861904ef993989beec5de273f9df44af364060700563b4d19f6374ec +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f4e16b9fea71a7964a0de611155d335e78c2963 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd60ec2faf3d0b746bb1cd1b637893f9cfa4d1c333f5ce739468bc3e257436b3 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9c0d31eccdafc8c884f893ade7b1abe68bb9355 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42ce0660d42e3f6fc5ee0c6498fc31432509c9d573926d240b797c0254ef02ed +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4453139a028239e144aaebeda0ac0eb1eca68cea --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9bdd92ed7638e2b730d7c145a19b71fc4572bfc0d6c84abf4731d8776187f60 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23c16e12f5d373d2a0ea41dd65609a089e07764a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92a437cb7c23b5a17c6261b05f2e2ad1a47d24d12553a03af76b7c32a1b5ba6b +size 199058914 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68528691ff8ada88bf0d76905712576b6d22662c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e30bf5e827101248d7e11cab6f624e3f36a02ecc3a9cebf304312ef7e86c819c +size 199058914 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5533118a62f086900347f7982aaad15fc210e391 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc8eed3b5a92ad77be68d26ec8d771ed4430acc9456ab4ce7e540dbd692b003d +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c99547e9c9594209b2e3a1dfc0f8fe1b1d1948c8 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:88d5f044be29346b3e66eab18d7ae38a58bfe1e0af4bb06f701cf2158b176905 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a4da97ca3ab68b8b2a511f30d5bafb0463a6585 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7862368bb950c2689af295851adc9bcec25dd8eef55f9e3aa079e0fceb03ef8 +size 199058711 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7dfb82f4b4c247bc3bbbb0dafd524ed8341198a6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53d09f337dcbfa4558ab2f21d3b63ef28d53a73ab1f476ee59c58cb70efc8185 +size 199058711 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3185a26ebf09d61cb4bdb68ced4b8d69f4d3243 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c849c0f1c7e2a2bae0876d795455cdae6d66ab24aeb690fc1cffa98957747ae +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6837ed6ed39ce40d282b13a0ce212951dbb84002 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212abe3098f8a7af3e410aca789d1cf8a028a4d89d8997eb92c7cad71f2641ef +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34e5e6326f3578a83608b63a3cda7ee4ef1056f5 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37985e15849786e041f16a6a3677f4db56382c65188f250a221b3178356b53f6 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ffdfbfd50d17635fc6e644086ab03d18f2d6e3c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a06d87e5c5ea8c586a05d7fd496bb9a985b57df30d8b67dce275598fd14ccc0 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21cd7445a4a1220d3df72a219b93c2d1e8143c42 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c31fccb1c4419934987a2cc2c4602233c88cf851e6506956dfda32331ae5f1e +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85fb5303cd7ec38773f4fbd2f353611eba5b5e7a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c6bea94210cfb92f86ea82cdfb4c37b9ee002a9b65acd3942d4d268538be5b6 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f2f53cdd55f800efa0ad97ff656df92a798e09c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816c52b258070297fcdad663d9ecdbec00ec4c98ba86de927f8a55283bb21242 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74002fb334b05d113b1c2faeb3eaed9312aa98db --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c52e2b0ac7fc90bd2ea89517eef274187264678edd0f02b4970c42e50aa7990 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c6278355d32baee554b7685c236f3bb5acd4398 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06640942e6c397b225e43048084b09f42f31c5ffaed0e4f67ddec037b3b76068 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1afae00c9c9c52a58fd8c6ee74881849b4b77e4 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6e16097bd4ac03ba2c36c4aae9d0978f6628fe2103146957972d284931f6b54 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59c5d9ea185cf8d860f68ab854fbb008a3dc7e42 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b965cb6830b45ba126d2261c6b7df19d12ba22a5e74c8c3c5359f40dfc03f3 +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dcf1a461e4a66dfc4dfe72363bd45e24310c695 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2987638d6f572ecdc3d2a961ee709b0de9351ff8ea3951a7f051ebefc95e51 +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46bf416f72155b80e402c3c118a5ae25b79face3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b7af2817da44dfa6279178767f64ac7764b6982503d5268bf6990b998c8edb +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6a9da5a6999f65f36ebcca2325fe9e7dcca8855 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a66875031dbd99635dc14e0128c44f44603f344e532eb3cfa9d1984901675b02 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0152ace606e7c8860708c91884c413951cc89b8 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6a56808de70ab3704cdf547a24101047a6f8c6aad823ca5c1d151ac27cbf57 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dc00b6fdfb42a6a9e9935d4e93a52c408e34818 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b76a593db1edb78f2335c6545216b975e2fa361980c8d7dbf34c23c1f0fbc4 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0b7b9dfdca928041444d20b1127acc0a3e0c87a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:919538ca4108631f518af9c4b702b0fe70ac37f6fc69a142a8a261d6af436720 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b3818d088204365d8a9f46cca6902adf8cc7d28 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2cf0460d66c329aec05c6169d6dd79a87390645f703c6556d0a51dd48b8913fa +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81354bf3b232ad4924b9a68bead62e17f680d85b --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb31c8ffc640d6427ca6dcefdff43e5458766c2af61a110f53003fe3abd43b59 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd9d50e4e0e3431d66736bb3846a89fc717a4388 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29089f0121395e76f1f93538437766851eb944b455569364a65b64067647f62d +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e55cab62ea8052b6b9af9ced8e00feab73a62dd7 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442a4a29344d478d8f047c7b8ad3570a53f5cb3855274442e0d7b409109fba22 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbf8ca7b1541a83d0455aa643e394cde6d286804 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b90d0d300916e804401c6fab4a9f957bbb753ca19b09cff36e766a2a368e5e5 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4f4952f8fd4ce7176891c7ef2c2dbfc4ffbfc65 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8345efa1b8d763e453afb5072f07ac2e6deba5fc14ca3f4c22b784a4ff5ae383 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b187d25f4c74e2ead05299f25e00b8f33b89233 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64f08b6a87155103ff3f2cee1033495ca93d626c3c8247be6d2b9d9cb1eda988 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8501b26f8c036fa7b314adc6f22a22cf654caf9 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e41274c6d72503e9720fc9a719ad284678c37687737b126f777910a67b5f88e +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..314693dc3049ed90ee3d86dd800b40d29060bfe5 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3862c418f3e80cc869247461293b12437525782fe7b77bc8a37d6db7580bb95 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b65f6f65118791fa527451b16e7546d28a64380a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:896048316773fac83042ad4eaf520f0d012b6eaac06491d63625117a67834454 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b1bfbedf2aab678971a018be4287129d9f0bd67 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:075587d4a2acfb9784bb306be32ffe855d609fef68124375fe868f6a625365e6 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9d9a93dad448ba95733e262cf1df07a6e41dab6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bab76baa312b2ad0939b36b631ae91605c05b416f5fef1ee93b3a9a500765f84 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9db1c0e5a8a2ab38bdb4ab6fb1d0e809a561199d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f904e053613a293008dce5ae1667fe0890f8fa126379a0a6d9cc5abb3150e195 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bded9db9d66be73df4ff4ba17b1c0559daed665 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99bf5323b283320745a588d995c2a4eba013f7b41076a08fa99676f7bb77aff6 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..751a5ab64392131a3417a0009c19b3ae54a7d301 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b99f1e0d6bcc3921530e7296ff8ab8740af41684febb8b7128955cb10da60ee +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7adfec94979906baa6b597c4afc009619004976c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3fa7243ffea44977f1088bff529501b730a8c85a369b0c016ee3528f801b175 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4720b14c1963b8633e56523dd502b771c41c2d4 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f4bc69be4788dfd1e7cd4d5e4051a72c413c0cf3cdeebb38944f2505cf4f348 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bee76fff0997f43944000fe32294dbe76e66600 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:255cdde1887e5af300a5ff463df6ead3bdb7b143f03b1b258a2e7bd51a1408b9 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbfec6ee4c6add10a781ada6c47cb3e5997b9492 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b293362af48072ebaff481474fc02632055224f9013f7b9602a70398fbc2621f +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8810d47fb859aaed5c0975302089bf68a5f4b5c3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce9e1cdb6b1fd0c204c13d58c9cc112fdf35d9ea68c2a199af4816e7b45cd4b +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b2f744363d96c26048e3625855f5d0537fca95f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:134a5dedeefb0e0103779e63e0b083118612f25b27f7b6683c1441bbcc4ef42e +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73eb9ead214300222e2d5b2df02608ad6a310e5a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d35dd186a6d895529c3343e299c4057a09df9e291b8dcd44a94d7de538b2ddd +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76d8a683f31b79c8e6ce26aa3db6eba3be61ae96 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c27a0872bf02cb9e004a0b41d5a0908004f58b199ec030d19b55873541d2ffb +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6603752ace2c09c81172fc4894d04618677da3f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9534f5a825ca5fd23efb2710eed79bb8b0fd301c95b054956c4fb01ad5d7aabb +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffe49d18306a46bbe0aed379ffe03e44449ee773 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8daaa0a061faad56d6c76d9f68f8270e1d4d2cd22da08306015eb5c84b32571d +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee9cedf9161b7269cb988b9085939f34b4af600d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8640ba97e52c6d7479f7898c3457b2bb19d115a3b1a77664bdabe11b1b0d9a68 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77e480d0d4faaa6ec1aa6de5baef03660829858a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e371977d76889cf6b7b90ca45527374d2ac985feec7b64304b4fb17d943f3b8 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdc59b89cb997ba175771b8bcda5e4c4e76da7a4 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a135cc92daeed9ee05b133d1572f3d95f517a7869b0dfbedd2ff5447591bfdb +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31c8bb45670f45ceafa07268e1683f6f8657341d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d67dfb77cf3182a24881b5f79960356f91893f6b20da197547882db37d43540 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e93194b34b58dd9cc18cec8c969b99f9511cf17 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1249ab14502e9e17721aa6a080bb7d479628b006552e38bd3af926109a0248fe +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba94e913758f5df12066a7d7ac457120d483cf5f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138273583571340b5d36dab1c21337af06f70b3ac717b9749e1acb517865b842 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a71a9c311d77492e9a9f9c95cad8e71b521c3cc --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d77c8ef5de83ebab4ee4acfb0fc5823057620b3de57dd5eabceb5a41969f9c3 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b47d5c62765c60c38babcdd2d739ef686ac7776 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa4bcfe5b17973ba374df95ca45467b01db83542c3e3e3072ad3cb8b6e67333 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e501011dd9e3354f971614232340aaf00e9c05a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bc102aeb8b777936891844d1afa2b56b9ca1eb719f0dd99c13d130416ca8683 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d15deec594f1c6353936a66824d0352cbe42cf3a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2119cfb5e84700a03f63b9de1a18c5f8e661b8cc90698e14cf9f9756d09b8850 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d57e5d32faeb7be1fa15ec1fa05e47498a0352c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6390fa8747286b624811d276457f2b948ee7463233ef04dd6354dd3edf57d1d7 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efc243d81749ea557c0191e042c87e6b7f187824 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efc140cc15bae9e7b6379d2f0f2468cd12e47ca2606a4114df9aeb6b76ce01b0 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f7f928ea3610642e911751878f08903893ccc63 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beb7a957ad755778a9487d7c6d8c657bed9ec44f8d9460a2f8e180b38fb750aa +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a3d89f36dc25f3609298a4606255fb0a4087c0d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96b0d89ff881bbcc4012447af0816a9c09d07c441ff90b7bcc7876845e2201f7 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b29a5837491551878ca42b9af7fa76adffcab69 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bbafe57a0a7fc822daa12729fbb5712cd2bac825617953f109ff5e34ec7c5e2 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09a2b3f48cb2c68e38556297d544019a78a56e17 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f710bb5178e3acca239ec7decfe08521244032bd5ca1986826e2d51dde8600b4 +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d769235dbfc24a7f999a05346f24feda8866025 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac80b569cb25b1424ebdff7a0bee9bf3c9197bad0d1d7ee26e277e37077e5971 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41d91b37ea747c2ba88bac31c25cffac85c409ca --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a4aed5a5c2f9e8ee5e3b7ab10dedade160167da04c2ee4729da23f6c30ba59 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa9eec9173772000339a1f4da92dc9c9adea4704 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeece2115e281e34271d2c1b055a45a7cc2041f05e81c0683c90b58ad46149f8 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2b4aa91628a0cf735ad35bbfd30e69a224e1ff8 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4b04ea2a7a5664a50fba2769d40bb89df5a3f6c784eca97789b55c3b00dcfd +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6f467885f6a41bf7969224eda7102004a0763cb --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42612259d9ffc298597c95e167aefc955e2c770f2f0879ad4e8efa9722f0f12e +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc5df08a8fd934db514d2773f038316681f4bb56 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f42413dd5d9987454be22809875d5335022487c3ecc3511f30d9a1fe1c9476d7 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed72245791bb4ba60efa8be6587982890d5f6189 --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa6d951533747d8349184365725feedf7ae4c1ee0a534328d891ace5e80bd7a0 +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df9dbe4a991276b23b5aeb6e5eca0f5cf803c234 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5f881db7314a22b7ffa986027b9093020144d5c0719cb53d7ba745f1edd9aa +size 199058647 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18973942e4e7b23c9a1f6a6c1aa7089c88095fba --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d8ee2c8f577007ec854caf66adbb09a1794dd3c83960bcfb7e7864880ae852 +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed7b91a6a4b7b996a4f5a39fcc72e0a22f32506a --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:595ab143472b8850a9bcfd633261b4129124a20fcb432ff695e4518c62ec94eb +size 199058850 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b4e156cd60bdb2d9309d82516676bc06c296360 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c62befbb5bb15734f3a1cb31e45bf83accdbbd8d4c81a699c54a04d1bdc6d22a +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9aa074381c3e9dcaf55e859b44dea90a2be61aa6 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c84e506fc71419ebf0ec0779a06ac0e8a86b55d3cc4b9cc0213d7dddcbcf562 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adbf191584ce0ef9a23a12c667ad85debb0370a1 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9859419edba7d23cce1591421d4df0d8cdde66bc17617b935999f928bcb0cce +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 
b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f97b6194dace2fce4eb138e6abaf6ddd0ae58e13 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862ef8718e38514edc22f4d59e627e2f50281919734d66cc0c9aa2a24eefd877 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..faca9b9fb8c653b9b1e9a5b86f93aaedd9559ef3 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b5640c0c264c5d7c2bb9a57c42737c7dcd286db7d18c38756e319b4f62f6be +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30bbe6bb7c83f9ed133b3758821d112c36d07a92 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c6cdcc5a7d4399e0eefa1ee406a7fc1afe297e37e7c450cc8f958bf83cd5ec +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31d0826826d82cc833818ee3b62d153d299d86e7 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d37cbe6360ce007845bcb962e700f37a891112cc04ac58dd224a9a013f9c4e +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23da879f1d71c7385cc559427244ccc38d216890 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5566bc4146555c8422cce67b93f84427b4cb02389afad14888b3b2270db0f9 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..903c576fbe95b77342c0439afc4a33d9681dcfd0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379d523d761491f3a313be7cb73c4fb2e3febcfaea0d1c977dcfa0c7b66a2c00 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a6ed8774cd3b606a4bf73ac5b9e2db37ab42da0 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fe8d64ba921ef004da2378b2d9618021ef688c80968c3aab30a66e37a7c16d19 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c36ec6bcf006fb753ce63de8358e2baf7397f0d --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:952b74e3b5e853a741851fcee7fc3f0b2f982a7e653021979e78feee8f381361 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5bdab35050058de52d7bff2011ca9357417b70c --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89e5dc66a2da9c5abd734793f1abe229ad33c263d690ae83960cc11a46cb655 +size 199058722 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa64a19b46b630190e84e3445762885d632ef0e4 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2675c6f5edf36d3ed8058289c2f72e76d53512defb73ce0f84d3ce9b524f140e +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a81dbf581da46b7d76d4d1ba664b41714dfa0a67 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee8fdef70497edd97aedf176b8922f6e541859687ddc022f8a1ef9f37507947a +size 199058786 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5102f53c2dc39c4f8fb5de5478df11d785d5ebdd --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d517cecfe34d0bd25ee66346940e50d1b54cd52e2c47f7ebbbf9b26d38f0283 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec07c62587afd6058a639e30aebf22dc266375dd --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f135b26801c1ee542540fc66f7ae613ed57d5f58672c7b516e41bbd70a77b99 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f01d2c68ae80e3d7bc5ae2a675cd148de5f74a7c --- /dev/null 
+++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24fced1575de44a8114ac7eb19390096f1c650d8d385bc391da031cc2ea744c +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f1f8c4f749aae171c53a9006f72732b9353f752 --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1621a7c5b205e75e1fb82a6d1b7d5e90b718df2551a29edd985116a18ea472f6 +size 199058658 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..150e8cb9605ba57052a4a7c8567f9e396ea87eeb --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd02e5d5c32982bb7cdda99728d73a3bce8aaaf15b3c710156891d6a6340476 +size 199058839 diff --git a/4b284b84bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8f10155ab098c7cd697c80513b9fc37c00a110f --- /dev/null +++ b/4b284b84bc4/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4491e71349b64d7e504050a0149a4c632a6702d83ff924145a6a7d10da7ed440 +size 199058839 diff --git a/4b284b84bc4/global_step80108/layer_01-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1c934759628872bc6a11d298730e7572c41acfc --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd30168d714811025e5a7cdd0542f60e0ad024cb4c015b6ec90c63a875a34c5 +size 167511299 diff --git a/4b284b84bc4/global_step80108/layer_01-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12143f8a144fda1d1cae93e476a5f9eda08dd8d2 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeecbf16d1665b50b89ead59648a064d11544092620e94ddd9fb2e9d12ed594f +size 167511299 diff --git a/4b284b84bc4/global_step80108/layer_03-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1fcabf007432de596106404f6e181dc563049c4 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827d2db9f8295ed7d81c31ab935097f42c7da1a67004518c14e82cb78d7fd590 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_03-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b13711cf42e479be1ed2284bdd5da40071a4a2e5 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d67295e261349111f4b24c59813680288efb4f920d133b39b3e1ade6d068731 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_04-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be072809633677e311b55598c93d9637bbdd3800 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f1d8b49f1973449e114a787be776ff19695b0f033aba8a7172b2de0ea89d022 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_04-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab1be866ff3e58ef2975d8d42ed68bf32951a368 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c812fbc22ea9070d2c53d56f82424773aa20c05a97998329b7f2f4b5109fc634 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_05-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b47286404bababb71df7e5146dee66d2c083f70 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d56eaae8dcbc41158bb41809f3ffedbf2fee933f328464045ac0ed0da9d5ab37 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_05-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35f7f04b82835b23d9005ee6fe217d2bd7e1a3c9 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e45d8fbd20026450bf648a695e42c11f8c7ea25fa200a0376907d4a93db468 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_06-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb2dc733c6d3d6d6d8d0628200c5b2b80db4368f --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b064b9a84c9add12aa97ea72dfefb0fcd785f5aa0d93d897b9aeee9fdbc4806f +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_06-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63f80aa93bf8a435042eca081a5f762f7c968ff6 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5c350dd40f1a44892512c3d7dbf001ad31298b8879dc02b1d2c5870dad9097 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_07-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00f1c1f039a42ba0fb8ded6bb5db6ec224245b0c --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f73cadd95cf2d7b2483a102808623c50eca0ceb6de8e289aa4b10371bdadea1f +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_07-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b95b5727af35a489d6ae7107593f6a32b322203 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b6e3dd7a8f2095c86629fa28b6f8db18cc637de8a6d5a1a448aaad759275b7 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_08-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a159756d3e63d81df55365426e14e2d2efb80a71 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb839495997d2d5aba82948346164449ca8ce57610cce75179629638b324ae1 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_08-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89d24b48fb624d65a6dece92d3ffa1cd749ea7ac --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47df49113b8122ecdc2d7717620e5a14d5a8a0f58c30495589ddeb3448d9504 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_09-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81852812e9c38a3e72a018ea19af0bfb3c1e0222 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39f1badfe2d642cb2fec2afc8de49e8ba57ac757726255e2b99aaa4016982b20 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_09-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b18b27cc73db19fa87e1b1fe42d2d25b43ec479 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2182909635ac810c4377dba51ff10fd6b389426d914b77d4492216e019efa4d3 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_10-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7762dd771ec7d45253fab53eb5c4fe22aae840ec --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f49d3b08dbd45f3360c6cc7532c37e2842c3dbffc45bf2616d97cfea898639 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_10-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3a599fb82638c2332856999565a777de86b2e81 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36fc68deceb078d1980a8d08c79923c5cb9d2f908ac615e05f580bdea679fbd1 +size 113308931 diff --git 
a/4b284b84bc4/global_step80108/layer_11-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e861faf353ffd2dd83dd5671c0efb980f053035a --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c7cb2def5ddb3b3a3aafab2bb5da64699d043522c166472c1806387c6269cc +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_11-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25f0e07f5dc118b7469f8772e0d823ca987364c2 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c46af3b4cbec3eca6b521cc7d9b66347f8db739d4ee26092a9532f45ce26494 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_12-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cfc0bbfed632860c62f38b9537a3e4840ecbafb --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44a52d44f3090829ec4a1c8a5799eabe3179c0d9e09eb6bde0cefd475a860ae8 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_12-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eeffe489ec22011828aed5d1bd0b73510fd05658 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f637bc86da11f865da68676b90e9fdbe532be1c36c0222e3fe4edf5ebc58a3e7 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_13-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4495cafb0ae102616dee48ef815a37938868a3dd --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7039dae792895fe67bcd0cdc2d7c6f0ae46fe31b81e14dc9c6e72cc5fc8a0db +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_13-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcb72887e5575611bd30efbf9cf32fc8eb7f9f0b --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e3c257cadc8ff30376ad847d494907e30184d2d9cfc9f2bd3899e2dfe17658 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_14-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fc023e60ce5254274f37c4ecc128ea9cea599a4 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d044ab1751bd58c26b55d82b8252726778d4f3ceb6ccd51f29d22617af2890 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_14-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_14-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..aec154db171532ed8e37ff95df2b5c02629e28f2 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5415d94557bd15a8c5ae230ffbabc06cd9d90933b1b14a38aa2492b9feddbc0f +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_15-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01682f4454c24c40a699ae5cc9c1a8f3f5fd46ef --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54a651f8cb312fa38f3977d512dd91d23989f42f3d6ead9dbd30f5a99a97584 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_15-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..930b772def546e9710dafd331ac81baecaf67ea4 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e250df175887c501d454c6328dfd2b9dbb78e9245a05b3d4edfcda803d5d24cb +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_16-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09440422182fc7b3e8c138fa474514f21c1e7e78 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ef4ee76a32c4971c024ed028616222e3c1ab2186ec3c117a8743edc1b167af +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_16-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e583e70aa9c730b9393a7e6f217e55f8fd8b083a --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfa3a403d04c9f330bc52ef1248f173288c1beca96ad2ac13d97f90b3d2a910f +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_17-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5328fedd03b83394fa0758bc2489d3f3ae6f8e4e --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8692714f5bfc7972c177131d55a2626c2388e27dc2e0dbba5167a830af99e1b7 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_17-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed602968077aca02bfd51d86f8845518f8c22fe9 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8feeeff0dd51f028b47a515fc699e5d7e6d2ab7ec459ce355a6e31fd8d31a724 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_18-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e117835c5f7e2fbd8d6a481426925f1203d10a2 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be9054bf69de10772fe8208e9be21cb5e1dc824752c9c974b5a09f733554ff04 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_18-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..901b98fdb8d5879c6242e0d98255a1b93fc291b3 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5973c739b6bd5d60b5f436df2d98632fbbdf4d2b1e9327cbbad0deebf626c017 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_19-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ecb7a29b8a8733f0ee6b4ab1c749c005fdb8da4 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d039182cdf21f372643ffaf73ff5269a991d69ce4a246c29d6b4b86f466e83ba +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_19-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7700286d1e39de726eacb3463e0c519a8c52cd64 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6587f54ad4d08a9316ad02a48a9c41773e25a13136719aa7619333dcd402def1 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_20-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26922a3c9fcd13816f9b6393fc3d8ec86e84859d --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983901e6efc4b3cc69ece5bbc8c2c561f44b18b491e03ea76305087bde4fc6c0 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_20-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b0a67da11a3d1ed1fb95fb87f6600bff48f4244 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651ded9a5c265a705536930aaa78a5f2e357cbbbd7ab5d67b3edc8ec0830c67a +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_21-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a305076c0ca9170b88f21edc4438a24e89d5b9d7 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ccb48afd1c01d848fd1d976e16a33af831334b4aa7f5211f1f98b550b109a8 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_21-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0854dac25cbccd8cc301ac9c969c16ea2ebb9c73 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:37757f06c565c303a6088bc324b102c7166805aa4c551d9fef8d1c142527fe55 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_22-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12c7dd35d8f19b011c1bcb8414c45d96001857c4 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d93354a8c3007653d1205b7acd2e04fb90f268b48fec7455d60a92fe985ca241 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_22-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e10ed20dfaa7da2bc08c8f3a8f4a5a3f8001008 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac74f77ff4368792989a6217cc0f202f02e35410b3c4cdd1c2ca4f5fbbcff39d +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_23-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..735f9d90452a85ac73eddf89d6af7f50ebbf65fc --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c02acdae56b405931c6a4498b6c47122e46af0bc4f73c0473593ea600ed8039 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_23-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad94bd35f9397d90453a6f0f9a42e59d9d465bfc --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0436f792af6e73a957bd6b3cc9f38843720872a166f545f2e07fb943442c2011 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_24-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3528412553b1aed291a5a84e8d87b45d7793ae9a --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7cebdda21b5eca8561446326df9d6d2a99dd2e80d33122bc1bd3c15205a67fd +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_24-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..826d6830affec713f93e59506ba4939b8ccbea0d --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3927de584f3b7a2566003b243cb9488ac68b588fd68718d5ae4284593ac4d4 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_25-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9053d8305d9d82e37f21844be736b6cf7da6e4f --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78077e106f433171ec832b902b321071bec3fbd5abb2dd5cbce0eb68d9e0f685 +size 113308931 diff --git 
a/4b284b84bc4/global_step80108/layer_25-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..751760d67341e5582be865a2227302c74209897a --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9c27cb33c5e8dc0a7a1a72e9089e1874295fc8ef979475ebe2694c77336311 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_26-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a27b2beac5798fab2997560e36351b4e474cdf5d --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28e1e3aac1f872f20b1f7257754e55d79fb2cd6236e2f5418ad10f1aadac266 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_26-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..760982c39ec1ed24e065aaf1fec9839d32d01e1b --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b86ccb2645f7a93d36930666b5e865cc28a41bab3c5c5487dcf3a410f0e6620d +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_27-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97e6ccba5b0df348e1f77ed3a86c9930cb7b427b --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ee9ecfc181fe3c6b819a7684878d1bdb0ca3cf681799d99cd4ceeb3d56c5fc +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_27-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9575fa96298b652aa4b3fbf56dbdade6c690d031 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a268ce0e4fe830fd1b3f5af2e9c52032f70c3208d893f6264476413e6f860f3 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_28-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..454db667b17984f2937177bf9144e8327447ef4b --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2bce0443147ad57cef1e04ada9296ec9293d2505edb1a48c0b7019f6c592b9 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_28-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..515571dbfda6d6ceb77c811ba92556069ce2b2cb --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec76655127a689dd050357b3db8448c5f129b63d8d9f2a28e134d55aa771250 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_29-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_29-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..9e73593a19ef8c8130ad00a615b7e30cc307042d --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf78d036dc4e2909ee5c8067a9b3a667fc1c4ca5efd45f552997d0a428b4d252 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_29-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9b77acbc8cad2c35567b75ce7834609a378cf3e --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1bf3ca8c76b6619e75d31cf5a9d08b0f00fef017099928fd700fe7e9fe3632 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_30-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..874851c70ef176c666fca47b970e0049a59ff960 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f492730ae3e2ae19f7353366e0d0b708d0c665a54b4c7d3db5b4449605e717c +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_30-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbf6ce2b67835dd87182816ba56aed416b8c2820 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e282b65f00f47949f84b08529ef9e75d7dcb0d31a8e54257a7b931e0e399a25 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_31-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82dcb203cd54aaed26b585bb46ec68e6364c21a5 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3516747a7161ee34a533371bed42026cfe493893bfd826779468abe91c32c3c7 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_31-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44635a2ef8cf0fd578233dbdf161e153fbbbad43 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7906ed25c5ebf20c704f4b340592657ff1693f4eef4fb4a6e70219f8322616f9 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_32-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c78bc5f14001cd66292f9747c64db8a7f8c4806a --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44afa77c0f4a9470feafad1b05c7bc299f426a71197fd53f649ca57f6a4cc8b7 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_32-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5539aa4e360045965345c966bd361165b0e8cda8 --- /dev/null +++ 
b/4b284b84bc4/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cfd3222eac2d5213252acb9aec3ec4abb3fa77096379692ce4366e12d113ea7 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_33-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cbabcdb46e146fce8ec8f00b2356569bf1a35be --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32cd6b2322ff41ef7110b9bb2587bcb66b364a9fb622e9e618bc7f313b31a31c +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_33-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0647fa2c693a5eaa7318f1eafee631d6b92d67a2 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c966ebf86161858b13fb11d0dc6dc4d54c50e1b8bde8c90b50291b87c17d1dc0 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_34-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62c314ba135a110e2a58aca6d24fee8a5879318b --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fefdeac14a844437bb684a9fc8349bcef78075350fc47729cc1b0d3f2455387 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_34-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4285ea48a542ad315ad80b38e1caeef02cd5f5ab --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3775bbd535bc469c00bcd760c018c73b9a963e5f82f0f098cc012d6b4d8d7b5f +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_35-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a64888df8839553989515e16eb6cf72acbe29bb2 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0bfe72aa94e5f867b10334a9e6acff879cdbc71647ab018655e949f0b83b478 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_35-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d1f9837097dd13304a1eb8a1d4e19dc2e24ac9b --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d66cbd590783e9678be9fb08a62b9c3bfaf8b89daf4c249784df331e9a4d91 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_36-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fabfd9272ffc1cd7b6f23473161749a4d0fb1ae3 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a879a792f23047d4e87509bedae248e77cc8a8002bb612f8ab77e91a452d41e4 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_36-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..257bd26349abcb0436b32db69b9571e961510345 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a5dd04e5df1cc18f99aa1c0f5192cb01441cf5bd2717423f78c76ce1094a94 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_37-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9168fc3dca3c563910c30f1028f5085fac226562 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d609459603adcdc02acc23f5d7df6b9b1023710034cbf059dc27d8d7687c0ec5 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_37-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad55f28db37a2da0ffb535c5885befdba2e96449 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e483ce82a25a6af128bdb5eba63d7676d51ea6c2d6026e3e3fd77f61ffb55f34 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_38-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83130657bf289de1a349d590fa3120103b4d8f3d --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7f9429972ed20a36531c2ebbc6de4c888dc2a0e5d5eb30706b4ee4795d4227 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_38-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6296fbd3a2785b7594d8457406cd69315b73f218 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8757dbcf4f0c357c04e0146c518519fd2bac3be88ee5c850741d75cb1f3e0d0 +size 113308931 diff --git a/4b284b84bc4/global_step80108/layer_40-model_00-model_states.pt b/4b284b84bc4/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce74a970630a223a7c00f39a390e998989925234 --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84bd3f94e56b57b96b0f8658f68639b702ed5a80a19b52a42498152708516145 +size 13507 diff --git a/4b284b84bc4/global_step80108/layer_40-model_01-model_states.pt b/4b284b84bc4/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a720b8fda3ba6a61456c385dc155467b8f4727fd --- /dev/null +++ b/4b284b84bc4/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81df1349eb59c338f2d7e1afa1a61b68f3dda052b226fdb7a87079d161a28e5c +size 13507 diff --git 
a/4b284b84bc4/global_step80108/mp_rank_00_model_states.pt b/4b284b84bc4/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1c177e8889e5a1b79575521b6c34b18bf65627c --- /dev/null +++ b/4b284b84bc4/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6016b671b006cb38c600f9ba08cc84bbced429c2d708d877977a47221d3f0fe1 +size 51443 diff --git a/4b284b84bc4/global_step80108/mp_rank_01_model_states.pt b/4b284b84bc4/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8be4d47707cb6438a3b857201ba0cc550f4555f --- /dev/null +++ b/4b284b84bc4/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1318b0fef5a26779ff517774fa89232c2133604baf523d7080ed1db3bceb1f1 +size 51443 diff --git a/4b284b84bc4/transformers/config.json b/4b284b84bc4/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b84bc4/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b84bc4/transformers/pytorch_model.bin b/4b284b84bc4/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ea7379a24dd3b4f93d723e6a84b6eaac5f2e9ae1 --- /dev/null +++ b/4b284b84bc4/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1
+oid sha256:eedc176e0fe3aa2c55b994d17a5bb0edff4e898335deae7d8e3df9aac3c8b40d
+size 8781203669
diff --git a/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674430922.nid006861.49893.0 b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674430922.nid006861.49893.0
new file mode 100644
index 0000000000000000000000000000000000000000..7f0b3675f6be77b9d8f08a9eb03340daa4cd13a3
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674430922.nid006861.49893.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:558809dcfa11d4338d3694ee055d2f83b9f1dcbdb8997d2bf00a9d4773e360e9
+size 40
diff --git a/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674432857.nid006861.63769.0 b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674432857.nid006861.63769.0
new file mode 100644
index 0000000000000000000000000000000000000000..454b30b528b4741110f4e6f43f075226a6e53661
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674432857.nid006861.63769.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5f3c149f3f2d8816ecf1040f90a9a1058f40733f60b02f89d266b0f741a6b5c
+size 40
diff --git a/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674434866.nid006729.107224.0 b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674434866.nid006729.107224.0
new file mode 100644
index 0000000000000000000000000000000000000000..1710971d4a6e4500628702065e729a56e27fd197
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674434866.nid006729.107224.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5567f9210396c2c865092074e3e501831c429eea96ebfd2fe83ffe8c0a2ca698
+size 111226517
diff --git a/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674607714.nid006921.15108.0 b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674607714.nid006921.15108.0
new file mode 100644
index 0000000000000000000000000000000000000000..80d9d24a7a57f0c5269154ac8b398601b36b31e9
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b12bc4/events.out.tfevents.1674607714.nid006921.15108.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe2bfd2b947953d6b55ad53e22df71aabc43db84ad6c57ac2f87df037b05b5f9
+size 36156475
diff --git a/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674429120.nid006541.58580.0 b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674429120.nid006541.58580.0
new file mode 100644
index 0000000000000000000000000000000000000000..b5223380189a595efba77fb6790dfcbbfff91e78
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674429120.nid006541.58580.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3fa4534e68b51f1f2889f2e3062c191f62f7eb5defff3d5c1d7edd371b58b7d
+size 111365167
diff --git a/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674601905.nid005345.55668.0 b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674601905.nid005345.55668.0
new file mode 100644
index 0000000000000000000000000000000000000000..d44aca439eb717a27b681e046cd0dc0c1c46cf73
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674601905.nid005345.55668.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5903e7d7987ef985a6ef425bd4c60472f010e883918723dbd9f4f6ceba6e858
+size 36156475
diff --git a/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658193.nid005339.88561.0 b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658193.nid005339.88561.0
new file mode 100644
index 0000000000000000000000000000000000000000..82b3fb4bd2fbdc5dc2a7389198731259fc53fe05
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658193.nid005339.88561.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f93eee93b466ec4cfa758845d6c760906f81e9890d49a43beb11a6f2c8f9ac66
+size 16424
diff --git a/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658406.nid006878.32820.0 b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658406.nid006878.32820.0
new file mode 100644
index 0000000000000000000000000000000000000000..04b89ec50bd1bc0968e7a897c11828294fd6dda4
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658406.nid006878.32820.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c73fc8dfc39a0416d164d50f594d95afeed4a8cf060617e5a07e21a016b5718d
+size 16424
diff --git a/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658624.nid006915.32122.0 b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658624.nid006915.32122.0
new file mode 100644
index 0000000000000000000000000000000000000000..36aef42dde821d48c3e3468f9b90160ca8610578
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b17bc4/events.out.tfevents.1674658624.nid006915.32122.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a673dbc03ce3eb9b742b3bd58231208655c367cf91b8583f4af1c5b1ba7038d
+size 16424
diff --git a/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674431096.nid006923.109131.0 b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674431096.nid006923.109131.0
new file mode 100644
index 0000000000000000000000000000000000000000..d2c12a9675c5b2268cbb582b8ab749d4f7215ba7
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674431096.nid006923.109131.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e4e5b30fe2630b1d55a3129d2b7b706655e94e869e49ff8b61bef5a3fa319ed
+size 111106135
diff --git a/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674604050.nid006651.96414.0 b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674604050.nid006651.96414.0
new file mode 100644
index 0000000000000000000000000000000000000000..6f5d19a46ced92f17e329a360a80ec03fe955998
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674604050.nid006651.96414.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83fbc3a3a4b0b1188f27a0f980b8d339eaff108b0167f1f23a6a909cc8850290
+size 36156475
diff --git a/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674660087.nid006878.49607.0 b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674660087.nid006878.49607.0
new file mode 100644
index 0000000000000000000000000000000000000000..8961c9cff449e56d5817b7361930a8bf3f5e3ee0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674660087.nid006878.49607.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a99a1fec80a89c0b69d8f9cb46832b7257e2aba4d2c1a0d58613eaf0535691d
+size 16424
diff --git a/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674660297.nid007146.130858.0 b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674660297.nid007146.130858.0
new file mode 100644
index 0000000000000000000000000000000000000000..15dc72cb3d7416b79bc98d447c4665c0c6285606
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b21bc4/events.out.tfevents.1674660297.nid007146.130858.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:212de781c7a40a3ccb16edd91746a697b1da73a0c0629213c347b3740c787b4f
+size 16424
diff --git a/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674421365.nid006817.40785.0 b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674421365.nid006817.40785.0
new file mode 100644
index 0000000000000000000000000000000000000000..e1104e7776f38fcc322ba63c3624875079f83032
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674421365.nid006817.40785.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d144e8a31d00d77cacb2251f4ed73ed22fff0d7582e1eddab6c7229bc7106d9
+size 40
diff --git a/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674421874.nid006817.50951.0 b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674421874.nid006817.50951.0
new file mode 100644
index 0000000000000000000000000000000000000000..3f5321d91917e81f8218b0cdd46ec619e8883ca9
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674421874.nid006817.50951.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:915cce58f9ea416c82a73ac3a0f773f7ca00ccc432c64d6be3058f8db2be52b5
+size 111489175
diff --git a/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674594761.nid006641.34997.0 b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674594761.nid006641.34997.0
new file mode 100644
index 0000000000000000000000000000000000000000..7a12061b9971c2acf6fb22988006ac08eacd4ae4
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674594761.nid006641.34997.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52b45708063cea3d63a04260dcb03403dede1b32f1f87e3d66822226b6a788c7
+size 110212
diff --git a/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674595614.nid006817.55910.0 b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674595614.nid006817.55910.0
new file mode 100644
index 0000000000000000000000000000000000000000..1d9f36b73447a4405b330e4707c52ed6359acbe4
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674595614.nid006817.55910.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ca11a4dddad38c16c19a663d360b96de56cd0cba467b30b2ee3a8856608fbad
+size 36156475
diff --git a/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674651818.nid005935.18974.0 b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674651818.nid005935.18974.0
new file mode 100644
index 0000000000000000000000000000000000000000..5f9b319e2d9fecfce27ae179ca219b25749b5df1
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b28bc4/events.out.tfevents.1674651818.nid005935.18974.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c87a7c0b6d3e8c2d8a4feb3526d5388a2b5a19a786e28317c69a2d2487bd8d0
+size 16424
diff --git a/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674432495.nid007191.46442.0 b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674432495.nid007191.46442.0
new file mode 100644
index 0000000000000000000000000000000000000000..cf58bd0a867f50a9fa80a3ca3352c29c1aee608e
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674432495.nid007191.46442.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:810608f694d7cb6c477521b126b87770f3a44e72de017c1f47c5d47b00d6abf8
+size 110891290
diff --git a/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674605364.nid005245.45651.0 b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674605364.nid005245.45651.0
new file mode 100644
index 0000000000000000000000000000000000000000..c31d77fefb51ec12a309b2b4c65e90161f4ab6fd
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674605364.nid005245.45651.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:181a10325e2717d5459d0a11a4bf528d0475046f07a20709fd67057eb9978cac
+size 36156475
diff --git a/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674661706.nid006878.63207.0 b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674661706.nid006878.63207.0
new file mode 100644
index 0000000000000000000000000000000000000000..e038956f4f58230e625521f655bd47b6a7cdf6b6
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674661706.nid006878.63207.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:741662bc32ada14f4af61a2724cea10fb6cec3c40502020d6f5cde478f0865ae
+size 16424
diff --git a/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674661934.nid005245.76273.0 b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674661934.nid005245.76273.0
new file mode 100644
index 0000000000000000000000000000000000000000..cbdba616bea8cc21a3369027f1a0c455ea020933
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b42bc4/events.out.tfevents.1674661934.nid005245.76273.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04fa3f3a89d94ab14131cacdd22b5b8bdccfc2703ccda0d5a01759534bbf9150
+size 16424
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674421365.nid006889.7185.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674421365.nid006889.7185.0
new file mode 100644
index 0000000000000000000000000000000000000000..d7a4698c28ec8ae9e72e8ce9cfba76812f7e47f2
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674421365.nid006889.7185.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f1d3859a825f15ea6bae7a5573bd2991cfddd462eb5daef731b69932782dceb
+size 111094890
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674594284.nid006438.112015.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674594284.nid006438.112015.0
new file mode 100644
index 0000000000000000000000000000000000000000..7fffccff3b543c4c6d4b952166efa18c6a9f627c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674594284.nid006438.112015.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6aee68fe07635d453fdb414e0ed62dfc9bec2d64839e99a3232b43932d15da8f
+size 36156457
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674650495.nid006878.103243.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674650495.nid006878.103243.0
new file mode 100644
index 0000000000000000000000000000000000000000..82810246b4c81e5d05c4be1fbe74053f094d2c42
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674650495.nid006878.103243.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cff8def624a1f4ff5fe42508877b6ea644c7e80ee372a7381fb6877b0d33ae92
+size 16424
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674650733.nid006557.33943.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674650733.nid006557.33943.0
new file mode 100644
index 0000000000000000000000000000000000000000..2c6c48209093c46d63b2e9a949963f03010c10a3
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674650733.nid006557.33943.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54672c08fb1c02f85f1405b961b103b82ab7b8f1b1959b3f909febf5866f0090
+size 16424
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657652.nid005381.10637.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657652.nid005381.10637.0
new file mode 100644
index 0000000000000000000000000000000000000000..1b6ab4957e030206261e6da241bd61d8b30e2e76
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657652.nid005381.10637.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf005400a95c7c3dcd4ab935de2eb7d3d9b25ee5bde28089e8011e823ae0e53d
+size 40
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657652.nid006557.77357.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657652.nid006557.77357.0
new file mode 100644
index 0000000000000000000000000000000000000000..e310f62af7aff7aaf47e556139a80baa53fc4437
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657652.nid006557.77357.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:528a4b4802c65983edb0bc6314903885ae2dce3ea1b270b536df2205093158b3
+size 20386
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657996.nid006557.83234.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657996.nid006557.83234.0
new file mode 100644
index 0000000000000000000000000000000000000000..22b7449ae7dc717f84159fc4008b103394218f6e
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674657996.nid006557.83234.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e052901807616086fa5b8f1b4a1b4ee44ad71ea663213105ba3e0ddcd2ff0d8a
+size 16424
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674658212.nid006557.88128.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674658212.nid006557.88128.0
new file mode 100644
index 0000000000000000000000000000000000000000..af937a6a34353af7fe386846a4661853f6de7d1d
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674658212.nid006557.88128.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:316681f55169fbaa1525f6a82ed40ae6296e765fa7ce94212c1051427feba95a
+size 16424
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674658447.nid005339.96946.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674658447.nid005339.96946.0
new file mode 100644
index 0000000000000000000000000000000000000000..946c9d935d8ed46ff68add4d3a66070f7967bc16
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1674658447.nid005339.96946.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4be471e1dce98af83979143cb69ed432dbe99ce3543d645928c84f7f1d12c9aa
+size 16424
diff --git a/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1675008169.nid005381.7334.0 b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1675008169.nid005381.7334.0
new file mode 100644
index 0000000000000000000000000000000000000000..76e74347e49150ff71c6db63c19120f1bcf773e0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84bc4/events.out.tfevents.1675008169.nid005381.7334.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35c8c147c75221869b2188ddeab9a7273b35baa9369313cce5ae0069ae1efff2
+size 40
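Note: every TensorBoard event file added above is checked in as a Git LFS pointer rather than as the event data itself. Each three-line payload records the LFS pointer spec version, the object's sha256 OID, and its size in bytes. The sketch below (not part of this diff; the helper names are illustrative) shows how such a pointer can be parsed and how a locally downloaded object could be verified against it.

```python
import hashlib
from pathlib import Path


def read_lfs_pointer(pointer_path):
    """Parse a Git LFS pointer file: version / oid sha256:<hex> / size <bytes>."""
    fields = dict(
        line.split(" ", 1)
        for line in Path(pointer_path).read_text().splitlines()
        if line.strip()
    )
    oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    return oid, int(fields["size"])


def verify_lfs_object(pointer_path, object_path):
    """Check that a downloaded object matches the pointer's size and sha256 digest."""
    oid, size = read_lfs_pointer(pointer_path)
    data = Path(object_path).read_bytes()
    return len(data) == size and hashlib.sha256(data).hexdigest() == oid
```

After cloning the repository, the actual event files should be fetchable through the usual LFS flow, e.g. `git lfs pull --include="tensorboard/**"` to materialize only the TensorBoard logs.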